# lataon's picture
# runable eval
# 359afe5
# raw
# history blame
# 2.68 kB
import nltk
# Fetch the NLTK resources used by this module: the English
# averaged-perceptron POS tagger and the CMU pronouncing dictionary.
# NOTE(review): these calls run every time the module is imported; presumably
# they are cheap once the data is cached locally -- confirm against NLTK docs.
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('cmudict') # also useful for g2p-en
from g2p_en import G2p
# Single module-level G2p instance; safe_g2p() calls it for every conversion.
g2p = G2p()
def safe_g2p(text: str):
    """Run the module-level ``g2p`` converter on *text*, retrying without digits.

    The first attempt passes *text* through unchanged. If that raises, every
    run of digits is removed and the conversion is attempted once more.

    Args:
        text: Input string to convert.

    Returns:
        Whatever ``g2p`` returns for the (possibly digit-stripped) text.

    Raises:
        Exception: The original error when *text* contains no digits to
            strip, or whatever the retry raises otherwise.
    """
    try:
        return g2p(text)
    except Exception:
        # Fallback: remove digits and retry.
        cleaned = re.sub(r"\d+", "", text)
        if cleaned == text:
            # Nothing was stripped, so a retry would repeat the exact same
            # failing call; surface the original error instead.
            raise
        return g2p(cleaned)
import re
def clean_text(text):
    """Drop every character that is not an ASCII letter, digit, apostrophe or space.

    Removed characters are deleted outright (not replaced by a space), so the
    text on either side of them is joined together.
    """
    disallowed_run = re.compile(r"[^a-zA-Z0-9' ]+")
    return disallowed_run.sub("", text)
def clean_cmu(text):
    """Remove the characters 0, 1, 2 and '-' from *text*, then trim and lower-case it."""
    stripped = text
    # Delete each unwanted marker character in turn.
    for marker in ("0", "1", "2", "-"):
        stripped = stripped.replace(marker, "")
    return stripped.strip().lower()
# Lookup table from CMUdict/ARPAbet phoneme symbols (upper-case keys) to IPA
# strings.
# NOTE: "AH0" and "ER0" are the only keys carrying a stress digit and are
# three characters long; cmu_to_ipa() only probes two- and one-character
# slices, so it never selects these two entries.
CMU_TO_IPA = {
    # Vowels
    "AA": "ɑ", # odd
    "AE": "æ", # at
    "AH": "ʌ", # hut
    "AH0": "ə", # about (unstressed)
    "AO": "ɔ", # ought, story
    "AW": "aʊ", # cow
    "AY": "aɪ", # hide
    "EH": "ɛ", # Ed
    "ER": "ɝ", # stressed "ur", hurt
    "ER0": "ɚ", # unstressed "ər"
    "EY": "eɪ", # ate
    "IH": "ɪ", # it
    "IY": "i", # eat
    "OW": "oʊ", # oat
    "OY": "ɔɪ", # toy
    "UH": "ʊ", # hood
    "UW": "u", # two
    # Consonants
    "B": "b",
    "CH": "tʃ",
    "D": "d",
    "DH": "ð",
    "F": "f",
    "G": "ɡ",
    "HH": "h",
    "JH": "dʒ",
    "K": "k",
    "L": "l",
    "M": "m",
    "N": "n",
    "NG": "ŋ",
    "P": "p",
    "R": "r",
    "S": "s",
    "SH": "ʃ",
    "T": "t",
    "TH": "θ",
    "V": "v",
    "W": "w",
    "Y": "j",
    "Z": "z",
    "ZH": "ʒ",
}
def cmu_to_ipa(cmu_sentence: str) -> str:
    """Convert a whitespace-separated CMUdict/ARPAbet string to IPA.

    Each word is scanned left to right with a greedy, case-insensitive match
    against ``CMU_TO_IPA``: a two-character symbol is tried first, then a
    one-character symbol. A character that matches nothing is kept as-is,
    lower-cased. Converted words are joined with single spaces, with no
    leading or trailing space.

    Symbols inside a word are not delimited, so neighbouring phonemes can be
    read as a different two-character symbol (e.g. "T" followed by "HH"
    starts with "TH"). Keys longer than two characters are never selected.

    Args:
        cmu_sentence: Phoneme symbols, with words separated by whitespace.

    Returns:
        The IPA rendering; an empty string when the input has no words.

    Example:
        "DAWN T MEYK" -> "daʊn t meɪk"
    """
    ipa_words = []
    for word in cmu_sentence.split():
        pieces = []
        i = 0
        while i < len(word):
            # Try a 2-char symbol first (only when two characters remain).
            pair = word[i:i + 2].upper()
            if i + 2 <= len(word) and pair in CMU_TO_IPA:
                pieces.append(CMU_TO_IPA[pair])
                i += 2
                continue
            # Then a 1-char symbol; unknown characters pass through lower-cased.
            char = word[i]
            pieces.append(CMU_TO_IPA.get(char.upper(), char.lower()))
            i += 1
        ipa_words.append("".join(pieces))
    # Joining per word (rather than appending " " after each one) avoids the
    # stray trailing space.
    return " ".join(ipa_words)
def text_to_phoneme(text):
    """Turn raw text into a cleaned, lower-case phoneme string.

    Pipeline: ``clean_text`` -> ``safe_g2p`` -> concatenate the returned
    tokens with no separator -> ``clean_cmu``.
    """
    sanitized = clean_text(text)
    tokens = safe_g2p(sanitized)
    return clean_cmu("".join(tokens))