|
|
import nltk |
|
|
|
|
|
|
|
|
# Fetch the NLTK data packages by resource id: the English averaged-perceptron
# POS tagger and the CMU Pronouncing Dictionary.
# NOTE(review): presumably these are the resources g2p_en loads internally,
# which is why they are downloaded before G2p() is constructed — confirm
# against the g2p_en documentation.
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('cmudict')

from g2p_en import G2p

# Single module-level converter instance; safe_g2p() calls it for every
# conversion instead of constructing a new G2p each time.
g2p = G2p()
|
|
def safe_g2p(text: str):
    """Run the module-level ``g2p`` converter on *text*, retrying once on failure.

    The first attempt converts *text* unchanged. If that attempt raises any
    ``Exception``, every run of digits is removed from *text* and the
    conversion is tried a second time.

    Args:
        text: The string to convert.

    Returns:
        Whatever ``g2p`` returns for the original (or digit-stripped) text.

    Raises:
        Exception: Anything raised by the second attempt propagates to the
            caller; there is no further fallback.
    """
    try:
        phonemes = g2p(text)
    except Exception:
        # Fallback path: drop all digit runs, then retry exactly once.
        digitless = re.sub(r"\d+", "", text)
        phonemes = g2p(digitless)
    return phonemes
|
|
|
|
|
import re |
|
|
|
|
|
def clean_text(text):
    """Return *text* with every disallowed character removed.

    Only ASCII letters, ASCII digits, the apostrophe and the space character
    are kept. Everything else (punctuation, tabs, newlines, non-ASCII
    characters) is deleted without inserting a replacement, so the
    characters on either side of a removed run become adjacent.
    """
    disallowed = re.compile(r"[^a-zA-Z0-9' ]+")
    return disallowed.sub("", text)
|
|
|
|
|
def clean_cmu(text):
    """Normalise a phoneme string by dropping stress digits and hyphens.

    Every ``0``, ``1``, ``2`` and ``-`` character is deleted, surrounding
    whitespace is stripped, and the result is lower-cased.
    """
    # One translation pass deletes all four characters at once.
    without_marks = text.translate(str.maketrans("", "", "012-"))
    return without_marks.strip().lower()
|
|
|
|
|
# Lookup table from CMUdict/ARPAbet phoneme symbols (upper-case keys) to IPA.
# Most keys are one or two letters; "AH0" and "ER0" are three characters long
# because the trailing stress digit selects a different IPA symbol.
CMU_TO_IPA = {
    # Vowels
    "AA": "ɑ",
    "AE": "æ",
    "AH": "ʌ",
    "AH0": "ə",
    "AO": "ɔ",
    "AW": "aʊ",
    "AY": "aɪ",
    "EH": "ɛ",
    "ER": "ɝ",
    "ER0": "ɚ",
    "EY": "eɪ",
    "IH": "ɪ",
    "IY": "i",
    "OW": "oʊ",
    "OY": "ɔɪ",
    "UH": "ʊ",
    "UW": "u",
    # Consonants
    "B": "b",
    "CH": "tʃ",
    "D": "d",
    "DH": "ð",
    "F": "f",
    "G": "ɡ",
    "HH": "h",
    "JH": "dʒ",
    "K": "k",
    "L": "l",
    "M": "m",
    "N": "n",
    "NG": "ŋ",
    "P": "p",
    "R": "r",
    "S": "s",
    "SH": "ʃ",
    "T": "t",
    "TH": "θ",
    "V": "v",
    "W": "w",
    "Y": "j",
    "Z": "z",
    "ZH": "ʒ",
}


def cmu_to_ipa(cmu_sentence: str) -> str:
    """Greedily convert a CMUdict/ARPAbet phoneme string into IPA.

    The input is split on whitespace. Inside each word the phoneme symbols
    are written back to back, so the word is scanned left to right and, at
    every position, the longest slice that is a key of ``CMU_TO_IPA`` is
    consumed (the comparison is case-insensitive). Trying the longest key
    first is what lets the three-character keys ``"AH0"`` and ``"ER0"``
    match instead of being split into ``"AH"``/``"ER"`` plus a stray ``"0"``.
    A character that starts no known symbol is copied through lower-cased.

    Args:
        cmu_sentence: Whitespace-separated words of concatenated phoneme
            symbols, in any letter case.

    Returns:
        The converted words joined by single spaces, with no leading or
        trailing space. Empty or whitespace-only input yields ``""``.

    Example: "DAWN T MEYK" -> "daʊn t meɪk"
    """
    # Derive the widest window from the table so it stays in sync with it.
    longest_key = max(map(len, CMU_TO_IPA), default=1)

    ipa_words = []
    for word in cmu_sentence.split():
        pieces = []
        pos = 0
        while pos < len(word):
            widest = min(longest_key, len(word) - pos)
            for size in range(widest, 0, -1):
                ipa = CMU_TO_IPA.get(word[pos:pos + size].upper())
                if ipa is not None:
                    pieces.append(ipa)
                    pos += size
                    break
            else:
                # Unknown character: keep it (lower-cased) rather than drop it.
                pieces.append(word[pos].lower())
                pos += 1
        ipa_words.append("".join(pieces))

    # Joining (instead of appending " " after every word) avoids the trailing
    # space that would contradict the documented example.
    return " ".join(ipa_words)
|
|
|
|
|
def text_to_phoneme(text):
    """Convert raw text into a cleaned phoneme string.

    The text is passed through ``clean_text`` and then ``safe_g2p``; the
    items that come back are concatenated with no separator and the result
    is handed to ``clean_cmu`` for final normalisation.

    Args:
        text: The input string to convert.

    Returns:
        The string returned by ``clean_cmu``.
    """
    phoneme_seq = safe_g2p(clean_text(text))
    return clean_cmu("".join(phoneme_seq))