File size: 2,677 Bytes
359afe5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import nltk

# Fetch two NLTK resources at import time: the English averaged-perceptron
# POS tagger and the CMU pronouncing dictionary.
# NOTE(review): presumably both are required by g2p-en at runtime — confirm.
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('cmudict')   # also useful for g2p-en


from g2p_en import G2p

# Module-level G2p instance, created once and reused by safe_g2p() below.
g2p = G2p()
def safe_g2p(text: str):
    """Run the shared ``g2p`` converter on *text*, retrying once without digits.

    If the first conversion raises, every run of digits is removed from
    *text* and the conversion is attempted a second time. An error raised by
    that second attempt is not handled here and propagates to the caller.
    """
    try:
        phonemes = g2p(text)
    except Exception:
        # Second attempt on a digit-free copy of the input.
        digitless = re.sub(r"\d+", "", text)
        return g2p(digitless)
    return phonemes

import re

def clean_text(text):
    """Return *text* with everything except ASCII letters, digits,
    apostrophes and spaces removed."""
    disallowed = re.compile(r"[^a-zA-Z0-9' ]+")
    return disallowed.sub("", text)

def clean_cmu(text):
    """Normalize a phoneme string: delete the characters ``0``, ``1``, ``2``
    and ``-``, trim surrounding whitespace, and lowercase the result."""
    # One translation pass deletes all four unwanted characters.
    drop_table = str.maketrans("", "", "012-")
    return text.translate(drop_table).strip().lower()

# ARPAbet (CMUdict) symbol -> IPA lookup table used by cmu_to_ipa().
# Two keys ("AH0", "ER0") carry a trailing stress digit so the unstressed
# variants can map to a different IPA symbol than the bare vowel.
CMU_TO_IPA = {
    # Vowels
    "AA": "ɑ",    # odd
    "AE": "æ",    # at
    "AH": "ʌ",    # hut
    "AH0": "ə",   # about (unstressed)
    "AO": "ɔ",    # ought, story
    "AW": "aʊ",   # cow
    "AY": "aɪ",   # hide
    "EH": "ɛ",    # Ed
    "ER": "ɝ",    # stressed "ur", hurt
    "ER0": "ɚ",   # unstressed "ər"
    "EY": "eɪ",   # ate
    "IH": "ɪ",    # it
    "IY": "i",    # eat
    "OW": "oʊ",   # oat
    "OY": "ɔɪ",   # toy
    "UH": "ʊ",    # hood
    "UW": "u",    # two

    # Consonants
    "B": "b",
    "CH": "tʃ",
    "D": "d",
    "DH": "ð",
    "F": "f",
    "G": "ɡ",
    "HH": "h",
    "JH": "dʒ",
    "K": "k",
    "L": "l",
    "M": "m",
    "N": "n",
    "NG": "ŋ",
    "P": "p",
    "R": "r",
    "S": "s",
    "SH": "ʃ",
    "T": "t",
    "TH": "θ",
    "V": "v",
    "W": "w",
    "Y": "j",
    "Z": "z",
    "ZH": "ʒ",
}

# Longest key in the table; the matcher never needs a wider window than this.
_CMU_MAX_KEY_LEN = max(len(key) for key in CMU_TO_IPA)


def cmu_to_ipa(cmu_sentence: str) -> str:
    """
    Greedy match CMUdict/ARPAbet phoneme sequence into IPA.

    Each whitespace-separated word is scanned left to right. At every
    position the longest prefix that is a key of ``CMU_TO_IPA`` is consumed
    (matching is case-insensitive), so stress-marked keys such as "AH0" win
    over the bare "AH". A character that starts no known symbol is kept,
    lowercased.

    Args:
        cmu_sentence: Words made of concatenated ARPAbet symbols, separated
            by whitespace.

    Returns:
        The converted words joined by single spaces, with no leading or
        trailing space. Empty or whitespace-only input yields "".

    Example: "DAWN T MEYK" -> "daʊn t meɪk"
    """
    ipa_words = []

    for word in cmu_sentence.split():
        pieces = []
        pos = 0
        while pos < len(word):
            remaining = len(word) - pos
            # Try the widest window first and shrink until a symbol matches.
            for size in range(min(_CMU_MAX_KEY_LEN, remaining), 0, -1):
                ipa = CMU_TO_IPA.get(word[pos:pos + size].upper())
                if ipa is not None:
                    pieces.append(ipa)
                    pos += size
                    break
            else:
                # fallback: keep as lowercase character
                pieces.append(word[pos].lower())
                pos += 1
        ipa_words.append("".join(pieces))

    # Joining per-word results avoids the stray trailing space that
    # appending " " after every word would leave behind.
    return " ".join(ipa_words)

def text_to_phoneme(text):
    """Convert raw text into a normalized phoneme string.

    Pipeline: ``clean_text`` filters the input, ``safe_g2p`` converts it,
    the resulting tokens are concatenated with no separator, and
    ``clean_cmu`` normalizes the concatenated string.
    """
    sanitized = clean_text(text)
    tokens = safe_g2p(sanitized)
    return clean_cmu("".join(tokens))