Phoneme_Detection_Leaderboard

Running

App Files Files Community

Phoneme_Detection_Leaderboard / src /utils /cmu_process.py

lataon

runnable eval

359afe5 3 months ago

raw

history blame

2.68 kB

	import nltk

	# Fetch the NLTK resources this module relies on. These calls run at import
	# time, every time the module is imported.
	nltk.download('averaged_perceptron_tagger_eng')  # English POS tagger
	nltk.download('cmudict') # also useful for g2p-en


	from g2p_en import G2p

	# Module-level G2p instance, created once at import and used by safe_g2p().
	g2p = G2p()
	def safe_g2p(text: str):
	    """Run the module-level ``g2p`` on *text*, retrying once without digits.

	    Args:
	        text: Input text to convert.

	    Returns:
	        Whatever ``g2p`` returns for the text (presumably a sequence of
	        phoneme strings; confirm against g2p_en).

	    Raises:
	        Exception: Re-raises the original error when *text* contains no
	            digits to strip, or propagates the error from the retry.
	    """
	    try:
	        return g2p(text)
	    except Exception:
	        # Best-effort fallback: drop every run of digits and try again.
	        cleaned = re.sub(r"\d+", "", text)
	        if cleaned == text:
	            # Nothing was removed, so a retry would repeat the same failing
	            # call; surface the original error and traceback instead.
	            raise
	        return g2p(cleaned)

	import re

	def clean_text(text):
	    """Delete everything except ASCII letters, digits, apostrophes and spaces.

	    Removed characters are dropped outright rather than replaced by a space,
	    so ``"well-known"`` becomes ``"wellknown"``.
	    """
	    unwanted = re.compile(r"[^a-zA-Z0-9' ]+")
	    return unwanted.sub("", text)

	def clean_cmu(text):
	    """Normalize a CMU phoneme string.

	    Removes the characters ``0``, ``1``, ``2`` and ``-``, trims surrounding
	    whitespace, and lowercases the result.
	    """
	    stripped = text
	    for mark in ("0", "1", "2", "-"):
	        stripped = stripped.replace(mark, "")
	    return stripped.strip().lower()

	# ARPAbet (CMUdict) phoneme symbol -> IPA string.
	# Most keys are 1 or 2 characters; "AH0" and "ER0" are 3-character keys that
	# carry a trailing stress digit to select the unstressed vowel.
	CMU_TO_IPA = {
	    # Vowels
	    "AA": "ɑ",  # odd
	    "AE": "æ",  # at
	    "AH": "ʌ",  # hut
	    "AH0": "ə",  # about (unstressed)
	    "AO": "ɔ",  # ought, story
	    "AW": "aʊ",  # cow
	    "AY": "aɪ",  # hide
	    "EH": "ɛ",  # Ed
	    "ER": "ɝ",  # stressed "ur", hurt
	    "ER0": "ɚ",  # unstressed "ər"
	    "EY": "eɪ",  # ate
	    "IH": "ɪ",  # it
	    "IY": "i",  # eat
	    "OW": "oʊ",  # oat
	    "OY": "ɔɪ",  # toy
	    "UH": "ʊ",  # hood
	    "UW": "u",  # two

	    # Consonants
	    "B": "b",
	    "CH": "tʃ",
	    "D": "d",
	    "DH": "ð",
	    "F": "f",
	    "G": "ɡ",
	    "HH": "h",
	    "JH": "dʒ",
	    "K": "k",
	    "L": "l",
	    "M": "m",
	    "N": "n",
	    "NG": "ŋ",
	    "P": "p",
	    "R": "r",
	    "S": "s",
	    "SH": "ʃ",
	    "T": "t",
	    "TH": "θ",
	    "V": "v",
	    "W": "w",
	    "Y": "j",
	    "Z": "z",
	    "ZH": "ʒ",
	}


	def cmu_to_ipa(cmu_sentence: str) -> str:
	    """Convert a whitespace-separated CMUdict/ARPAbet string to IPA.

	    Each word is scanned left to right with a greedy longest-match against
	    ``CMU_TO_IPA``: the longest key length in the table is tried first, then
	    shorter ones down to a single character. Lookup is case-insensitive.
	    A character that starts no known symbol is kept, lowercased.

	    Args:
	        cmu_sentence: Words made of concatenated ARPAbet symbols, separated
	            by whitespace.

	    Returns:
	        The IPA words joined by single spaces, with no leading or trailing
	        space. An empty or whitespace-only input yields ``""``.

	    Example:
	        "DAWN T MEYK" -> "daʊn t meɪk"
	    """
	    # Derived from the table so multi-character keys such as "AH0" and
	    # "ER0" are reachable, not only 2- and 1-character symbols.
	    longest_key = max(map(len, CMU_TO_IPA))

	    ipa_words = []
	    for word in cmu_sentence.split():
	        pieces = []
	        i = 0
	        while i < len(word):
	            # Slice first, then uppercase, so indices stay aligned with
	            # the original word.
	            for width in range(min(longest_key, len(word) - i), 0, -1):
	                ipa = CMU_TO_IPA.get(word[i:i + width].upper())
	                if ipa is not None:
	                    pieces.append(ipa)
	                    i += width
	                    break
	            else:
	                # No symbol starts here: keep the character, lowercased.
	                pieces.append(word[i].lower())
	                i += 1
	        ipa_words.append("".join(pieces))

	    # Join with a separator instead of appending " " after every word,
	    # which used to leave a trailing space on the result.
	    return " ".join(ipa_words)

	def text_to_phoneme(text):
	    """Turn raw text into a normalized phoneme string.

	    The text is filtered with ``clean_text``, converted with ``safe_g2p``,
	    and the resulting pieces are concatenated with no separator before
	    being passed through ``clean_cmu``.
	    """
	    pieces = safe_g2p(clean_text(text))
	    return clean_cmu("".join(pieces))