Spaces:
Running
on
Zero
Running
on
Zero
Y Phung Nguyen
committed on
Commit
·
7009401
1
Parent(s):
c5a9b84
Upd maya configs
Browse files
voice.py
CHANGED
|
@@ -549,7 +549,11 @@ def chunk_text_for_tts(text: str, max_length: int = MAX_CHUNK_LENGTH, min_length
|
|
| 549 |
return chunks
|
| 550 |
|
| 551 |
def build_maya1_prompt(tokenizer, description: str, text: str) -> str:
|
| 552 |
-
"""Build formatted prompt for Maya1.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 553 |
soh_token = tokenizer.decode([SOH_ID])
|
| 554 |
eoh_token = tokenizer.decode([EOH_ID])
|
| 555 |
soa_token = tokenizer.decode([SOA_ID])
|
|
@@ -557,11 +561,17 @@ def build_maya1_prompt(tokenizer, description: str, text: str) -> str:
|
|
| 557 |
eot_token = tokenizer.decode([TEXT_EOT_ID])
|
| 558 |
bos_token = tokenizer.bos_token
|
| 559 |
|
| 560 |
-
|
|
|
|
|
|
|
| 561 |
prompt = (
|
| 562 |
soh_token + bos_token + formatted_text + eot_token +
|
| 563 |
eoh_token + soa_token + sos_token
|
| 564 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 565 |
return prompt
|
| 566 |
|
| 567 |
def unpack_snac_from_7(snac_tokens: list) -> list:
|
|
@@ -618,9 +628,16 @@ def _generate_speech_with_gpu(text: str, description: str = None):
|
|
| 618 |
description = DEFAULT_VOICE_DESCRIPTION
|
| 619 |
|
| 620 |
logger.info("[TTS] Running Maya1 TTS generation...")
|
|
|
|
|
|
|
| 621 |
|
| 622 |
-
# Build prompt
|
| 623 |
prompt = build_maya1_prompt(tokenizer, description, text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 624 |
inputs = tokenizer(prompt, return_tensors="pt")
|
| 625 |
|
| 626 |
if torch.cuda.is_available():
|
|
|
|
| 549 |
return chunks
|
| 550 |
|
| 551 |
def build_maya1_prompt(tokenizer, description: str, text: str) -> str:
|
| 552 |
+
"""Build formatted prompt for Maya1.
|
| 553 |
+
|
| 554 |
+
The description is used only for voice characteristics and should not be spoken.
|
| 555 |
+
Only the text after the description tag should be synthesized.
|
| 556 |
+
"""
|
| 557 |
soh_token = tokenizer.decode([SOH_ID])
|
| 558 |
eoh_token = tokenizer.decode([EOH_ID])
|
| 559 |
soa_token = tokenizer.decode([SOA_ID])
|
|
|
|
| 561 |
eot_token = tokenizer.decode([TEXT_EOT_ID])
|
| 562 |
bos_token = tokenizer.bos_token
|
| 563 |
|
| 564 |
+
# Ensure description is only metadata - add newline after description tag
|
| 565 |
+
# to clearly separate it from the text to be spoken
|
| 566 |
+
formatted_text = f'<description="{description}">\n{text}'
|
| 567 |
prompt = (
|
| 568 |
soh_token + bos_token + formatted_text + eot_token +
|
| 569 |
eoh_token + soa_token + sos_token
|
| 570 |
)
|
| 571 |
+
|
| 572 |
+
# Log the prompt structure for debugging (without the actual description text)
|
| 573 |
+
logger.debug(f"[TTS] Prompt structure: <description=\"...\">\\n[text to speak] (text length: {len(text)} chars)")
|
| 574 |
+
|
| 575 |
return prompt
|
| 576 |
|
| 577 |
def unpack_snac_from_7(snac_tokens: list) -> list:
|
|
|
|
| 628 |
description = DEFAULT_VOICE_DESCRIPTION
|
| 629 |
|
| 630 |
logger.info("[TTS] Running Maya1 TTS generation...")
|
| 631 |
+
logger.debug(f"[TTS] Voice description (metadata only, not spoken): {description[:80]}...")
|
| 632 |
+
logger.debug(f"[TTS] Text to speak: {text[:100]}...")
|
| 633 |
|
| 634 |
+
# Build prompt - description is metadata, only text should be spoken
|
| 635 |
prompt = build_maya1_prompt(tokenizer, description, text)
|
| 636 |
+
|
| 637 |
+
# Verify prompt structure - the description should be in the attribute, not in the spoken text
|
| 638 |
+
if description.lower() in prompt.lower() and description.lower() not in f'<description="{description.lower()}">':
|
| 639 |
+
logger.warning("[TTS] Warning: Description text appears in prompt outside of description attribute")
|
| 640 |
+
|
| 641 |
inputs = tokenizer(prompt, return_tensors="pt")
|
| 642 |
|
| 643 |
if torch.cuda.is_available():
|