Spaces:

MCP-1st-Birthday
/

MedLLM-Agent

Running on Zero

App Files Files Community

Y Phung Nguyen committed 16 days ago

Commit

7009401

1 Parent(s): c5a9b84

Upd maya configs

Browse files

Files changed (1) hide show

voice.py +20 -3

voice.py CHANGED Viewed

@@ -549,7 +549,11 @@ def chunk_text_for_tts(text: str, max_length: int = MAX_CHUNK_LENGTH, min_length
     return chunks
 def build_maya1_prompt(tokenizer, description: str, text: str) -> str:
-    """Build formatted prompt for Maya1."""
     soh_token = tokenizer.decode([SOH_ID])
     eoh_token = tokenizer.decode([EOH_ID])
     soa_token = tokenizer.decode([SOA_ID])
@@ -557,11 +561,17 @@ def build_maya1_prompt(tokenizer, description: str, text: str) -> str:
     eot_token = tokenizer.decode([TEXT_EOT_ID])
     bos_token = tokenizer.bos_token
-    formatted_text = f'<description="{description}"> {text}'
     prompt = (
         soh_token + bos_token + formatted_text + eot_token +
         eoh_token + soa_token + sos_token
     )
     return prompt
 def unpack_snac_from_7(snac_tokens: list) -> list:
@@ -618,9 +628,16 @@ def _generate_speech_with_gpu(text: str, description: str = None):
             description = DEFAULT_VOICE_DESCRIPTION
         logger.info("[TTS] Running Maya1 TTS generation...")
-        # Build prompt
         prompt = build_maya1_prompt(tokenizer, description, text)
         inputs = tokenizer(prompt, return_tensors="pt")
         if torch.cuda.is_available():

     return chunks
 def build_maya1_prompt(tokenizer, description: str, text: str) -> str:
+    """Build formatted prompt for Maya1.
+    The description is used only for voice characteristics and should not be spoken.
+    Only the text after the description tag should be synthesized.
+    """
     soh_token = tokenizer.decode([SOH_ID])
     eoh_token = tokenizer.decode([EOH_ID])
     soa_token = tokenizer.decode([SOA_ID])
     eot_token = tokenizer.decode([TEXT_EOT_ID])
     bos_token = tokenizer.bos_token
+    # Ensure description is only metadata - add newline after description tag
+    # to clearly separate it from the text to be spoken
+    formatted_text = f'<description="{description}">\n{text}'
     prompt = (
         soh_token + bos_token + formatted_text + eot_token +
         eoh_token + soa_token + sos_token
     )
+    # Log the prompt structure for debugging (without the actual description text)
+    logger.debug(f"[TTS] Prompt structure: <description=\"...\">\\n[text to speak] (text length: {len(text)} chars)")
     return prompt
 def unpack_snac_from_7(snac_tokens: list) -> list:
             description = DEFAULT_VOICE_DESCRIPTION
         logger.info("[TTS] Running Maya1 TTS generation...")
+        logger.debug(f"[TTS] Voice description (metadata only, not spoken): {description[:80]}...")
+        logger.debug(f"[TTS] Text to speak: {text[:100]}...")
+        # Build prompt - description is metadata, only text should be spoken
         prompt = build_maya1_prompt(tokenizer, description, text)
+        # Verify prompt structure - the description should be in the attribute, not in the spoken text
+        if description.lower() in prompt.lower() and description.lower() not in f'<description="{description.lower()}">':
+            logger.warning("[TTS] Warning: Description text appears in prompt outside of description attribute")
         inputs = tokenizer(prompt, return_tensors="pt")
         if torch.cuda.is_available():