Y Phung Nguyen committed on
Commit
7009401
·
1 Parent(s): c5a9b84

Update Maya configs

Browse files
Files changed (1) hide show
  1. voice.py +20 -3
voice.py CHANGED
@@ -549,7 +549,11 @@ def chunk_text_for_tts(text: str, max_length: int = MAX_CHUNK_LENGTH, min_length
549
  return chunks
550
 
551
  def build_maya1_prompt(tokenizer, description: str, text: str) -> str:
552
- """Build formatted prompt for Maya1."""
 
 
 
 
553
  soh_token = tokenizer.decode([SOH_ID])
554
  eoh_token = tokenizer.decode([EOH_ID])
555
  soa_token = tokenizer.decode([SOA_ID])
@@ -557,11 +561,17 @@ def build_maya1_prompt(tokenizer, description: str, text: str) -> str:
557
  eot_token = tokenizer.decode([TEXT_EOT_ID])
558
  bos_token = tokenizer.bos_token
559
 
560
- formatted_text = f'<description="{description}"> {text}'
 
 
561
  prompt = (
562
  soh_token + bos_token + formatted_text + eot_token +
563
  eoh_token + soa_token + sos_token
564
  )
 
 
 
 
565
  return prompt
566
 
567
  def unpack_snac_from_7(snac_tokens: list) -> list:
@@ -618,9 +628,16 @@ def _generate_speech_with_gpu(text: str, description: str = None):
618
  description = DEFAULT_VOICE_DESCRIPTION
619
 
620
  logger.info("[TTS] Running Maya1 TTS generation...")
 
 
621
 
622
- # Build prompt
623
  prompt = build_maya1_prompt(tokenizer, description, text)
 
 
 
 
 
624
  inputs = tokenizer(prompt, return_tensors="pt")
625
 
626
  if torch.cuda.is_available():
 
549
  return chunks
550
 
551
  def build_maya1_prompt(tokenizer, description: str, text: str) -> str:
552
+ """Build formatted prompt for Maya1.
553
+
554
+ The description is used only for voice characteristics and should not be spoken.
555
+ Only the text after the description tag should be synthesized.
556
+ """
557
  soh_token = tokenizer.decode([SOH_ID])
558
  eoh_token = tokenizer.decode([EOH_ID])
559
  soa_token = tokenizer.decode([SOA_ID])
 
561
  eot_token = tokenizer.decode([TEXT_EOT_ID])
562
  bos_token = tokenizer.bos_token
563
 
564
+ # Ensure description is only metadata - add newline after description tag
565
+ # to clearly separate it from the text to be spoken
566
+ formatted_text = f'<description="{description}">\n{text}'
567
  prompt = (
568
  soh_token + bos_token + formatted_text + eot_token +
569
  eoh_token + soa_token + sos_token
570
  )
571
+
572
+ # Log the prompt structure for debugging (without the actual description text)
573
+ logger.debug(f"[TTS] Prompt structure: <description=\"...\">\\n[text to speak] (text length: {len(text)} chars)")
574
+
575
  return prompt
576
 
577
  def unpack_snac_from_7(snac_tokens: list) -> list:
 
628
  description = DEFAULT_VOICE_DESCRIPTION
629
 
630
  logger.info("[TTS] Running Maya1 TTS generation...")
631
+ logger.debug(f"[TTS] Voice description (metadata only, not spoken): {description[:80]}...")
632
+ logger.debug(f"[TTS] Text to speak: {text[:100]}...")
633
 
634
+ # Build prompt - description is metadata, only text should be spoken
635
  prompt = build_maya1_prompt(tokenizer, description, text)
636
+
637
+ # Verify prompt structure - the description should be in the attribute, not in the spoken text
638
+ if description.lower() in prompt.lower() and description.lower() not in f'<description="{description.lower()}">':
639
+ logger.warning("[TTS] Warning: Description text appears in prompt outside of description attribute")
640
+
641
  inputs = tokenizer(prompt, return_tensors="pt")
642
 
643
  if torch.cuda.is_available():