# Hugging Face Space: anycoderapps/VibeVoice-Realtime-0.5B (running on ZeroGPU)
# File size: 12,518 bytes
# Revision: 26e0cd3

import gradio as gr
import torch
import os
import time
import copy
from pathlib import Path
from typing import Optional, Tuple
import spaces

from vibevoice.modular.modeling_vibevoice_streaming_inference import (
    VibeVoiceStreamingForConditionalGenerationInference,
)
from vibevoice.processor.vibevoice_streaming_processor import (
    VibeVoiceStreamingProcessor,
)


class VoiceMapper:
    """Map speaker names to voice preset (``.pt``) file paths.

    Attributes:
        voice_presets: Lookup table used by :meth:`get_voice_path`. Holds one
            entry per ``.pt`` file (keyed by file stem) plus short aliases
            derived from those stems.
        available_voices: File stems only (no aliases), sorted by name.
    """

    def __init__(self, voices_dir: Optional[str] = None):
        """Scan the voices directory and register short aliases.

        Args:
            voices_dir: Directory containing ``.pt`` voice files. Defaults to
                ``demo/voices/streaming_model`` next to this file.
        """
        self.setup_voice_presets(voices_dir)

        # Derive a short alias from each file stem: keep the text before the
        # first "_" and then the text after the last "-"
        # (e.g. "en-Carter_man" -> "Carter").
        aliases = {}
        for name, path in self.voice_presets.items():
            alias = name.split("_")[0].split("-")[-1]
            # An alias must never shadow a real file stem; otherwise an exact
            # request for that stem would resolve to a different file.
            if alias not in self.voice_presets:
                aliases[alias] = path
        self.voice_presets.update(aliases)

    def setup_voice_presets(self, voices_dir: Optional[str] = None):
        """Populate ``voice_presets`` and ``available_voices`` from disk.

        Args:
            voices_dir: Directory to scan for ``.pt`` files. Defaults to
                ``demo/voices/streaming_model`` next to this file.

        If the directory does not exist, a warning is printed and both
        mappings are left empty.
        """
        if voices_dir is None:
            voices_dir = os.path.join(
                os.path.dirname(__file__), "demo/voices/streaming_model"
            )

        self.voice_presets = {}
        self.available_voices = {}

        # isdir (rather than exists) so that a stray regular file at this
        # path is reported as "not found" instead of crashing os.listdir.
        if not os.path.isdir(voices_dir):
            print(f"Warning: Voices directory not found at {voices_dir}")
            return

        # Map file stem -> full path for every regular *.pt file.
        presets = {}
        for entry in os.listdir(voices_dir):
            full_path = os.path.join(voices_dir, entry)
            if entry.lower().endswith(".pt") and os.path.isfile(full_path):
                presets[os.path.splitext(entry)[0]] = full_path

        # Sort alphabetically by name for a stable, readable UI order.
        self.voice_presets = dict(sorted(presets.items()))

        # Redundant after the isfile check above, but kept as a safety net.
        self.available_voices = {
            name: path
            for name, path in self.voice_presets.items()
            if os.path.exists(path)
        }

        print(f"Found {len(self.available_voices)} voice files in {voices_dir}")
        print(f"Available voices: {', '.join(self.available_voices.keys())}")

    def get_voice_path(self, speaker_name: str) -> str:
        """Return the voice file path for ``speaker_name``.

        Resolution order: exact key match, then case-insensitive substring
        match in either direction, then the first preset as a default (with a
        printed warning). A ``None`` or empty name goes straight to the
        default.

        Args:
            speaker_name: Requested speaker (file stem, alias, or free text).

        Returns:
            Path of the selected voice file.

        Raises:
            IndexError: If no voice presets are loaded. The exception type
                matches the previous implicit failure; the message is now
                descriptive.
        """
        if not self.voice_presets:
            raise IndexError(
                "No voice presets are available (no .pt files were found in "
                "the voices directory)."
            )

        # First try exact match
        if speaker_name in self.voice_presets:
            return self.voice_presets[speaker_name]

        # Try partial matching (case insensitive). Skipped for an empty name,
        # because "" is a substring of every preset and would silently
        # "match" the first one without the warning below.
        speaker_lower = (speaker_name or "").lower()
        if speaker_lower:
            for preset_name, path in self.voice_presets.items():
                preset_lower = preset_name.lower()
                if preset_lower in speaker_lower or speaker_lower in preset_lower:
                    return path

        # Default to first voice if no match found
        default_voice = next(iter(self.voice_presets.values()))
        print(
            f"Warning: No voice preset found for '{speaker_name}', using default voice: {default_voice}"
        )
        return default_voice


# Module-level startup: runs once at import time and defines the globals
# (MODEL_PATH, PROCESSOR, MODEL, VOICE_MAPPER) that generate_speech relies on.
print("Loading VibeVoice-Realtime model...")

# Identifier handed to both from_pretrained calls below
# (presumably a Hugging Face Hub repo id — confirm).
MODEL_PATH = "microsoft/VibeVoice-Realtime-0.5B"

# Load processor; also provides the tokenizer and save_audio used by generate_speech
PROCESSOR = VibeVoiceStreamingProcessor.from_pretrained(MODEL_PATH)

# Load the model on CPU in float16. generate_speech moves it to CUDA explicitly
# with MODEL.to("cuda") for each request and back to CPU afterwards.
MODEL = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float16,
    device_map="cpu",
    attn_implementation="sdpa",
)

# Switch to evaluation mode (inference only, no training in this app)
MODEL.eval()
# NOTE(review): presumably the number of diffusion denoising steps per
# generation (speed/quality trade-off) — confirm against the vibevoice package.
MODEL.set_ddpm_inference_steps(num_steps=5)

# Initialize voice mapper (scans the voices directory for .pt presets)
VOICE_MAPPER = VoiceMapper()

print("Model loaded successfully!")


# Typographic (curly) quotes are folded to their ASCII equivalents before the
# text is handed to the processor.
_QUOTE_TRANSLATION = str.maketrans(
    {
        "\u2018": "'",
        "\u2019": "'",
        "\u201c": '"',
        "\u201d": '"',
    }
)


def _release_gpu() -> None:
    """Best-effort: move MODEL back to CPU and release cached CUDA memory."""
    try:
        MODEL.to("cpu")
        torch.cuda.empty_cache()
    except Exception as cleanup_error:
        # Cleanup must never mask the real result or error of a request.
        print(f"Warning: GPU cleanup failed: {cleanup_error}")


@spaces.GPU(duration=60)  # Request GPU for 60 seconds
def generate_speech(
    text: str,
    speaker_name: str,
    cfg_scale: float = 1.5,
    progress=gr.Progress(),
) -> Tuple[Optional[str], str]:
    """
    Generate speech from text using VibeVoice-Realtime with ZeroGPU

    Args:
        text: Input text to convert to speech
        speaker_name: Name of the speaker voice to use
        cfg_scale: Classifier-Free Guidance scale (higher = more faithful to text)
        progress: Gradio progress tracker

    Returns:
        Tuple of (audio_path, status_message). ``audio_path`` is ``None`` when
        the input is empty, no audio was produced, or an error occurred; in
        those cases ``status_message`` describes the problem. Exceptions are
        caught and reported through ``status_message`` rather than raised.
    """
    if not text or not text.strip():
        return None, "❌ Error: Please enter some text to convert to speech."

    try:
        progress(0, desc="Loading voice preset...")

        # Clean text: trim and replace curly quotes with plain ASCII quotes.
        full_script = text.strip().translate(_QUOTE_TRANSLATION)

        # Get voice sample
        voice_sample = VOICE_MAPPER.get_voice_path(speaker_name)

        # Load voice sample to GPU.
        # NOTE(security): weights_only=False unpickles arbitrary Python
        # objects. This is acceptable only because the path is resolved by
        # VOICE_MAPPER from preset files on disk; never pass a user-supplied
        # file here.
        all_prefilled_outputs = torch.load(
            voice_sample, map_location="cuda", weights_only=False
        )

        progress(0.2, desc="Preparing inputs...")

        # Prepare inputs
        inputs = PROCESSOR.process_input_with_cached_prompt(
            text=full_script,
            cached_prompt=all_prefilled_outputs,
            padding=True,
            return_tensors="pt",
            return_attention_mask=True,
        )

        # Move model and tensors to GPU
        MODEL.to("cuda")
        for k, v in inputs.items():
            if torch.is_tensor(v):
                inputs[k] = v.to("cuda")

        progress(0.4, desc="Generating speech on GPU...")

        # Generate audio
        start_time = time.time()
        # torch.autocast replaces the deprecated torch.cuda.amp.autocast().
        with torch.autocast(device_type="cuda"):
            outputs = MODEL.generate(
                **inputs,
                max_new_tokens=None,
                cfg_scale=cfg_scale,
                tokenizer=PROCESSOR.tokenizer,
                generation_config={"do_sample": False},
                verbose=False,
                # Pass a deep copy so the object loaded above is not shared
                # with generate().
                all_prefilled_outputs=copy.deepcopy(all_prefilled_outputs)
                if all_prefilled_outputs is not None
                else None,
            )
        generation_time = time.time() - start_time

        progress(0.8, desc="Saving audio...")

        speech = outputs.speech_outputs[0] if outputs.speech_outputs else None
        if speech is None:
            return None, "❌ Error: No audio output generated."

        # Calculate metrics
        sample_rate = 24000
        audio_samples = speech.shape[-1] if len(speech.shape) > 0 else len(speech)
        audio_duration = audio_samples / sample_rate
        rtf = generation_time / audio_duration if audio_duration > 0 else float("inf")

        # Save output. The random suffix keeps concurrent requests that land
        # in the same second from overwriting each other's file.
        import uuid

        output_dir = "./outputs"
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(
            output_dir, f"generated_{int(time.time())}_{uuid.uuid4().hex[:8]}.wav"
        )

        PROCESSOR.save_audio(
            speech.cpu(),  # Move to CPU for saving
            output_path=output_path,
        )

        progress(1.0, desc="Complete!")

        # Create status message (Markdown, rendered by the status component)
        status = (
            "✅ **Generation Complete!**\n"
            "            \n"
            "📊 **Metrics:**\n"
            f"- Audio Duration: {audio_duration:.2f}s\n"
            f"- Generation Time: {generation_time:.2f}s\n"
            f"- Real-Time Factor: {rtf:.2f}x\n"
            f"- Speaker: {speaker_name}\n"
            f"- CFG Scale: {cfg_scale}\n"
            "- Device: ZeroGPU (CUDA)\n"
            "            "
        )

        return output_path, status

    except Exception as e:
        import traceback

        error_msg = f"❌ Error during generation:\n{str(e)}\n\n{traceback.format_exc()}"
        print(error_msg)
        return None, error_msg

    finally:
        # Single cleanup point for success, "no audio" and error paths:
        # move the model back to CPU to free GPU memory.
        _release_gpu()


# Build the Gradio UI. The voice list is resolved once and reused for the
# dropdown choices, its initial value, and the example rows.
_voice_names = list(VOICE_MAPPER.available_voices.keys())
_default_voice = _voice_names[0] if _voice_names else None
_example_voice = _voice_names[0] if _voice_names else "Wayne"
_example_texts = (
    "VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker conversational audio.",
    "The quick brown fox jumps over the lazy dog. This is a test of the text-to-speech system.",
)

with gr.Blocks(fill_height=True) as demo:
    # Page header
    gr.Markdown(
        value="""
    # 🎙️ VibeVoice-Realtime Text-to-Speech
    
    Convert text to natural-sounding speech using Microsoft's VibeVoice-Realtime model.
    
    **🚀 Powered by ZeroGPU** - Efficient GPU allocation for fast inference!
    
    <div style="text-align: center; margin-top: 10px;">
        <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="text-decoration: none; color: #4F46E5; font-weight: 600;">
            Built with anycoder ✨
        </a>
    </div>
    """
    )

    with gr.Row():
        # Left column: text, voice and guidance controls plus the trigger button
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to Convert",
                placeholder="Enter the text you want to convert to speech...",
                lines=8,
                max_lines=20,
            )

            with gr.Row():
                speaker_dropdown = gr.Dropdown(
                    choices=_voice_names,
                    value=_default_voice,
                    label="Speaker Voice",
                    info="Select the voice to use for speech generation",
                )

                cfg_slider = gr.Slider(
                    minimum=1.0,
                    maximum=3.0,
                    value=1.5,
                    step=0.1,
                    label="CFG Scale",
                    info="Higher values = more faithful to text (1.0-3.0)",
                )

            generate_btn = gr.Button(value="🎵 Generate Speech", variant="primary", size="lg")

        # Right column: generated audio and status text
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
            )

            status_output = gr.Markdown(
                value="""
                **Status:** Ready to generate speech
                
                Enter text and click "Generate Speech" to start.
                
                ⚡ Using ZeroGPU for efficient processing
                """
            )

    # Clickable examples: one row per sample text, default voice, CFG 1.5
    gr.Examples(
        examples=[[sample, _example_voice, 1.5] for sample in _example_texts],
        inputs=[text_input, speaker_dropdown, cfg_slider],
        label="Example Inputs",
    )

    # Wire the button to the generator; also exposed as the "generate" API endpoint
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, speaker_dropdown, cfg_slider],
        outputs=[audio_output, status_output],
        api_name="generate",
    )

    # Footer notes
    gr.Markdown(
        value="""
    ---
    
    ### 📝 Notes:
    - **Model**: Microsoft VibeVoice-Realtime-0.5B
    - **Sample Rate**: 24kHz
    - **Context Length**: 8K tokens
    - **Generation Length**: ~10 minutes
    - **Infrastructure**: ZeroGPU (Hugging Face Spaces)
    
    ### ⚠️ Important:
    - The model is designed for English text only
    - Very short inputs (< 3 words) may produce unstable results
    - Code, formulas, and special symbols are not supported
    - Please use responsibly and disclose AI-generated content
    - GPU is allocated dynamically - generation may take a few seconds to start
    """
    )

# Entry point: start the app only when run as a script. Theme and footer
# links are passed to launch() (Gradio 6 syntax).
if __name__ == "__main__":
    soft_theme = gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="indigo",
        neutral_hue="slate",
    )
    anycoder_link = {
        "label": "Built with anycoder",
        "url": "https://huggingface.co/spaces/akhaliq/anycoder",
    }
    demo.launch(theme=soft_theme, footer_links=[anycoder_link])