Spaces:
Runtime error
Runtime error
import os

# Accept the Coqui model licence non-interactively. Coqui TTS looks for the
# COQUI_TOS_AGREED environment variable when it fetches a model; if it is not
# set, the library asks for confirmation on stdin, which cannot be answered on
# a headless host. This must run before the TTS model is instantiated.
os.environ["COQUI_TOS_AGREED"] = "1"
| import gradio as gr | |
| import torch | |
| from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline | |
| from TTS.api import TTS | |
# Display name -> language code for every language offered in the UI.
# The display names populate the translation dropdown and the codes populate
# the TTS dropdown; insertion order is the order shown to the user.
languages = dict(
    (
        ("English", "en"),
        ("Spanish", "es"),
        ("French", "fr"),
        ("German", "de"),
        ("Italian", "it"),
        ("Portuguese", "pt"),
        ("Polish", "pl"),
        ("Turkish", "tr"),
        ("Russian", "ru"),
        ("Dutch", "nl"),
        ("Czech", "cs"),
        ("Arabic", "ar"),
        ("Chinese", "zh-cn"),
        ("Japanese", "ja"),
        ("Hungarian", "hu"),
        ("Korean", "ko"),
        ("Hindi", "hi"),
    )
)
# Model and device configuration.
# Use the first CUDA device with half precision when a GPU is available;
# otherwise fall back to the CPU with full precision.
if torch.cuda.is_available():
    device = "cuda:0"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

# Checkpoint identifiers for speech recognition and speech synthesis.
whisper_model_id = "openai/whisper-small"
tts_model_name = "tts_models/multilingual/multi-dataset/xtts_v2"  # Replace with your actual TTS model
# Load the Whisper model used for speech recognition and place it on the
# selected device with the matching dtype.
whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    whisper_model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

# The processor supplies the tokenizer and feature extractor for the pipeline.
whisper_processor = AutoProcessor.from_pretrained(whisper_model_id)

# Load the TTS model (for text-to-speech) and move it to the same device as
# Whisper; without the explicit .to(device) it stays on the CPU even when a
# GPU was selected above.
tts = TTS(model_name=tts_model_name, progress_bar=False).to(device)
# Translation Pipeline
# Pipelines that have already been built, keyed by the requested language, so
# repeated requests for the same language reuse one pipeline object instead of
# constructing a new one on every call.
_translate_pipelines = {}


def create_translate_pipeline(target_language):
    """Return the speech-recognition pipeline for ``target_language``.

    The pipeline wraps the module-level Whisper model and processor and is
    configured for 30-second chunks, batch size 1, timestamps enabled and at
    most 128 new tokens per chunk. ``target_language`` is forwarded to Whisper
    as the ``language`` generation argument.

    NOTE(review): the generation task is "transcribe", not "translate", so any
    translation relies on forcing the decoding language -- confirm this is the
    intended behaviour.

    Args:
        target_language: Language name or code passed to Whisper's generation.

    Returns:
        The pipeline for that language; the same object is returned on
        subsequent calls with the same ``target_language``.
    """
    cached = _translate_pipelines.get(target_language)
    if cached is None:
        cached = pipeline(
            "automatic-speech-recognition",
            model=whisper_model,
            tokenizer=whisper_processor.tokenizer,
            feature_extractor=whisper_processor.feature_extractor,
            max_new_tokens=128,
            chunk_length_s=30,
            batch_size=1,
            torch_dtype=torch_dtype,
            device=device,
            return_timestamps=True,
            generate_kwargs={"task": "transcribe", "language": target_language},
        )
        _translate_pipelines[target_language] = cached
    return cached
# Audio Processing Function
def process_audio(audio_file, translate_language, tts_language):
    """Transcribe an audio clip and re-synthesise the text in the clip's voice.

    Args:
        audio_file: Path to the uploaded or recorded audio. It is used both as
            the recognition input and as the speaker reference for synthesis.
        translate_language: Language forwarded to the recognition pipeline.
        tts_language: Language code forwarded to the TTS model.

    Returns:
        A ``(text, audio_path)`` tuple. On success ``text`` is the recognised
        text and ``audio_path`` is the generated WAV file. On any failure
        ``text`` is a human-readable message and ``audio_path`` is ``None``.
    """
    import os
    import tempfile

    # Nothing selected in the UI arrives as None/empty; report it directly
    # instead of letting the models fail with an opaque error.
    if not audio_file:
        return "Please upload or record an audio clip first.", None
    if not tts_language:
        return "Please choose a TTS synthesis language.", None

    output_audio_file = None
    try:
        # Create translation pipeline
        translate_pipeline = create_translate_pipeline(translate_language)
        # Transcribe and translate
        result = translate_pipeline(audio_file)["text"]
        if not result or not result.strip():
            return "No speech was recognised in the audio.", None

        # Write each result to its own file: a single shared "output.wav"
        # would be overwritten when two requests run at the same time.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
            output_audio_file = handle.name
        # Generate synthesized speech
        tts.tts_to_file(
            result,
            speaker_wav=audio_file,
            language=tts_language,
            file_path=output_audio_file,
        )
        return result, output_audio_file
    except Exception as e:
        # UI boundary: surface the failure as text rather than crashing the app,
        # and do not leave a half-written temp file behind.
        if output_audio_file and os.path.exists(output_audio_file):
            os.remove(output_audio_file)
        return f"An error occurred: {e}", None
# Gradio Interface
# The logo is optional. The original absolute path only exists on the author's
# machine, so also look for the same file next to this script and skip the
# image entirely when neither location has it.
_logo_candidates = (
    "/Users/mac/Desktop/VOX_AI/logo_transparent_background.png",
    os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "logo_transparent_background.png",
    ),
)
_logo_path = next((path for path in _logo_candidates if os.path.isfile(path)), None)

with gr.Blocks() as interface:
    gr.Markdown("# AI VOX LAB POC")
    gr.Markdown("Upload/record audio, translate, and get synthesized speech!")
    # Add the image here (only when a logo file was found)
    if _logo_path is not None:
        gr.Image(value=_logo_path, label="App Logo", show_label=False, width=700, height=250)
    with gr.Row():
        # type="filepath" hands process_audio a path on disk.
        audio_input = gr.Audio(type="filepath", label="Upload or Record Audio")
        # The translation dropdown lists display names; the TTS dropdown lists
        # the language codes.
        translate_lang = gr.Dropdown(choices=list(languages.keys()), label="Translation Language")
        tts_lang = gr.Dropdown(choices=list(languages.values()), label="TTS Synthesis Language")
    with gr.Row():
        translate_button = gr.Button("Translate and Synthesize")
    with gr.Row():
        text_output = gr.Textbox(label="Translated Text")
        audio_output = gr.Audio(label="Generated Audio")
    # Wire the button to the processing function: three inputs, two outputs.
    translate_button.click(
        fn=process_audio,
        inputs=[audio_input, translate_lang, tts_lang],
        outputs=[text_output, audio_output],
    )
# Launch the App
def main():
    """Start the Gradio app and request a public share link."""
    interface.launch(share=True)


# Only launch when executed as a script, not when imported.
if __name__ == "__main__":
    main()