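"""Gradio demo app for the TWB Voice TTS models by CLEAR Global.

Downloads Coqui-TTS (YourTTS-based) checkpoints from the Hugging Face Hub and
serves a simple text-to-speech interface with language and speaker selection.
"""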
import gradio as gr
import torch
from TTS.api import TTS
import numpy as np
import tempfile
import os

# Model configurations
MODELS = {
    "Hausa": {
        "model_repo": "CLEAR-Global/TWB-Voice-Hausa-TTS-1.0",
        "model_name": "best_model_498283.pth",
        "config_name": "config.json",
        "speakers": {
            "spk_f_1": "Female",
            "spk_m_1": "Male 1",
            "spk_m_2": "Male 2"
        },
        "examples": [
            "Lokacin damuna shuka kan koriya shar.",
            "Lafiyarku tafi kuɗinku muhimmanci.",
            "A kiyayi inda ake samun labarun magani ko kariya da cututtuka."
        ]
    }
}
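
# NOTE: The UI text below also mentions a Kanuri model (one female speaker;
# https://huggingface.co/CLEAR-Global/TWB-Voice-Kanuri-TTS-1.0). A "Kanuri"
# entry would mirror the "Hausa" one above, but its checkpoint filename and
# example sentences are not listed here, so it is omitted from MODELS.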

# Initialize models
device = "cuda" if torch.cuda.is_available() else "cpu"
loaded_models = {}

def load_model(language):
    """Load the TTS model for the specified language, caching it after first load."""
    if language not in loaded_models:
        model_repo = MODELS[language]["model_repo"]
        model_name = MODELS[language]["model_name"]
        config_name = MODELS[language]["config_name"]
        try:
            from huggingface_hub import hf_hub_download
            import json

            # First download and read the config to get the required filenames
            config_path = hf_hub_download(repo_id=model_repo, filename=config_name)
            with open(config_path, 'r') as f:
                config = json.load(f)

            # Extract filenames from the config (it stores training-time
            # paths, so keep just the basename, not the full path)
            speakers_filename = os.path.basename(config.get("speakers_file", "speakers.pth"))
            language_ids_filename = os.path.basename(config.get("language_ids_file", "language_ids.json"))
            d_vector_entry = config.get("d_vector_file", ["d_vector.pth"])
            if isinstance(d_vector_entry, str):  # may be a single path or a list of paths
                d_vector_entry = [d_vector_entry]
            d_vector_filename = os.path.basename(d_vector_entry[0])
            config_se_filename = os.path.basename(config.get("model_args", {}).get("speaker_encoder_config_path", "config_se.json"))
            model_se_filename = os.path.basename(config.get("model_args", {}).get("speaker_encoder_model_path", "model_se.pth"))

            # Download the model and auxiliary files from the Hugging Face repo
            model_path = hf_hub_download(repo_id=model_repo, filename=model_name)
            speakers_file = hf_hub_download(repo_id=model_repo, filename=speakers_filename)
            language_ids_file = hf_hub_download(repo_id=model_repo, filename=language_ids_filename)
            d_vector_file = hf_hub_download(repo_id=model_repo, filename=d_vector_filename)
            config_se_file = hf_hub_download(repo_id=model_repo, filename=config_se_filename)
            model_se_file = hf_hub_download(repo_id=model_repo, filename=model_se_filename)
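            # hf_hub_download caches files under the Hugging Face cache
            # directory (~/.cache/huggingface/hub by default), so repeated
            # loads reuse the already-downloaded copies.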
            # Update the config paths to point to the downloaded files
            config["speakers_file"] = speakers_file
            config["language_ids_file"] = language_ids_file
            config["d_vector_file"] = [d_vector_file]
            config["model_args"]["speakers_file"] = speakers_file
            config["model_args"]["language_ids_file"] = language_ids_file
            config["model_args"]["d_vector_file"] = [d_vector_file]
            config["model_args"]["speaker_encoder_config_path"] = config_se_file
            config["model_args"]["speaker_encoder_model_path"] = model_se_file

            # Save the updated config to a temporary file
            temp_config = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False)
            json.dump(config, temp_config, indent=2)
            temp_config.close()

            print(f"Loading {language} model with config:")
            print(f"- language_ids_file: {config.get('language_ids_file')}")
            print(f"- use_speaker_embedding: {config.get('use_speaker_embedding')}")
            print(f"- speakers_file: {config.get('speakers_file')}")
            print(f"- d_vector_file: {config.get('d_vector_file')}")

            # Load the TTS model with the patched config
            loaded_models[language] = TTS(model_path=model_path,
                                          config_path=temp_config.name,
                                          gpu=(device == "cuda"))
        except Exception as e:
            print(f"Error loading {language} model: {e}")
            import traceback
            traceback.print_exc()
            return None
    return loaded_models[language]
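
# Optional: pre-load models at startup so the first request does not pay the
# download/initialization cost (assumption: slower startup is acceptable).
# for lang in MODELS:
#     load_model(lang)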

def update_speakers(language):
    """Update the speaker dropdown based on the selected language."""
    if language in MODELS:
        speakers = MODELS[language]["speakers"]
        choices = [(f"{speaker_id}: {description}", speaker_id)
                   for speaker_id, description in speakers.items()]
        return gr.Dropdown(choices=choices, value=choices[0][1], interactive=True)
    return gr.Dropdown(choices=[], interactive=False)

def get_example_text(language, example_idx):
    """Get the example text at the given index for the selected language."""
    if language in MODELS and 0 <= example_idx < len(MODELS[language]["examples"]):
        return MODELS[language]["examples"][example_idx]
    return ""

def synthesize_speech(text, language, speaker):
    """Synthesize speech from text and return (audio_path, status_message)."""
    if not text.strip():
        return None, "Please enter some text to synthesize."

    # Load the model
    tts_model = load_model(language)
    if tts_model is None:
        return None, f"Failed to load {language} model."

    try:
        # The models expect lowercase input text
        text = text.lower().strip()
        print(f"DEBUG: Processing text: '{text}'")
        print(f"DEBUG: Speaker name: '{speaker}'")

        synthesizer = tts_model.synthesizer
        try:
            wav = synthesizer.tts(text=text, speaker_name=speaker)
        except TypeError:
            # Fall back to synthesis without a speaker name if the model
            # does not accept one
            wav = synthesizer.tts(text=text)
        print("DEBUG: synthesizer.tts() completed successfully")

        # Convert to a numpy array and save to a temporary file
        wav_array = np.array(wav, dtype=np.float32)
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")

        # Save audio using the synthesizer's sample rate
        import scipy.io.wavfile as wavfile
        wavfile.write(temp_file.name, synthesizer.output_sample_rate, wav_array)
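        # scipy writes float32 arrays as 32-bit float WAV; gr.Audio with
        # type="filepath" can play the file back directly.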
| print("Speech synthesized successfully!") | |
| return temp_file.name, "Speech synthesized successfully!" | |
| except Exception as e: | |
| return None, f"Error during synthesis: {str(e)}" | |

# Create the Gradio interface
with gr.Blocks(title="TWB Voice TTS Demo") as demo:
    gr.Markdown("""
    # TWB Voice Text-to-Speech Demo Space

    This demo showcases neural Text-to-Speech models developed within the TWB Voice project by CLEAR Global.
    It currently supports the **Hausa** and **Kanuri** languages, developed as part of the first phase of the project.

    ### Features:
    - **Hausa**: 3 speakers (1 female, 2 male)
    - **Kanuri**: 1 female speaker
    - High-quality 24kHz audio output
    - Based on the YourTTS architecture

    ### Links:
    - 🤗 [Hausa Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Hausa-TTS-1.0)
    - 🤗 [Kanuri Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Kanuri-TTS-1.0)
    - 📊 [Hausa Dataset](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Hausa-1.0-sampleset)
    - 📊 [Kanuri Dataset](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Kanuri-1.0-sampleset)
    - 🌐 [TWB Voice Project](https://twbvoice.org/)

    ---
    """)

    with gr.Row():
        with gr.Column():
            # Language selection
            language_dropdown = gr.Dropdown(
                choices=list(MODELS.keys()),
                value="Hausa",
                label="Language",
                info="Select the language for synthesis"
            )

            # Speaker selection (labels match the format used by update_speakers)
            speaker_dropdown = gr.Dropdown(
                choices=[(f"{speaker_id}: {description}", speaker_id)
                         for speaker_id, description in MODELS["Hausa"]["speakers"].items()],
                value="spk_f_1",
                label="Speaker",
                info="Select the voice speaker"
            )

            # Text input
            text_input = gr.Textbox(
                label="Text to synthesize",
                placeholder="Enter text in the selected language (it will be converted to lowercase)",
                lines=3,
                info="Note: Text is automatically converted to lowercase, as required by the models"
            )

            # Example buttons
            gr.Markdown("**Press to load a sentence in the selected language:**")
            with gr.Row():
                example_btn_1 = gr.Button("Example 1", size="sm")
                example_btn_2 = gr.Button("Example 2", size="sm")
                example_btn_3 = gr.Button("Example 3", size="sm")

            # Synthesize button
            synthesize_btn = gr.Button("🎤 Synthesize Speech", variant="primary")

        with gr.Column():
            # Audio output
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath"
            )

            # Status message
            status_output = gr.Textbox(
                label="Status",
                interactive=False
            )

    # Event handlers
    language_dropdown.change(
        fn=update_speakers,
        inputs=[language_dropdown],
        outputs=[speaker_dropdown]
    )
    example_btn_1.click(
        fn=lambda lang: get_example_text(lang, 0),
        inputs=[language_dropdown],
        outputs=[text_input]
    )
    example_btn_2.click(
        fn=lambda lang: get_example_text(lang, 1),
        inputs=[language_dropdown],
        outputs=[text_input]
    )
    example_btn_3.click(
        fn=lambda lang: get_example_text(lang, 2),
        inputs=[language_dropdown],
        outputs=[text_input]
    )
    synthesize_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, language_dropdown, speaker_dropdown],
        outputs=[audio_output, status_output]
    )
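
    # For heavier traffic, Gradio's request queue can serialize the GPU-bound
    # synthesis calls (assumption: the defaults are fine for a small demo):
    # demo.queue()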

    gr.Markdown("""
    ---
    ### Notes:
    - The models expect **lowercase input text** (the app converts it automatically)
    - Audio output is generated at a 24kHz sample rate

    ### License:
    This app and the models are released under the **CC-BY-NC-4.0** license (non-commercial use only).

    **Created by:** CLEAR Global with support from the Patrick J. McGovern Foundation
    """)

if __name__ == "__main__":
    demo.launch()