Spaces:

chenxie95
/

Cross-Lingual_F5-TTS_Space

Running on Zero

App Files Files Community

QingyuLiu1 committed on Oct 10

Commit

4d7c9ce

1 Parent(s): 796c25a

test

Browse files

Files changed (6) hide show

.DS_Store +0 -0
.vscode/settings.json +5 -0
README.md +11 -2
app.py +125 -4
requirement.txt +1 -0
utils_clf5space.py +0 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

.vscode/settings.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+    "python-envs.defaultEnvManager": "ms-python.python:conda",
+    "python-envs.defaultPackageManager": "ms-python.python:conda",
+    "python-envs.pythonProjects": []
+}

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 title: Cross-Lingual F5-TTS Space
-emoji: 🏆
 colorFrom: pink
 colorTo: purple
 sdk: gradio
@@ -11,4 +11,13 @@ license: apache-2.0
 short_description: Cross-Lingual F5-TTS Online Demo for Dev Test
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Cross-Lingual F5-TTS Space
+emoji: 🎙️
 colorFrom: pink
 colorTo: purple
 sdk: gradio
 short_description: Cross-Lingual F5-TTS Online Demo for Dev Test
 ---
+# Cross-Lingual F5-TTS Demo
+This Space demonstrates the Cross-Lingual F5-TTS model for multilingual text-to-speech synthesis.
+## Features
+- Cross-lingual voice cloning
+- High-quality speech synthesis
+## Model
+The model checkpoint is hosted at: [QingyuLiu1/Cross-Lingual_F5-TTS](https://huggingface.co/QingyuLiu1/Cross-Lingual_F5-TTS)

app.py CHANGED Viewed

@@ -1,7 +1,128 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

 import gradio as gr
+import numpy as np
+import spaces
+import torch
+from cached_path import cached_path
+from f5_tts.infer.utils_infer import (
+    infer_process,
+    load_model,
+    load_vocoder,
+    preprocess_ref_audio_text,
+)
+from f5_tts.model import DiT
+vocoder = load_vocoder()  # created once at import time; shared by every infer() call
+# Cross-Lingual F5-TTS configuration: keyword arguments handed to load_model together with the DiT class
+model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
+vocab_path = str(cached_path("hf://SWivid/F5-TTS/F5TTS_v1_Base/vocab.txt"))  # Vocab file is taken from the F5TTS_v1_Base model repo, not from the cross-lingual checkpoint repo
+# Load the Cross-Lingual F5-TTS model once at module import, from the clf5_950000 checkpoint resolved via cached_path
+cross_lingual_model = load_model(
+    DiT,
+    model_cfg,
+    str(cached_path("hf://QingyuLiu1/Cross-Lingual_F5-TTS/clf5_950000.safetensors")),
+    vocab_file=vocab_path,
+)
+@spaces.GPU
+def infer(
+    ref_audio_orig,
+    gen_text,
+    seed,
+    show_info=gr.Info,
+):
+    # Fixed reference text
+    ref_text = "Hello World! I'm Qingyu Liu."
+    if not ref_audio_orig or not gen_text.strip():
+        gr.Warning("Please ensure [Reference Audio] and [Text to Generate] are both provided.")
+        return gr.update(), seed
+    if seed < 0 or seed > 2**31 - 1:
+        gr.Warning("Please set a seed in range 0 ~ 2**31 - 1.")
+        seed = np.random.randint(0, 2**31 - 1)
+    torch.manual_seed(seed)
+    used_seed = seed
+    ref_audio, ref_text_processed = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
+    final_wave, final_sample_rate, _ = infer_process(
+        ref_audio,
+        ref_text_processed,
+        gen_text,
+        cross_lingual_model,
+        vocoder,
+        show_info=show_info,
+        progress=gr.Progress(),
+    )
+    return (final_sample_rate, final_wave), used_seed
+with gr.Blocks() as app_basic_tts:
+    with gr.Row():
+        with gr.Column():
+            ref_wav_input = gr.Audio(label="Reference Audio", type="filepath")  # type="filepath": the handler receives a path string
+            # No reference-text box: infer() always uses its fixed reference text
+            gen_txt_input = gr.Textbox(label="Text to Generate")
+            generate_btn = gr.Button("Synthesize", variant="primary")
+            with gr.Row():
+                randomize_seed = gr.Checkbox(
+                    label="Randomize Seed",
+                    info="Check to use a random seed for each generation. Uncheck to use the seed specified.",
+                    value=True,
+                    scale=3,
+                )
+                seed_input = gr.Number(show_label=False, value=0, precision=0, scale=1)  # also an output: the click handler writes the used seed back here
+        audio_output = gr.Audio(label="Synthesized Audio")
+    def basic_tts(
+        ref_wav_input,
+        gen_txt_input,
+        randomize_seed,
+        seed_input,
+    ):
+        if randomize_seed:
+            seed_input = np.random.randint(0, 2**31 - 1)
+        audio_out, used_seed = infer(
+            ref_wav_input,
+            gen_txt_input,
+            seed_input,
+        )
+        return audio_out, used_seed
+    generate_btn.click(  # run basic_tts when the "Synthesize" button is pressed
+        basic_tts,
+        inputs=[
+            ref_wav_input,
+            gen_txt_input,
+            randomize_seed,
+            seed_input,
+        ],
+        outputs=[audio_output, seed_input],  # seed_input is overwritten with the seed basic_tts returns
+    )
+with gr.Blocks() as demo:
+    gr.Markdown(
+        """
+        # 🗣️ Cross-Lingual F5-TTS Online Demo
+        Upload or record a reference voice, then input the text to generate and have fun!
+        """
+    )
+    # Single-tab layout: no language or model selection dropdowns are offered
+    gr.TabbedInterface(
+        [app_basic_tts],
+        ["Basic-TTS"],
+    )
+if __name__ == "__main__":  # launch the UI only when this file is executed as a script
+    demo.launch()

requirement.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ f5-tts

utils_clf5space.py ADDED Viewed

File without changes