QingyuLiu1 committed on
Commit
4d7c9ce
·
1 Parent(s): 796c25a
Files changed (6) hide show
  1. .DS_Store +0 -0
  2. .vscode/settings.json +5 -0
  3. README.md +11 -2
  4. app.py +125 -4
  5. requirement.txt +1 -0
  6. utils_clf5space.py +0 -0
.DS_Store ADDED
Binary file (6.15 kB). View file
 
.vscode/settings.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "python-envs.defaultEnvManager": "ms-python.python:conda",
3
+ "python-envs.defaultPackageManager": "ms-python.python:conda",
4
+ "python-envs.pythonProjects": []
5
+ }
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  title: Cross-Lingual F5-TTS Space
3
- emoji: 🏆
4
  colorFrom: pink
5
  colorTo: purple
6
  sdk: gradio
@@ -11,4 +11,13 @@ license: apache-2.0
11
  short_description: Cross-Lingual F5-TTS Online Demo for Dev Test
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: Cross-Lingual F5-TTS Space
3
+ emoji: 🎙️
4
  colorFrom: pink
5
  colorTo: purple
6
  sdk: gradio
 
11
  short_description: Cross-Lingual F5-TTS Online Demo for Dev Test
12
  ---
13
 
14
+ # Cross-Lingual F5-TTS Demo
15
+
16
+ This Space demonstrates the Cross-Lingual F5-TTS model for multilingual text-to-speech synthesis.
17
+
18
+ ## Features
19
+ - Cross-lingual voice cloning
20
+ - High-quality speech synthesis
21
+
22
+ ## Model
23
+ The model checkpoint is hosted at: [QingyuLiu1/Cross-Lingual_F5-TTS](https://huggingface.co/QingyuLiu1/Cross-Lingual_F5-TTS)
app.py CHANGED
@@ -1,7 +1,128 @@
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import numpy as np
3
+ import spaces
4
+ import torch
5
+ from cached_path import cached_path
6
+ from f5_tts.infer.utils_infer import (
7
+ infer_process,
8
+ load_model,
9
+ load_vocoder,
10
+ preprocess_ref_audio_text,
11
+ )
12
+ from f5_tts.model import DiT
13
 
 
 
14
 
15
# Vocoder instance created once at import time and shared by every request.
vocoder = load_vocoder()

# Cross-Lingual F5-TTS configuration: keyword arguments for the DiT backbone.
model_cfg = {
    "dim": 1024,
    "depth": 22,
    "heads": 16,
    "ff_mult": 2,
    "text_dim": 512,
    "conv_layers": 4,
}
# Using the same vocab as the base model (F5TTS_v1_Base).
vocab_path = str(cached_path("hf://SWivid/F5-TTS/F5TTS_v1_Base/vocab.txt"))

# Load the Cross-Lingual F5-TTS model from its published checkpoint.
cross_lingual_model = load_model(
    DiT,
    model_cfg,
    str(cached_path("hf://QingyuLiu1/Cross-Lingual_F5-TTS/clf5_950000.safetensors")),
    vocab_file=vocab_path,
)
28
+
29
+
30
@spaces.GPU
def infer(
    ref_audio_orig,
    gen_text,
    seed,
    show_info=gr.Info,
):
    """Synthesize ``gen_text`` using the uploaded reference audio.

    Args:
        ref_audio_orig: Reference audio as delivered by the UI audio input.
        gen_text: Text to synthesize.
        seed: Seed passed to ``torch.manual_seed``. A missing (``None``) or
            out-of-range value triggers a warning and is replaced by a
            randomly drawn seed.
        show_info: Callback forwarded to the F5-TTS helpers for status
            messages.

    Returns:
        ``((sample_rate, waveform), used_seed)`` on success. If the reference
        audio or the text is missing, ``(gr.update(), seed)`` is returned so
        the audio output is left unchanged.
    """
    # Fixed reference text
    ref_text = "Hello World! I'm Qingyu Liu."

    # `not gen_text` also covers None, which would otherwise crash on .strip().
    if not ref_audio_orig or not gen_text or not gen_text.strip():
        gr.Warning("Please ensure [Reference Audio] and [Text to Generate] are both provided.")
        return gr.update(), seed

    # An emptied number field arrives as None; comparing None with an int
    # raises TypeError, so treat it exactly like an out-of-range seed.
    if seed is None or seed < 0 or seed > 2**31 - 1:
        gr.Warning("Please set a seed in range 0 ~ 2**31 - 1.")
        seed = np.random.randint(0, 2**31 - 1)
    torch.manual_seed(seed)
    used_seed = seed

    ref_audio, ref_text_processed = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)

    # Third return value of infer_process is not used by this demo.
    final_wave, final_sample_rate, _ = infer_process(
        ref_audio,
        ref_text_processed,
        gen_text,
        cross_lingual_model,
        vocoder,
        show_info=show_info,
        progress=gr.Progress(),
    )

    return (final_sample_rate, final_wave), used_seed
63
+
64
+
65
# Basic-TTS tab: takes a reference audio file and the text to generate, and
# shows the synthesized audio plus the seed that was used.
# NOTE(review): indentation was lost in this diff rendering, so the exact
# Row/Column nesting of the components below cannot be confirmed from here.
+ with gr.Blocks() as app_basic_tts:
66
+ with gr.Row():
67
+ with gr.Column():
68
+ ref_wav_input = gr.Audio(label="Reference Audio", type="filepath")
69
+ # Removed ref_txt_input - using fixed text instead
70
+ gen_txt_input = gr.Textbox(label="Text to Generate")
71
+ generate_btn = gr.Button("Synthesize", variant="primary")
72
+ with gr.Row():
73
+ randomize_seed = gr.Checkbox(
74
+ label="Randomize Seed",
75
+ info="Check to use a random seed for each generation. Uncheck to use the seed specified.",
76
+ value=True,
77
+ scale=3,
78
+ )
79
+ seed_input = gr.Number(show_label=False, value=0, precision=0, scale=1)
80
+ audio_output = gr.Audio(label="Synthesized Audio")
81
+
82
# Click handler: when "Randomize Seed" is checked, the seed from the number
# field is replaced by a random one before delegating to infer().
# Returns the pair (audio_out, used_seed) produced by infer().
+ def basic_tts(
83
+ ref_wav_input,
84
+ gen_txt_input,
85
+ randomize_seed,
86
+ seed_input,
87
+ ):
88
+ if randomize_seed:
89
+ seed_input = np.random.randint(0, 2**31 - 1)
90
+
91
+ audio_out, used_seed = infer(
92
+ ref_wav_input,
93
+ gen_txt_input,
94
+ seed_input,
95
+ )
96
+ return audio_out, used_seed
97
+
98
# Wire the button to the handler. The second output writes the seed that was
# actually used back into seed_input.
+ generate_btn.click(
99
+ basic_tts,
100
+ inputs=[
101
+ ref_wav_input,
102
+ gen_txt_input,
103
+ randomize_seed,
104
+ seed_input,
105
+ ],
106
+ outputs=[audio_output, seed_input],
107
+ )
108
+
109
+
110
# Top-level page: a Markdown header followed by a tabbed interface that holds
# the single "Basic-TTS" tab built in app_basic_tts.
+ with gr.Blocks() as demo:
111
+ gr.Markdown(
112
+ """
113
+ # 🗣️ Cross-Lingual F5-TTS Online Demo
114
+
115
+ Upload or record a reference voice, then input the text to generate and have fun!
116
+ """
117
+ )
118
+
119
+ # Removed language and model selection dropdowns
120
+
121
+ gr.TabbedInterface(
122
+ [app_basic_tts],
123
+ ["Basic-TTS"],
124
+ )
125
+
126
+
127
# Launch the app only when this file is executed as a script.
+ if __name__ == "__main__":
128
+ demo.launch()
requirement.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ f5-tts
utils_clf5space.py ADDED
File without changes