Spaces:
Paused
Paused
License and minor adjusts
Browse files- Dockerfile +2 -0
- app.py +103 -7
Dockerfile
CHANGED
|
@@ -13,6 +13,8 @@ RUN python -m pip install --use-deprecated=legacy-resolver -r requirements.txt \
|
|
| 13 |
RUN python -m unidic download
|
| 14 |
RUN mkdir -p /app/tts_models
|
| 15 |
|
|
|
|
|
|
|
| 16 |
COPY xtts.py .
|
| 17 |
COPY app.py .
|
| 18 |
|
|
|
|
| 13 |
RUN python -m unidic download
|
| 14 |
RUN mkdir -p /app/tts_models
|
| 15 |
|
| 16 |
+
RUN python -m pip install spaces
|
| 17 |
+
|
| 18 |
COPY xtts.py .
|
| 19 |
COPY app.py .
|
| 20 |
|
app.py
CHANGED
|
@@ -7,7 +7,7 @@ from os.path import abspath
|
|
| 7 |
import zipfile
|
| 8 |
import random
|
| 9 |
import xtts
|
| 10 |
-
|
| 11 |
|
| 12 |
DO_CHECK = os.getenv('DO_CHECK', '1')
|
| 13 |
OUTPUT = "./demo_outputs"
|
|
@@ -84,6 +84,32 @@ def ExtractVars(input_string):
|
|
| 84 |
return result_dict, filtered_string
|
| 85 |
|
| 86 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
def FindSpeakerByName(name, speakerType):
|
| 88 |
|
| 89 |
srcItems = STUDIO_SPEAKERS if speakerType == "Studio" else cloned_speakers;
|
|
@@ -105,11 +131,12 @@ def clone_speaker(upload_file, clone_speaker_name, cloned_speaker_names):
|
|
| 105 |
cloned_speaker_names.append(clone_speaker_name)
|
| 106 |
return upload_file, clone_speaker_name, cloned_speaker_names, gr.Dropdown(choices=cloned_speaker_names)
|
| 107 |
|
| 108 |
-
def tts(text, speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature
|
| 109 |
,speed,top_p,top_k, AllFileList,progress=gr.Progress()
|
| 110 |
):
|
| 111 |
embeddings = STUDIO_SPEAKERS[speaker_name_studio] if speaker_type == 'Studio' else cloned_speakers[speaker_name_custom]
|
| 112 |
|
|
|
|
| 113 |
# break at line!
|
| 114 |
lines = text.split("---");
|
| 115 |
totalLines = len(lines);
|
|
@@ -122,6 +149,12 @@ def tts(text, speaker_type, speaker_name_studio, speaker_name_custom, lang, temp
|
|
| 122 |
CurrentPrefix = DefaultPrefix
|
| 123 |
|
| 124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
AudioList = [];
|
| 126 |
for line in progress.tqdm(lines, desc="Gerando fala..."):
|
| 127 |
audioNum += 1;
|
|
@@ -154,11 +187,27 @@ def tts(text, speaker_type, speaker_name_studio, speaker_name_custom, lang, temp
|
|
| 154 |
|
| 155 |
if not speakerName:
|
| 156 |
raise ValueError("InvalidSpeaker: "+speakerName)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
ipts = xtts.TTSInputs(
|
| 159 |
speaker_embedding=embeddings["speaker_embedding"],
|
| 160 |
gpt_cond_latent=embeddings["gpt_cond_latent"],
|
| 161 |
-
text=
|
| 162 |
language=lang,
|
| 163 |
temperature=temperature,
|
| 164 |
speed=speed,
|
|
@@ -246,6 +295,7 @@ with gr.Blocks(js=js) as demo:
|
|
| 246 |
cloned_speaker_names = gr.State(list(cloned_speakers.keys()))
|
| 247 |
AllFileList = gr.State(list([]))
|
| 248 |
|
|
|
|
| 249 |
|
| 250 |
with gr.Tab("TTS"):
|
| 251 |
with gr.Column() as row4:
|
|
@@ -268,9 +318,12 @@ with gr.Blocks(js=js) as demo:
|
|
| 268 |
top_k = gr.Number(label="TOP K",value=50)
|
| 269 |
speed = gr.Slider(0.00, 1000.00, 1.0, step=0.1, label="Speed", info="Speed (0 to 1000)")
|
| 270 |
with gr.Column() as col2:
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
|
|
|
|
|
|
|
|
|
| 274 |
with gr.Column() as col3:
|
| 275 |
# FileList = gr.FileExplorer(
|
| 276 |
# glob="*.wav",
|
|
@@ -302,6 +355,49 @@ with gr.Blocks(js=js) as demo:
|
|
| 302 |
clone_speaker_name = gr.Textbox(label="Speaker name", value="default_speaker")
|
| 303 |
clone_button = gr.Button(value="Clone speaker")
|
| 304 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
clone_button.click(
|
| 306 |
fn=clone_speaker,
|
| 307 |
inputs=[upload_file, clone_speaker_name, cloned_speaker_names],
|
|
@@ -310,7 +406,7 @@ with gr.Blocks(js=js) as demo:
|
|
| 310 |
|
| 311 |
tts_button.click(
|
| 312 |
fn=tts,
|
| 313 |
-
inputs=[text, speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature
|
| 314 |
,speed,top_p,top_k,AllFileList
|
| 315 |
],
|
| 316 |
outputs=[AudioList],
|
|
|
|
| 7 |
import zipfile
|
| 8 |
import random
|
| 9 |
import xtts
|
| 10 |
+
import re
|
| 11 |
|
| 12 |
DO_CHECK = os.getenv('DO_CHECK', '1')
|
| 13 |
OUTPUT = "./demo_outputs"
|
|
|
|
| 84 |
return result_dict, filtered_string
|
| 85 |
|
| 86 |
|
| 87 |
+
def ParsePronucs(PronuncStr):
    """Parse the "Pronunciation Fix" text into a list of replacement rules.

    Each non-blank line is expected to look like ``WORD = SPEAK`` or
    ``WORD = SPEAK | opt1,opt2``. Only the first ``=`` separates the word
    from its spoken form, and only the first ``|`` separates the spoken form
    from the comma-separated options, so the spoken form may itself contain
    ``=`` or ``,`` (e.g. ``API = A,P,I``).

    Lines that have no ``=`` or whose word part is empty are skipped rather
    than raising, so a single malformed line does not abort the whole
    request.

    Args:
        PronuncStr: Multi-line string with one rule per line. ``None`` or an
            empty string yields no rules.

    Returns:
        A list (in input order) of dicts with the keys ``'word'``,
        ``'text'`` and ``'opts'``; ``'opts'`` is a list of stripped,
        non-empty option strings.
    """
    PronuncWords = []

    for line in (PronuncStr or '').split('\n'):
        line = line.strip()
        if not line:
            continue

        word, separator, spoken = line.partition('=')
        word = word.strip()

        # Without "=" there is nothing to map to, and an empty word would
        # turn into a pattern that matches at every word boundary.
        if not separator or not word:
            continue

        text, _, rawOpts = spoken.partition('|')

        # Strip each option so that "| x, cs" still yields a plain "cs".
        opts = [opt.strip() for opt in rawOpts.split(',') if opt.strip()]

        PronuncWords.append({'word': word, 'text': text.strip(), 'opts': opts})

    return PronuncWords
|
| 111 |
+
|
| 112 |
+
|
| 113 |
def FindSpeakerByName(name, speakerType):
|
| 114 |
|
| 115 |
srcItems = STUDIO_SPEAKERS if speakerType == "Studio" else cloned_speakers;
|
|
|
|
| 131 |
cloned_speaker_names.append(clone_speaker_name)
|
| 132 |
return upload_file, clone_speaker_name, cloned_speaker_names, gr.Dropdown(choices=cloned_speaker_names)
|
| 133 |
|
| 134 |
+
def tts(text, pronunc, speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature
|
| 135 |
,speed,top_p,top_k, AllFileList,progress=gr.Progress()
|
| 136 |
):
|
| 137 |
embeddings = STUDIO_SPEAKERS[speaker_name_studio] if speaker_type == 'Studio' else cloned_speakers[speaker_name_custom]
|
| 138 |
|
| 139 |
+
|
| 140 |
# break at line!
|
| 141 |
lines = text.split("---");
|
| 142 |
totalLines = len(lines);
|
|
|
|
| 149 |
CurrentPrefix = DefaultPrefix
|
| 150 |
|
| 151 |
|
| 152 |
+
# Parse the pronunciation overrides (one "WORD = SPEAK" rule per line)
|
| 153 |
+
Pronuncs = ParsePronucs(pronunc)
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
|
| 158 |
AudioList = [];
|
| 159 |
for line in progress.tqdm(lines, desc="Gerando fala..."):
|
| 160 |
audioNum += 1;
|
|
|
|
| 187 |
|
| 188 |
if not speakerName:
|
| 189 |
raise ValueError("InvalidSpeaker: "+speakerName)
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
FixedText = cleanLine;
|
| 194 |
+
|
| 195 |
+
for pronunc in Pronuncs:
|
| 196 |
+
word = pronunc['word']
|
| 197 |
+
text = pronunc['text']
|
| 198 |
+
opts = pronunc['opts'];
|
| 199 |
+
|
| 200 |
+
flg = re.IGNORECASE
|
| 201 |
+
|
| 202 |
+
if 'cs' in opts:
|
| 203 |
+
flg = 0;
|
| 204 |
+
|
| 205 |
+
FixedText = re.sub(f'\\b{word}\\b', text, FixedText, flags=flg)
|
| 206 |
|
| 207 |
ipts = xtts.TTSInputs(
|
| 208 |
speaker_embedding=embeddings["speaker_embedding"],
|
| 209 |
gpt_cond_latent=embeddings["gpt_cond_latent"],
|
| 210 |
+
text=FixedText,
|
| 211 |
language=lang,
|
| 212 |
temperature=temperature,
|
| 213 |
speed=speed,
|
|
|
|
| 295 |
cloned_speaker_names = gr.State(list(cloned_speakers.keys()))
|
| 296 |
AllFileList = gr.State(list([]))
|
| 297 |
|
| 298 |
+
gr.Markdown("By using any functionality of this space, you agree to the terms of this license: https://coqui.ai/cpml")
|
| 299 |
|
| 300 |
with gr.Tab("TTS"):
|
| 301 |
with gr.Column() as row4:
|
|
|
|
| 318 |
top_k = gr.Number(label="TOP K",value=50)
|
| 319 |
speed = gr.Slider(0.00, 1000.00, 1.0, step=0.1, label="Speed", info="Speed (0 to 1000)")
|
| 320 |
with gr.Column() as col2:
|
| 321 |
+
with gr.Row():
|
| 322 |
+
text = gr.Textbox(label="text",lines=4, value="Customizado por IA Talking, o maior blog de Inteligência Artificial do Brasil!")
|
| 323 |
+
pronunc = gr.Textbox(label="Pronunciation Fix", info="Fix words pronuncation using WORD = SPEAK",lines=4)
|
| 324 |
+
with gr.Row():
|
| 325 |
+
lang = gr.Dropdown(label="Language", choices=LANUGAGES, value="pt")
|
| 326 |
+
tts_button = gr.Button(value="TTS")
|
| 327 |
with gr.Column() as col3:
|
| 328 |
# FileList = gr.FileExplorer(
|
| 329 |
# glob="*.wav",
|
|
|
|
| 355 |
clone_speaker_name = gr.Textbox(label="Speaker name", value="default_speaker")
|
| 356 |
clone_button = gr.Button(value="Clone speaker")
|
| 357 |
|
| 358 |
+
|
| 359 |
+
with gr.Tab("Help"):
|
| 360 |
+
gr.Markdown("""
|
| 361 |
+
Welcome to the XTTS WebUI version customized by the IA Talking blog (https://iatalk.ing).
|
| 362 |
+
|
| 363 |
+
The main goal of this space is to share more scenarios on how XTTS can be used, as well as serve as a study resource to learn more about the TTS process and AI.
|
| 364 |
+
|
| 365 |
+
In this version, we have some customizations that are quite useful.
|
| 366 |
+
|
| 367 |
+
# Multiple audios
|
| 368 |
+
You can generate multiple audios at once by separating the text with three dashes. For example:
|
| 369 |
+
|
| 370 |
+
```
|
| 371 |
+
Text 1
|
| 372 |
+
---
|
| 373 |
+
Text 2, line 1
|
| 374 |
+
Text 2, line 2
|
| 375 |
+
```
|
| 376 |
+
|
| 377 |
+
In the above example, 2 audio files will be generated! This is very useful when you want to generate a lot of audio but don't want to generate it all at once due to the context lost in XTTS.
|
| 378 |
+
You can also specify variables that modify certain aspects.
|
| 379 |
+
|
| 380 |
+
For example, `!speaker = Dionisio` forces the speaker to be Dionisio only for that specific audio.
|
| 381 |
+
|
| 382 |
+
List of variables:
|
| 383 |
+
- `speaker` = name of the speaker
|
| 384 |
+
- `num` = file number (by default, it's the sequential number)
|
| 385 |
+
- `prefix` = file name prefix
|
| 386 |
+
|
| 387 |
+
# Pronunciation adjustment
|
| 388 |
+
|
| 389 |
+
If you have a text that you cannot or do not want to change the content of, you can use the Pronunciation field to map words with different pronunciations.
|
| 390 |
+
|
| 391 |
+
Simply separate them by each line. Example:
|
| 392 |
+
|
| 393 |
+
```
|
| 394 |
+
API = A,P,I
|
| 395 |
+
SomeFunctionCode = Function Code
|
| 396 |
+
```
|
| 397 |
+
|
| 398 |
+
This is useful for mapping foreign words, abbreviations, acronyms, code, etc.
|
| 399 |
+
""")
|
| 400 |
+
|
| 401 |
clone_button.click(
|
| 402 |
fn=clone_speaker,
|
| 403 |
inputs=[upload_file, clone_speaker_name, cloned_speaker_names],
|
|
|
|
| 406 |
|
| 407 |
tts_button.click(
|
| 408 |
fn=tts,
|
| 409 |
+
inputs=[text, pronunc,speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature
|
| 410 |
,speed,top_p,top_k,AllFileList
|
| 411 |
],
|
| 412 |
outputs=[AudioList],
|