Spaces:

chenxie95
/

Cross-Lingual_F5-TTS_Space

Running on Zero

App Files Files Community

QingyuLiu1 committed on Oct 11

Commit

ad3a05c

1 Parent(s): fbabfd7

update2

Browse files

Files changed (5) hide show

app.py +16 -7
module_clf5.py +223 -0
requirements.txt +2 -1
utils_clf5_space.py +302 -0
utils_clf5space.py +0 -0

app.py CHANGED Viewed

@@ -4,18 +4,23 @@ import spaces
 import torch
 from cached_path import cached_path
 from f5_tts.infer.utils_infer import (
-    infer_process,
     load_model,
     load_vocoder,
     preprocess_ref_audio_text,
 )
 from f5_tts.model import DiT
 vocoder = load_vocoder()
 # Cross-Lingual F5-TTS configuration
 model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
 vocab_path = str(cached_path("hf://QingyuLiu1/Cross-Lingual_F5-TTS/vocab.txt"))  # Using the same vocab as base model
 # Load Cross-Lingual F5-TTS model
@@ -26,6 +31,11 @@ cross_lingual_model = load_model(
     vocab_file=vocab_path,
 )
 @spaces.GPU
 def infer(
@@ -35,7 +45,7 @@ def infer(
     show_info=gr.Info,
 ):
     # Fixed reference text
-    ref_text = "Hello World! I'm Qingyu Liu."
     if not ref_audio_orig or not gen_text.strip():
         gr.Warning("Please ensure [Reference Audio] and [Text to Generate] are both provided.")
@@ -47,11 +57,11 @@ def infer(
     torch.manual_seed(seed)
     used_seed = seed
-    ref_audio, ref_text_processed = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
-    final_wave, final_sample_rate, _ = infer_process(
         ref_audio,
-        ref_text_processed,
         gen_text,
         cross_lingual_model,
         vocoder,
@@ -66,7 +76,6 @@ with gr.Blocks() as app_basic_tts:
     with gr.Row():
         with gr.Column():
             ref_wav_input = gr.Audio(label="Reference Audio", type="filepath")
-            # Removed ref_txt_input - using fixed text instead
             gen_txt_input = gr.Textbox(label="Text to Generate")
             generate_btn = gr.Button("Synthesize", variant="primary")
             with gr.Row():

 import torch
 from cached_path import cached_path
 from f5_tts.infer.utils_infer import (
     load_model,
     load_vocoder,
     preprocess_ref_audio_text,
 )
 from f5_tts.model import DiT
+from utils_clf5_space import (
+    load_model_sp,
+    infer_process_clf5,
+)
 vocoder = load_vocoder()
 # Cross-Lingual F5-TTS configuration
 model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
+model_cfg_sp = dict(dim=512, depth=6, heads=8, ff_mult=4)
+mel_spec_kwargs = dict(target_sample_rate=24000, n_mel_channels=100, hop_length=256, win_length=1024, n_fft=1024, mel_spec_type="vocos")
 vocab_path = str(cached_path("hf://QingyuLiu1/Cross-Lingual_F5-TTS/vocab.txt"))  # Using the same vocab as base model
 # Load Cross-Lingual F5-TTS model
     vocab_file=vocab_path,
 )
+speakingrate_model = load_model_sp(
+    model_cfg_sp,
+    str(cached_path("hf://QingyuLiu1/Cross-Lingual_F5-TTS/syllables_gce_20000.safetensors")),
+    mel_spec_kwargs,
+)
 @spaces.GPU
 def infer(
     show_info=gr.Info,
 ):
     # Fixed reference text
+    ref_text = "Useless here."
     if not ref_audio_orig or not gen_text.strip():
         gr.Warning("Please ensure [Reference Audio] and [Text to Generate] are both provided.")
     torch.manual_seed(seed)
     used_seed = seed
+    ref_audio, _ = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
+    final_wave, final_sample_rate, _ = infer_process_clf5(
+        speakingrate_model,
         ref_audio,
         gen_text,
         cross_lingual_model,
         vocoder,
     with gr.Row():
         with gr.Column():
             ref_wav_input = gr.Audio(label="Reference Audio", type="filepath")
             gen_txt_input = gr.Textbox(label="Text to Generate")
             generate_btn = gr.Button("Synthesize", variant="primary")
             with gr.Row():

module_clf5.py ADDED Viewed

	@@ -0,0 +1,223 @@

+from __future__ import annotations
+import torch
+import torch.nn.functional as F
+from torch import nn
+from typing import Literal
+from f5_tts.model.modules import MelSpec
+from f5_tts.model.utils import (
+    default,
+    exists,
+    lens_to_mask,
+)
+from x_transformers.x_transformers import RotaryEmbedding
+from f5_tts.model.modules import (
+    ConvPositionEmbedding,
+    Attention,
+    AttnProcessor,
+    FeedForward
+)
+class SpeedPredictorLayer(nn.Module):
+    """One pre-norm transformer block: self-attention followed by a feed-forward net.
+
+    Each sublayer normalises its input with its own LayerNorm and adds the
+    sublayer output back onto the un-normalised input (residual connection).
+    """
+    def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1, qk_norm=None, pe_attn_head=None):
+        """Build the attention, LayerNorm and feed-forward submodules.
+
+        Args:
+            dim: feature width of the block input/output.
+            heads: number of attention heads.
+            dim_head: width of each attention head.
+            ff_mult: forwarded to ``FeedForward`` as ``mult``.
+            dropout: dropout value forwarded to both attention and feed-forward.
+            qk_norm: forwarded unchanged to ``Attention``.
+            pe_attn_head: forwarded unchanged to ``AttnProcessor``.
+        """
+        super().__init__()
+        self.attn = Attention(
+            processor=AttnProcessor(pe_attn_head=pe_attn_head),
+            dim=dim,
+            heads=heads,
+            dim_head=dim_head,
+            dropout=dropout,
+            qk_norm=qk_norm,
+        )
+        self.ln1 = nn.LayerNorm(dim, elementwise_affine=True, eps=1e-6)
+        self.ln2 = nn.LayerNorm(dim, elementwise_affine=True, eps=1e-6)
+        self.ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
+    def forward(self, x, mask=None, rope=None):  # x: input features; this block takes no time embedding
+        """Apply the attention and feed-forward sublayers and return the result.
+
+        ``mask`` and ``rope`` are passed straight through to the attention module.
+        """
+        # Multi-head attention sublayer (pre-norm + residual)
+        x_norm_atte = self.ln1(x)
+        attn_output = self.attn(x=x_norm_atte, mask=mask, rope=rope)
+        x = x + attn_output
+        # Feed-forward sublayer (pre-norm + residual)
+        x_norm_ffn = self.ln2(x)
+        ffn_output = self.ff(x=x_norm_ffn)
+        output = x + ffn_output
+        return output
+class GaussianCrossEntropyLoss(nn.Module):
+    """Cross-entropy against a Gaussian-shaped soft target centred on the true class.
+
+    For a ground-truth class index ``gt`` the (un-normalised) target weight of
+    class ``c`` is ``exp(-(c - gt)^2 / (2 * sigma_factor^2))``, so predictions
+    that land near the true class are penalised less than distant ones.
+    """
+
+    def __init__(self, num_classes, sigma_factor=2.0):
+        super().__init__()
+        self.num_classes = num_classes
+        self.sigma_factor = sigma_factor
+
+    def forward(self, y_pred, y_true, device=None):
+        """Return the mean soft-target cross-entropy over the batch.
+
+        Args:
+            y_pred: logits of shape ``[b, num_classes]``.
+            y_true: ground-truth class indices of shape ``[b]``.
+            device: device on which the soft targets are built. Optional;
+                defaults to ``y_pred.device`` so callers no longer have to
+                pass a value that might disagree with the tensors.
+
+        Returns:
+            A scalar tensor holding the loss.
+        """
+        if device is None:
+            device = y_pred.device
+        # Class positions 0..num_classes-1, broadcast against the centres below.
+        positions = torch.arange(self.num_classes, device=device).float()  # [num_classes]
+        centers = y_true.to(device).unsqueeze(-1)  # [b, 1]
+        diff = positions - centers  # [b, num_classes]
+        # Gaussian soft labels; sigma is the same for every sample.
+        two_sigma_sq = 2 * self.sigma_factor ** 2
+        y_true_soft = torch.exp(-(diff.pow(2) / two_sigma_sq))  # [b, num_classes]
+        loss = -(y_true_soft * F.log_softmax(y_pred, dim=-1)).sum(dim=-1).mean()
+        return loss
+class SpeedTransformer(nn.Module):
+    """Transformer encoder that classifies a mel sequence into a speed class.
+
+    Pipeline: linear projection of the mel features, convolutional position
+    embedding, ``depth`` pre-norm transformer blocks with rotary embeddings,
+    masked attention pooling over time, then an MLP classifier head.
+    """
+    def __init__(
+        self,
+        dim,
+        depth=6,
+        heads=8,
+        dropout=0.1,
+        ff_mult=4,
+        qk_norm=None,
+        pe_attn_head=None,
+        mel_dim=100,
+        num_classes=32,
+    ):
+        """Create the submodules.
+
+        Args:
+            dim: model width; the per-head width is ``dim // heads``.
+            depth: number of ``SpeedPredictorLayer`` blocks.
+            heads: number of attention heads per block.
+            dropout: dropout value forwarded to every block.
+            ff_mult: feed-forward multiplier forwarded to every block.
+            qk_norm: forwarded unchanged to every block.
+            pe_attn_head: forwarded unchanged to every block.
+            mel_dim: size of the last dimension of the input features.
+            num_classes: number of output logits.
+        """
+        super().__init__()
+        self.dim_head = dim // heads
+        self.num_classes = num_classes
+        self.mel_proj = nn.Linear(mel_dim, dim)
+        self.conv_layer = ConvPositionEmbedding(dim=dim)
+        self.rotary_embed = RotaryEmbedding(self.dim_head)
+        self.transformer_blocks = nn.ModuleList([
+                SpeedPredictorLayer(
+                    dim=dim,
+                    heads=heads,
+                    dim_head = self.dim_head,
+                    ff_mult=ff_mult,
+                    dropout=dropout,
+                    qk_norm=qk_norm,
+                    pe_attn_head=pe_attn_head
+                ) for _ in range(depth)
+            ])
+        # Scores each time step with a single scalar; used for attention pooling in forward().
+        self.pool = nn.Sequential(
+            nn.Linear(dim, dim),
+            nn.Tanh(),
+            nn.Linear(dim, 1)
+        )
+        # Maps the pooled vector to num_classes logits.
+        self.classifier = nn.Sequential(
+            nn.LayerNorm(dim),
+            nn.Linear(dim, dim),
+            nn.GELU(),  # nn.ReLU()
+            nn.Linear(dim, num_classes)
+        )
+        # self.initialize_weights()
+    # def initialize_weights(self):
+    def forward(self, x, lens):                     # x.shape = [b, seq_len, d_mel]
+        """Return class logits of shape ``[b, num_classes]``.
+
+        ``lens`` holds the valid length of each sequence; it is turned into a
+        boolean mask so padded frames are ignored by the pooling step.
+        """
+        seq_len = x.shape[1]
+        mask = lens_to_mask(lens, length=seq_len)   # shape = [b, seq_len]
+        x = self.mel_proj(x)                        # shape = [b, seq_len, h]
+        x = self.conv_layer(x, mask)                # shape = [b, seq_len, h]
+        rope = self.rotary_embed.forward_from_seq_len(seq_len)
+        for block in self.transformer_blocks:
+            x = block(x, mask=mask, rope=rope)      # shape = [b, seq_len, h]
+        # sequence pooling
+        weights = self.pool(x)                      # shape = [b, seq_len, 1]
+        # Fill padded positions with the most negative finite value (not -inf)
+        # so they receive ~0 weight after the softmax.
+        weights.masked_fill_(~mask.unsqueeze(-1), -torch.finfo(weights.dtype).max)
+        weights = F.softmax(weights, dim=1)         # shape = [b, seq_len, 1]
+        x = (x * weights).sum(dim=1)                # shape = [b, h]
+        output = self.classifier(x)                 # shape: [b, num_classes]
+        return output
+class SpeedMapper:
+    """Map speed-class indices to speed values.
+
+    Class ``k`` (0-based) corresponds to the speed ``(k + 1) * delta``, so the
+    table runs from ``delta`` up to ``num_classes * delta`` inclusive.
+    """
+
+    def __init__(
+        self,
+        num_classes: int,
+        delta: float = 0.25
+    ):
+        """Build the lookup table.
+
+        Args:
+            num_classes: number of speed classes (table length).
+            delta: spacing between consecutive speed values; also the value of
+                class 0.
+
+        Raises:
+            ValueError: if ``num_classes`` is not positive or ``delta`` is not
+                positive.
+        """
+        if num_classes < 1:
+            raise ValueError(f"num_classes must be >= 1, got {num_classes}")
+        if delta <= 0:
+            raise ValueError(f"delta must be > 0, got {delta}")
+        self.num_classes = num_classes
+        self.delta = delta
+        self.max_speed = float(num_classes) * delta
+        # Build from integer indices instead of a float-step arange: the
+        # length is then exactly num_classes for every delta, and the first
+        # entry follows delta rather than a hard-coded 0.25.
+        indices = torch.arange(1, num_classes + 1, dtype=torch.get_default_dtype())
+        self.speed_values = indices * delta
+
+    def label_to_speed(self, label: torch.Tensor) -> torch.Tensor:
+        """Return the speed value for each class index in ``label``."""
+        return self.speed_values.to(label.device)[label]  # (label + 1) * delta
+class SpeedPredictor(nn.Module):
+    """Speed classifier: mel front-end + ``SpeedTransformer`` + Gaussian CE loss.
+
+    ``forward`` returns the training loss; ``predict_speed`` returns the speed
+    value of the most likely class.
+    """
+
+    def __init__(
+        self,
+        speed_type: Literal["phonemes", "syllables", "words"] = "phonemes",
+        mel_spec_kwargs: dict | None = None,
+        arch_kwargs: dict | None = None,
+        sigma_factor: int = 2,
+        mel_spec_module: nn.Module | None = None,
+        num_channels: int = 100,
+    ):
+        """Create the predictor.
+
+        Args:
+            speed_type: unit the speed is measured in; selects the number of
+                classes (72 for phonemes, 32 for syllables and words).
+            mel_spec_kwargs: keyword arguments for ``MelSpec``; only used when
+                ``mel_spec_module`` is not given. ``None`` means no arguments.
+            arch_kwargs: keyword arguments for ``SpeedTransformer``. Required.
+            sigma_factor: width of the Gaussian soft labels in the loss.
+            mel_spec_module: ready-made mel front-end to use instead of
+                building a ``MelSpec``.
+            num_channels: expected number of mel channels; falls back to the
+                front-end's ``n_mel_channels`` when ``None``.
+
+        Raises:
+            TypeError: if ``arch_kwargs`` is not provided.
+            KeyError: if ``speed_type`` is not one of the supported names.
+        """
+        super().__init__()
+        if arch_kwargs is None:
+            raise TypeError("arch_kwargs is required to build the SpeedTransformer")
+        num_classes_map = {
+            "phonemes": 72,
+            "syllables": 32,
+            "words": 32
+        }
+        self.num_classes = num_classes_map[speed_type]
+        # mel spec: only construct a MelSpec when no module was supplied.
+        if mel_spec_module is not None:
+            self.mel_spec = mel_spec_module
+        else:
+            self.mel_spec = MelSpec(**(mel_spec_kwargs or {}))
+        num_channels = default(num_channels, self.mel_spec.n_mel_channels)
+        self.num_channels = num_channels
+        self.speed_transformer = SpeedTransformer(**arch_kwargs, num_classes=self.num_classes)
+        self.gce = GaussianCrossEntropyLoss(num_classes=self.num_classes, sigma_factor=sigma_factor)
+        self.speed_mapper = SpeedMapper(self.num_classes)
+
+    @property
+    def device(self):
+        """Device of the first model parameter."""
+        return next(self.parameters()).device
+
+    @staticmethod
+    def _full_lens(mel: torch.Tensor) -> torch.Tensor:
+        """Return per-sample lengths that mark every frame of ``mel`` as valid."""
+        batch, seq_len = mel.shape[:2]
+        return torch.full((batch,), seq_len, device=mel.device, dtype=torch.long)
+
+    @torch.no_grad()
+    def predict_speed(self, audio: torch.Tensor, lens: torch.Tensor | None = None):
+        """Predict the speed value for each item in ``audio``.
+
+        A 2-D ``audio`` is treated as raw waveform and converted with the mel
+        front-end; otherwise it is used as-is. When ``lens`` is omitted every
+        frame is treated as valid.
+        """
+        # raw wave
+        if audio.ndim == 2:
+            audio = self.mel_spec(audio).permute(0, 2, 1)
+        if not exists(lens):
+            lens = self._full_lens(audio)
+        logits = self.speed_transformer(audio, lens)
+        probs = F.softmax(logits, dim=-1)
+        pred_class = torch.argmax(probs, dim=-1)
+        pred_speed = self.speed_mapper.label_to_speed(pred_class)
+        return pred_speed
+
+    def forward(
+        self,
+        inp: float["b n d"] | float["b nw"],  # mel or raw wave  # noqa: F722
+        speed: float["b"],      # speed groundtruth
+        lens: int["b"] | None = None,  # noqa: F821
+    ):
+        """Return the Gaussian cross-entropy loss for a batch.
+
+        A 2-D ``inp`` is treated as raw waveform and converted with the mel
+        front-end. When ``lens`` is omitted every frame is treated as valid,
+        matching ``predict_speed``.
+        """
+        if inp.ndim == 2:
+            inp = self.mel_spec(inp)
+            inp = inp.permute(0, 2, 1)
+            assert inp.shape[-1] == self.num_channels
+        if not exists(lens):
+            # The signature allows lens=None; build full-length lens instead
+            # of handing None to the transformer's mask construction.
+            lens = self._full_lens(inp)
+        device = self.device
+        pred = self.speed_transformer(inp, lens)
+        loss = self.gce(pred, speed, device)
+        return loss

requirements.txt CHANGED Viewed

	@@ -1 +1,2 @@
1	- f5-tts


1	+ f5-tts
2	+ pyphen

utils_clf5_space.py ADDED Viewed

	@@ -0,0 +1,302 @@

+import torch
+import torchaudio
+from f5_tts.infer.utils_infer import (
+    load_checkpoint,
+    chunk_text,
+    convert_char_to_pinyin,
+)
+from module_clf5 import SpeedPredictor
+import tqdm
+from concurrent.futures import ThreadPoolExecutor
+import numpy as np
+import pyphen
+import re
+# One token is either an English word (letters and apostrophes) or a single
+# CJK unified ideograph. Compiled once instead of on every call.
+_SYLLABLE_TOKEN_RE = re.compile(r"[a-zA-Z']+|[\u4e00-\u9fff]")
+# English hyphenation dictionary, created on first use and then reused.
+_EN_HYPHENATOR = None
+
+
+def _get_en_hyphenator():
+    """Return the shared en_US pyphen dictionary, building it on first call."""
+    global _EN_HYPHENATOR
+    if _EN_HYPHENATOR is None:
+        _EN_HYPHENATOR = pyphen.Pyphen(lang='en_US')
+    return _EN_HYPHENATOR
+
+
+def _count_syllables(text):
+    """Count syllables in mixed English/Chinese text.
+
+    Each Chinese character counts as one syllable; each English word is split
+    with pyphen and contributes one syllable per hyphenation segment.
+    """
+    dic = _get_en_hyphenator()
+    total_syllables = 0
+    for token in _SYLLABLE_TOKEN_RE.findall(text):
+        if '\u4e00' <= token <= '\u9fff':
+            # A single Chinese character is one syllable.
+            total_syllables += 1
+            continue
+        if not token.strip("'"):
+            # Stray quotes/apostrophes contain no letters: no syllables.
+            continue
+        try:
+            total_syllables += len(dic.inserted(token.lower()).split("-"))
+        except Exception:
+            # Deliberate best effort: if hyphenation fails, assume one syllable.
+            total_syllables += 1
+    return total_syllables
+
+
+def count(text, speed_type="syllables"):
+    """Count the speech units of ``text`` for the given ``speed_type``.
+
+    Args:
+        text: input text (English words and/or Chinese characters).
+        speed_type: unit to count; only ``"syllables"`` is supported.
+
+    Returns:
+        The number of units found in ``text``.
+
+    Raises:
+        ValueError: if ``speed_type`` is not supported.
+    """
+    count_functions = {
+        "syllables": _count_syllables,
+    }
+    if speed_type not in count_functions:
+        raise ValueError(f"Unknown speed_type: {speed_type}")
+    return count_functions[speed_type](text)
+# Pick the best available accelerator, falling back to CPU.
+# `torch.xpu` is looked up defensively because not every PyTorch build
+# exposes that backend module; accessing it unguarded would raise
+# AttributeError at import time.
+device = (
+    "cuda"
+    if torch.cuda.is_available()
+    else "xpu"
+    if hasattr(torch, "xpu") and torch.xpu.is_available()
+    else "mps"
+    if torch.backends.mps.is_available()
+    else "cpu"
+)
+# -----------------------------------------
+# Module-level defaults. Several of them are used as default argument values
+# of infer_process_clf5 below.
+target_sample_rate = 24000  # reference audio is resampled to this rate before inference
+n_mel_channels = 100  # not referenced in the visible part of this module
+hop_length = 256  # divides sample counts to obtain the lengths passed to the model
+win_length = 1024  # not referenced in the visible part of this module
+n_fft = 1024  # not referenced in the visible part of this module
+mel_spec_type = "vocos"  # selects the vocoder call style: "vocos" or "bigvgan"
+target_rms = 0.1  # quieter reference audio is scaled up to this RMS level
+cross_fade_duration = 0.15  # multiplied by the sample rate to get the cross-fade overlap between chunks
+ode_method = "euler"  # not referenced in the visible part of this module
+nfe_step = 32  # 16, 32; passed to model_obj.sample as `steps`
+cfg_strength = 2.0  # passed through to model_obj.sample
+sway_sampling_coef = -1.0  # passed through to model_obj.sample
+speed = 1.0  # NOTE(review): accepted by the inference functions but not used in the duration formula
+fix_duration = None  # when set, fixes the total duration instead of predicting it
+# -----------------------------------------
+def infer_process_clf5(
+    speakingrate_model,
+    ref_audio,
+    gen_text,
+    model_obj,
+    vocoder,
+    mel_spec_type=mel_spec_type,
+    show_info=print,
+    progress=tqdm,
+    target_rms=target_rms,
+    cross_fade_duration=cross_fade_duration,
+    nfe_step=nfe_step,
+    cfg_strength=cfg_strength,
+    sway_sampling_coef=sway_sampling_coef,
+    speed=speed,
+    fix_duration=fix_duration,
+    device=device,
+):
+    """Load the reference audio, chunk the text and synthesise all chunks.
+
+    ``ref_audio`` is handed to ``torchaudio.load``; ``gen_text`` is split with
+    ``chunk_text``. Everything else is forwarded to
+    ``infer_batch_process_clf5``.
+
+    Returns:
+        The first item produced by ``infer_batch_process_clf5`` (called with
+        its default, non-streaming mode): a
+        ``(final_wave, sample_rate, combined_spectrogram)`` tuple.
+    """
+    waveform, sample_rate = torchaudio.load(ref_audio)
+
+    # Split the input text into batches and log each one.
+    gen_text_batches = chunk_text(gen_text)
+    for index, text_chunk in enumerate(gen_text_batches):
+        print(f"gen_text {index}", text_chunk)
+    print("\n")
+
+    show_info(f"Generating audio in {len(gen_text_batches)} batches...")
+
+    batch_results = infer_batch_process_clf5(
+        (waveform, sample_rate),
+        speakingrate_model,
+        gen_text_batches,
+        model_obj,
+        vocoder,
+        mel_spec_type=mel_spec_type,
+        progress=progress,
+        target_rms=target_rms,
+        cross_fade_duration=cross_fade_duration,
+        nfe_step=nfe_step,
+        cfg_strength=cfg_strength,
+        sway_sampling_coef=sway_sampling_coef,
+        speed=speed,
+        fix_duration=fix_duration,
+        device=device,
+    )
+    return next(batch_results)
+def _cross_fade_concat(waves, cross_fade_duration, sample_rate):
+    """Join 1-D waveforms, linearly cross-fading every seam.
+
+    With ``cross_fade_duration <= 0`` the waves are simply concatenated. The
+    overlap never exceeds the length of either neighbouring wave.
+    """
+    if cross_fade_duration <= 0:
+        return np.concatenate(waves)
+    max_overlap = int(cross_fade_duration * sample_rate)
+    combined = waves[0]
+    for next_wave in waves[1:]:
+        overlap = min(max_overlap, len(combined), len(next_wave))
+        if overlap <= 0:
+            # No overlap possible, concatenate
+            combined = np.concatenate([combined, next_wave])
+            continue
+        fade_out = np.linspace(1, 0, overlap)
+        fade_in = np.linspace(0, 1, overlap)
+        blended = combined[-overlap:] * fade_out + next_wave[:overlap] * fade_in
+        combined = np.concatenate([combined[:-overlap], blended, next_wave[overlap:]])
+    return combined
+
+
+def infer_batch_process_clf5(
+    ref_audio,
+    speakingrate_model,
+    gen_text_batches,
+    model_obj,
+    vocoder,
+    mel_spec_type="vocos",
+    progress=tqdm,
+    target_rms=0.1,
+    cross_fade_duration=0.15,
+    nfe_step=32,
+    cfg_strength=2.0,
+    sway_sampling_coef=-1,
+    speed=1,
+    fix_duration=None,
+    device=None,
+    streaming=False,
+    chunk_size=2048,
+):
+    """Synthesise every text chunk, conditioned on the reference audio.
+
+    The reference audio is mixed down to one channel, boosted to
+    ``target_rms`` when quieter, resampled to ``target_sample_rate`` and fed
+    to ``speakingrate_model.predict_speed``. Unless ``fix_duration`` is set,
+    each chunk's generated length is derived from its unit count (``count``)
+    divided by the predicted speed.
+
+    Args:
+        ref_audio: ``(waveform, sample_rate)`` pair.
+        speakingrate_model: object exposing ``predict_speed(audio=...)``.
+        gen_text_batches: iterable of text chunks to synthesise.
+        model_obj: model exposing ``sample(...)``.
+        vocoder: vocoder matching ``mel_spec_type``.
+        mel_spec_type: ``"vocos"`` or ``"bigvgan"``.
+        progress: module/object exposing ``tqdm``; ``None`` disables it.
+        target_rms: RMS level quiet reference audio is boosted to.
+        cross_fade_duration: cross-fade length between chunks, converted to
+            samples with ``target_sample_rate``.
+        nfe_step, cfg_strength, sway_sampling_coef: forwarded to
+            ``model_obj.sample``.
+        speed: kept for signature compatibility; not used when computing the
+            duration.
+        fix_duration: when given, determines the total duration instead of
+            the predicted one.
+        device: device the reference audio is moved to.
+        streaming: yield fixed-size waveform chunks instead of one result.
+        chunk_size: number of samples per streamed chunk.
+
+    Yields:
+        Streaming: ``(wave_chunk, target_sample_rate)`` tuples.
+        Otherwise: one ``(final_wave, target_sample_rate, combined_spectrogram)``
+        tuple, or ``(None, target_sample_rate, None)`` when nothing was
+        generated.
+
+    Raises:
+        ValueError: if ``mel_spec_type`` is not supported.
+    """
+    if mel_spec_type not in ("vocos", "bigvgan"):
+        # Fail before any sampling work instead of with an UnboundLocalError later.
+        raise ValueError(f"Unsupported mel_spec_type: {mel_spec_type}")
+
+    audio, sr = ref_audio
+    if audio.shape[0] > 1:
+        audio = torch.mean(audio, dim=0, keepdim=True)
+
+    rms = torch.sqrt(torch.mean(torch.square(audio)))
+    # Only boost when the reference is quiet but not silent: rms == 0 would
+    # divide by zero and turn the whole reference into NaNs.
+    boosted = bool(0 < rms < target_rms)
+    if boosted:
+        audio = audio * target_rms / rms
+    if sr != target_sample_rate:
+        resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
+        audio = resampler(audio)
+    audio = audio.to(device)
+
+    pred_speed = speakingrate_model.predict_speed(audio=audio)
+    ref_audio_len = audio.shape[-1] // hop_length
+
+    def synthesize(gen_text):
+        """Generate one chunk; return (waveform as numpy, mel tensor)."""
+        final_text_list = convert_char_to_pinyin([gen_text])
+
+        if fix_duration is not None:
+            duration = int(fix_duration * target_sample_rate / hop_length)
+        else:
+            # Predicted duration = unit count / predicted speed, floored at 1.
+            gt_num_unit = count(gen_text)
+            pred_duration = max(gt_num_unit / pred_speed.item(), 1)
+            gen_audio_len = int((pred_duration * target_sample_rate) / hop_length)
+            duration = ref_audio_len + gen_audio_len
+
+        # Keep inference mode scoped to the model calls; results are yielded
+        # outside of it so the caller never runs with inference mode left on.
+        with torch.inference_mode():
+            generated, _ = model_obj.sample(
+                cond=audio,
+                text=final_text_list,
+                duration=duration,
+                steps=nfe_step,
+                cfg_strength=cfg_strength,
+                sway_sampling_coef=sway_sampling_coef,
+            )
+            generated = generated.to(torch.float32)
+            # Drop the part that corresponds to the reference audio.
+            generated = generated[:, ref_audio_len:, :]
+            generated_mel_spec = generated.permute(0, 2, 1)
+            if mel_spec_type == "vocos":
+                generated_wave = vocoder.decode(generated_mel_spec)
+            else:  # "bigvgan"
+                generated_wave = vocoder(generated_mel_spec)
+            if boosted:
+                # Undo the reference boost so the output level matches the input.
+                generated_wave = generated_wave * rms / target_rms
+            # wav -> numpy
+            generated_wave = generated_wave.squeeze().cpu().numpy()
+        return generated_wave, generated_mel_spec
+
+    batches = progress.tqdm(gen_text_batches) if progress is not None else gen_text_batches
+
+    if streaming:
+        for gen_text in batches:
+            generated_wave, _ = synthesize(gen_text)
+            for j in range(0, len(generated_wave), chunk_size):
+                yield generated_wave[j : j + chunk_size], target_sample_rate
+        return
+
+    # Chunks are generated one after another. The previous thread pool only
+    # created generator objects; the actual work already ran sequentially in
+    # the calling thread.
+    generated_waves = []
+    spectrograms = []
+    for gen_text in batches:
+        generated_wave, generated_mel_spec = synthesize(gen_text)
+        generated_waves.append(generated_wave)
+        spectrograms.append(generated_mel_spec[0].cpu().numpy())
+
+    if not generated_waves:
+        yield None, target_sample_rate, None
+        return
+
+    final_wave = _cross_fade_concat(generated_waves, cross_fade_duration, target_sample_rate)
+    # Create a combined spectrogram
+    combined_spectrogram = np.concatenate(spectrograms, axis=1)
+    yield final_wave, target_sample_rate, combined_spectrogram
+def load_model_sp(
+    model_cfg,
+    ckpt_path,
+    mel_spec_kwargs,
+    speed_type="syllables",
+    use_ema=True,
+    device=device,
+):
+    """Build a ``SpeedPredictor`` and load its weights from ``ckpt_path``.
+
+    Args:
+        model_cfg: architecture keyword arguments for the predictor.
+        ckpt_path: checkpoint file handed to ``load_checkpoint``.
+        mel_spec_kwargs: keyword arguments for the predictor's mel front-end.
+        speed_type: speed unit the predictor was trained on.
+        use_ema: forwarded to ``load_checkpoint``.
+        device: device the model is moved to and loaded on.
+
+    Returns:
+        Whatever ``load_checkpoint`` returns for the float32 model.
+    """
+    print("model : ", ckpt_path, "\n")
+
+    predictor = SpeedPredictor(
+        speed_type=speed_type,
+        mel_spec_kwargs=mel_spec_kwargs,
+        arch_kwargs=model_cfg,
+    )
+    predictor = predictor.to(device)
+
+    return load_checkpoint(
+        predictor,
+        ckpt_path,
+        device,
+        dtype=torch.float32,
+        use_ema=use_ema,
+    )

utils_clf5space.py DELETED Viewed

File without changes