txya900619 committed on
Commit ec8f857 · 1 Parent(s): 4ab241b

feat: upload new app.py

Files changed (3)
  1. app.py +35 -16
  2. patch/e2_tts_pytorch.py +155 -0
  3. requirements.txt +2 -1
app.py CHANGED
@@ -1,15 +1,17 @@
 import os
 
 import gradio as gr
+import librosa
 import spaces
 import torch
-from e2_tts_pytorch import E2TTS, DurationPredictor
+from e2_tts_pytorch import DurationPredictor
 from huggingface_hub import snapshot_download
 from omegaconf import OmegaConf
 from tokenizers import Tokenizer
 from transformers import PreTrainedTokenizerFast
 
 from ipa.ipa import get_ipa, parse_ipa
+from patch.e2_tts_pytorch import E2TTSPatched as E2TTS
 
 
 def load_model(model_id):
@@ -67,24 +69,32 @@ models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml"))
 
 
 @spaces.GPU
-def _do_tts(model_id, ipa, ref_wav, ref_transcript):
-    model = models_config[model_id]["model"].cuda()
-    generated = model.sample(
-        cond=torch.from_numpy(ref_wav).float().unsqueeze(0).cuda(),
-        text=[ref_transcript + ipa],
-        steps=32,
-        cfg_strength=1.0,
-    )[0]
-    return generated.cpu().numpy()
+def _do_tts(model_id, ipa, ref_wav, ref_transcript, speed):
+    with torch.inference_mode():
+        model = models_config[model_id]["model"].cuda()
+        ref_wav = librosa.load(ref_wav, sr=model.sampling_rate)[0]
+        print(ref_transcript + ipa)
+        text = model.tokenizer([ref_transcript + ipa]).to(model.device)
+
+        generated = model.sample(
+            cond=torch.from_numpy(ref_wav).float().unsqueeze(0).cuda(),
+            text=text,
+            steps=32,
+            cfg_strength=1.0,
+            speed=speed,
+        )[0]
+        return generated.cpu().numpy()
 
 
 def text_to_speech(
     model_id: str,
+    use_default_or_custom: str,
+    speaker_name: str,
+    dialect: str,
+    speed: float,
     text: str,
     ref_wav: str,
     ref_transcript: str,
-    dialect: str,
-    # speed: float,
 ):
     if len(text) == 0:
         raise gr.Error("請勿輸入空字串。")
@@ -96,13 +106,13 @@ def text_to_speech(
     parsed_ipa = parse_ipa(ipa)
     if dialect == "nansixian":
        dialect = "sixian"
-    models_config[model_id]["model"].tts_model.length_scale = speed
 
     wav = _do_tts(
         model_id,
         parsed_ipa,
         ref_wav,
         ref_transcript,
+        speed,
     )
 
     return (
@@ -180,12 +190,20 @@ with demo:
         ref_wav = gr.Audio(
             visible=False,
             type="filepath",
+            value=list(models_config[default_model_id]["speaker_mapping"].values())[0][
+                "ref_wav"
+            ],
             waveform_options=gr.WaveformOptions(
                 show_controls=False,
                 sample_rate=24000,
             ),
         )
-        ref_transcript = gr.Textbox(visible=False)
+        ref_transcript = gr.Textbox(
+            value=list(models_config[default_model_id]["speaker_mapping"].values())[0][
+                "ref_transcript"
+            ],
+            visible=False,
+        )
 
         speaker_wav = gr.Audio(
             label="客製化語音",
@@ -259,12 +277,13 @@ with demo:
         text_to_speech,
         inputs=[
             model_drop_down,
-            input_text,
             use_default_or_custom_radio,
-            speaker_wav,
             speaker_drop_down,
             dialect_radio,
             speed,
+            input_text,
+            ref_wav,
+            ref_transcript,
         ],
         outputs=[
             gr.Textbox(interactive=False, label="斷詞"),
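
For readability, the new inference path introduced in this commit can be read in one piece roughly as follows. This is a minimal sketch, not code from the commit: the helper name synthesize is hypothetical, and the model handle is assumed to be the one app.py builds from configs/models.yaml via load_model.

import librosa
import torch

def synthesize(model, ref_wav_path, ref_transcript, ipa, speed=1.0):
    with torch.inference_mode():
        # Load the reference audio at the model's native sampling rate
        # (this is why librosa was added to requirements.txt).
        ref_wav, _ = librosa.load(ref_wav_path, sr=model.sampling_rate)
        # Tokenize the reference transcript concatenated with the target IPA.
        text = model.tokenizer([ref_transcript + ipa]).to(model.device)
        # E2TTSPatched.sample accepts a `speed` factor (see patch/e2_tts_pytorch.py).
        generated = model.sample(
            cond=torch.from_numpy(ref_wav).float().unsqueeze(0).to(model.device),
            text=text,
            steps=32,
            cfg_strength=1.0,
            speed=speed,
        )[0]
        return generated.cpu().numpy()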
patch/e2_tts_pytorch.py ADDED
@@ -0,0 +1,155 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Callable
+
+import torch
+import torch.nn.functional as F
+import torchaudio
+from e2_tts_pytorch import E2TTS
+from e2_tts_pytorch.e2_tts import Float, Int, exists, lens_to_mask
+from einops import rearrange
+from torchdiffeq import odeint
+
+
+class E2TTSPatched(E2TTS):
+    @torch.no_grad()
+    def sample(
+        self,
+        cond: Float["b n d"] | Float["b nw"],
+        *,
+        text: Int["b nt"] | list[str] | None = None,
+        lens: Int["b"] | None = None,
+        duration: int | Int["b"] | None = None,
+        steps=32,
+        cfg_strength=1.0,  # they used a classifier free guidance strength of 1.
+        max_duration=4096,  # in case the duration predictor goes haywire
+        vocoder: Callable[[Float["b d n"]], list[Float["_"]]] | None = None,
+        return_raw_output: bool | None = None,
+        save_to_filename: str | None = None,
+        speed: float = 1.0,
+    ) -> (Float["b n d"], list[Float["_"]]):
+        self.eval()
+
+        # raw wave
+
+        if cond.ndim == 2:
+            cond = self.mel_spec(cond)
+            cond = rearrange(cond, "b d n -> b n d")
+            assert cond.shape[-1] == self.num_channels
+
+        batch, cond_seq_len, device = *cond.shape[:2], cond.device
+
+        if not exists(lens):
+            lens = torch.full((batch,), cond_seq_len, device=device, dtype=torch.long)
+
+        # text
+
+        if isinstance(text, list):
+            text = self.tokenizer(text).to(device)
+            assert text.shape[0] == batch
+
+        if exists(text):
+            text_lens = (text != -1).sum(dim=-1)
+            lens = torch.maximum(
+                text_lens, lens
+            )  # make sure lengths are at least those of the text characters
+
+        # duration
+
+        cond_mask = lens_to_mask(lens)
+
+        if exists(duration):
+            if isinstance(duration, int):
+                duration = torch.full(
+                    (batch,), duration, device=device, dtype=torch.long
+                )
+
+        elif exists(self.duration_predictor):
+            duration = (
+                self.duration_predictor(cond, text=text, lens=lens, return_loss=False)
+                * speed
+            ).long()
+
+        duration = torch.maximum(
+            lens + 1, duration
+        )  # just add one token so something is generated
+        duration = duration.clamp(max=max_duration)
+
+        assert duration.shape[0] == batch
+
+        max_duration = duration.amax()
+
+        cond = F.pad(cond, (0, 0, 0, max_duration - cond_seq_len), value=0.0)
+        cond_mask = F.pad(
+            cond_mask, (0, max_duration - cond_mask.shape[-1]), value=False
+        )
+        cond_mask = rearrange(cond_mask, "... -> ... 1")
+
+        mask = lens_to_mask(duration)
+
+        # neural ode
+
+        def fn(t, x):
+            # at each step, conditioning is fixed
+
+            step_cond = torch.where(cond_mask, cond, torch.zeros_like(cond))
+
+            # predict flow
+
+            return self.cfg_transformer_with_pred_head(
+                x, step_cond, times=t, text=text, mask=mask, cfg_strength=cfg_strength
+            )
+
+        y0 = torch.randn_like(cond)
+        t = torch.linspace(0, 1, steps, device=self.device)
+
+        trajectory = odeint(fn, y0, t, **self.odeint_kwargs)
+        sampled = trajectory[-1]
+
+        out = sampled
+
+        out = torch.where(cond_mask, cond, out)
+
+        # able to return raw untransformed output, if not using mel rep
+
+        if exists(return_raw_output) and return_raw_output:
+            return out
+
+        # take care of transforming mel to audio if `vocoder` is passed in, or if `use_vocos` is turned on
+
+        if exists(vocoder):
+            assert not exists(
+                self.vocos
+            ), "`use_vocos` should not be turned on if you are passing in a custom `vocoder` on sampling"
+            out = rearrange(out, "b n d -> b d n")
+            out = vocoder(out)
+
+        elif exists(self.vocos):
+            audio = []
+            for mel, one_mask in zip(out, mask):
+                one_out = mel[one_mask]
+
+                one_out = rearrange(one_out, "n d -> 1 d n")
+                one_audio = self.vocos.decode(one_out)
+                one_audio = rearrange(one_audio, "1 nw -> nw")
+                audio.append(one_audio)
+
+            out = audio
+
+        if exists(save_to_filename):
+            assert exists(vocoder) or exists(self.vocos)
+            assert exists(self.sampling_rate)
+
+            path = Path(save_to_filename)
+            parent_path = path.parents[0]
+            parent_path.mkdir(exist_ok=True, parents=True)
+
+            for ind, one_audio in enumerate(out):
+                one_audio = rearrange(one_audio, "nw -> 1 nw")
+                save_path = str(parent_path / f"{ind + 1}.{path.name}")
+                torchaudio.save(
+                    save_path, one_audio.detach().cpu(), sample_rate=self.sampling_rate
+                )
+
+        return out
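
The main functional change relative to the upstream E2TTS.sample appears to be the new speed argument: when no explicit duration is passed, the duration predictor's output is multiplied by speed before the usual clamping. A minimal sketch of just that scaling step, with purely illustrative tensor values (not taken from the repo):

import torch

lens = torch.tensor([120])         # frames in the conditioning (reference) mel
predicted = torch.tensor([300.0])  # hypothetical duration predictor output
speed = 1.5                        # >1 stretches the generated clip, <1 shortens it

duration = (predicted * speed).long()
duration = torch.maximum(lens + 1, duration)  # always generate at least one new frame
duration = duration.clamp(max=4096)           # max_duration guard from sample()
print(duration)                               # tensor([450])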
requirements.txt CHANGED
@@ -3,4 +3,5 @@ opencc
 omegaconf
 e2_tts_pytorch
 transformers
-matplotlib
+matplotlib
+librosa