Flux9665 committed
Commit 52c413f
1 Parent(s): 464096d

Update app.py

Files changed (1)
  1. app.py  +56 -23
app.py CHANGED
@@ -2,8 +2,10 @@ import os
 
 import gradio as gr
 import numpy as np
+import soundfile
 import soundfile as sf
 import torch
+from tqdm import tqdm
 
 os.system("git clone --branch v2.3 https://github.com/DigitalPhonetics/IMS-Toucan.git toucan_codebase")
 os.system("mv toucan_codebase/* .")
@@ -17,6 +19,7 @@ from Preprocessing.AudioPreprocessor import AudioPreprocessor
 from TrainingInterfaces.Text_to_Spectrogram.AutoAligner.Aligner import Aligner
 from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.DurationCalculator import DurationCalculator
 from InferenceInterfaces.UtteranceCloner import UtteranceCloner
+from Preprocessing.articulatory_features import get_feature_to_index_lookup
 
 
 def float2pcm(sig, dtype='int16'):
@@ -39,6 +42,7 @@ class TTS_Interface:
 
     def __init__(self):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
         self.utterance_cloner = UtteranceCloner(model_id="Meta", device=self.device)
         self.speaker_path_lookup = {
             "Voice 1": "reference_audios/voice_1.flac",
@@ -50,25 +54,54 @@ class TTS_Interface:
         self.acoustic_model = self.acoustic_model.to(self.device)
         self.dc = DurationCalculator(reduction_factor=1)
         self.tf = ArticulatoryCombinedTextFrontend(language="en")
-
-    def read(self, prompt, speaker_1, speaker_2, speaker_3):
-        if prompt == "Hello, here is the first sentence. And here comes the second one. I think three sentences is enough to get the point across, right?":
-            reference_audio = "reference_audios/clone_me_1.wav"
-        elif prompt == "I am excited! And my prosody is rather flat. And this sentence is shocking!":
-            reference_audio = "reference_audios/clone_me_2.wav"
-        elif prompt == "Don't do it! But I want to! Then go ahead.":
-            reference_audio = "reference_audios/clone_me_3.wav"
-        elif prompt == "How many examples do I realistically need? How about five? That should do it!":
-            reference_audio = "reference_audios/clone_me_4.wav"
-        elif prompt == "Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better.":
-            reference_audio = "reference_audios/clone_me_5.wav"
-
+        example_audio, sr = soundfile.read("reference_audios/clone_me_5.wav")
+        self.ap = AudioPreprocessor(input_sr=sr, output_sr=16000)
+
+        ## finetune aligner
+        steps = 10
+        tokens = list()  # we need an ID sequence for training rather than a sequence of phonological features
+        for vector in self.tf.string_to_tensor(
+                "Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better."):
+            if vector[get_feature_to_index_lookup()["word-boundary"]] == 0:
+                # we don't include word boundaries when performing alignment, since they are not always present in audio.
+                for phone in self.tf.phone_to_vector:
+                    if vector.numpy().tolist()[13:] == self.tf.phone_to_vector[phone][13:]:
+                        # the first 12 dimensions are for modifiers, so we ignore those when trying to find the phoneme in the ID lookup
+                        tokens.append(self.tf.phone_to_id[phone])
+                        # this is terribly inefficient, but it's fine
+                        break
+        tokens = torch.LongTensor(tokens).squeeze().to(self.device)
+        tokens_len = torch.LongTensor([len(tokens)]).to(self.device)
+        mel = self.ap.audio_to_mel_spec_tensor(example_audio, normalize=True).transpose(0, 1).unsqueeze(0).to(self.device)
+        mel.requires_grad = True
+        mel_len = torch.LongTensor([len(mel[0])]).to(self.device)
+        # actual fine-tuning starts here
+        optim_asr = torch.optim.SGD(self.acoustic_model.parameters(), lr=0.1)
+        self.acoustic_model.train()
+        for _ in tqdm(list(range(steps))):
+            pred = self.acoustic_model(mel)
+            loss = self.acoustic_model.ctc_loss(pred.transpose(0, 1).log_softmax(2), tokens, mel_len, tokens_len)
+            optim_asr.zero_grad()
+            loss.backward()
+            torch.nn.utils.clip_grad_norm_(self.acoustic_model.parameters(), 1.0)
+            optim_asr.step()
+        self.acoustic_model.eval()
+        ## done finetuning
+
+        reference_audio = "reference_audios/clone_me_5.wav"
+        prompt = "Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better."
         text_list = prompt.replace(".", ".|").replace("?", "?|").replace("!", "!|").split("|")
         # we don't split on the punctuation marks because we want to retain them.
 
         self.split_audio(reference_audio, text_list)
         # at this point, split_1.wav, split_2.wav and split_3.wav should exist.
 
+    def read(self, _, speaker_1, speaker_2, speaker_3):
+        reference_audio = "reference_audios/clone_me_5.wav"
+        prompt = "Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better."
+        text_list = prompt.replace(".", ".|").replace("?", "?|").replace("!", "!|").split("|")
+        # we don't split on the punctuation marks because we want to retain them.
+
         self.utterance_cloner.tts.set_utterance_embedding(self.speaker_path_lookup[speaker_1])
         part_1 = self.utterance_cloner.clone_utterance(path_to_reference_audio="split_1.wav",
                                                        reference_transcription=text_list[0],
@@ -103,9 +136,12 @@ class TTS_Interface:
 
         # extract phonemes
         lines = list()
+        self.tf.use_word_boundaries = False  # this causes problems when splitting otherwise
         for segment in text_list:
             if segment.strip() != "":
                 lines.append(self.tf.string_to_tensor(segment, handle_missing=False).squeeze())
+        self.tf.use_word_boundaries = True
+
         # postprocess phonemes: [~ sentence ~ #] --> [sentence ~] except for the first one, which is [~ sentence ~]
         processed_lines = list()
         for index, line in enumerate(lines):
@@ -119,10 +155,10 @@
         # get durations of each phone in audio as average of an ensemble
         alignment_paths = list()
         ensemble_of_durations = list()
-        for ensemble in range(2):
+        for ensemble in range(1):
            alignment_paths.append(self.acoustic_model.inference(mel=melspec.to(self.device),
                                                                 tokens=joined_phonemes.to(self.device),
-                                                                save_img_for_debug="alignment.png" if ensemble == 1 else None,
+                                                                save_img_for_debug="alignment.png" if ensemble == 0 else None,
                                                                 return_ctc=False))
        for alignment_path in alignment_paths:
            ensemble_of_durations.append(self.dc(torch.LongTensor(alignment_path), vis=None).squeeze())
@@ -161,13 +197,10 @@ article = "<p style='text-align: left'>This is still a work in progress, models
 
 iface = gr.Interface(fn=meta_model.read,
                     inputs=[gr.inputs.Dropdown(
-                        ["Hello, here is the first sentence. And here comes the second one. I think three sentences is enough to get the point across, right?",
-                         "I am excited! And my prosody is rather flat. And this sentence is shocking!",
-                         "Don't do it! But I want to! Then go ahead.",
-                         "How many examples do I realistically need? How about five? That should do it!",
-                         "Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better."],
+                        [
+                            "Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better."],
                        type="value",
-                       default="Hello, here is the first sentence. And here comes the second one. I think three sentences is enough to get the point across, right?",
+                       default="Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better.",
                        label="Select which utterance should be customized"),
                        gr.inputs.Dropdown(["Voice 1",
                                            "Voice 2",
@@ -185,11 +218,11 @@ iface = gr.Interface(fn=meta_model.read,
                        gr.outputs.Audio(type="file", label="Reference-Voice 3"),
                        gr.outputs.Audio(type="numpy", label="Customized Audio")],
                    layout="vertical",
-                   title="Speech Customization through Prosody Cloning",
+                   title="Speech Customization",
                    thumbnail="Utility/toucan.png",
                    theme="default",
                    allow_flagging="never",
                    allow_screenshot=False,
                    description="In this demo, an audio is split automatically into individual sentences. Then each of the sentences is re-synthesized into speech with the exact same prosody, but with a voice that you can choose. This allows customizing any existing read speech while retaining as much from the original reading as possible. Unfortunately, we cannot show you the reference audio and the reference voices ahead of time, so they will be displayed together with the resulting cloned speech.",
                    article=article)
-iface.launch(enable_queue=True)
+iface.launch(enable_queue=True)
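
The substantive change in this commit is that the aligner is fine-tuned for a few CTC steps on the single reference recording before it is used to extract phone durations. A minimal standalone sketch of that step, using only the Toucan calls that appear in the diff above (the helper name finetune_aligner and its argument list are illustrative, not part of the repository):

# Sketch of the aligner fine-tuning step added in this commit. The function name and
# signature are hypothetical; all calls on the Toucan objects mirror the diff above.
import soundfile
import torch
from tqdm import tqdm

from Preprocessing.AudioPreprocessor import AudioPreprocessor
from Preprocessing.articulatory_features import get_feature_to_index_lookup


def finetune_aligner(acoustic_model, text_frontend, transcript, wav_path, device="cpu", steps=10, lr=0.1):
    # load the single reference recording and build its spectrogram
    audio, sr = soundfile.read(wav_path)
    ap = AudioPreprocessor(input_sr=sr, output_sr=16000)
    mel = ap.audio_to_mel_spec_tensor(audio, normalize=True).transpose(0, 1).unsqueeze(0).to(device)
    mel_len = torch.LongTensor([len(mel[0])]).to(device)

    # convert articulatory feature vectors back into phone IDs to serve as CTC targets,
    # skipping word boundaries, which are not audible in the recording
    tokens = []
    for vector in text_frontend.string_to_tensor(transcript):
        if vector[get_feature_to_index_lookup()["word-boundary"]] == 0:
            for phone in text_frontend.phone_to_vector:
                if vector.numpy().tolist()[13:] == text_frontend.phone_to_vector[phone][13:]:
                    tokens.append(text_frontend.phone_to_id[phone])
                    break
    tokens = torch.LongTensor(tokens).to(device)
    tokens_len = torch.LongTensor([len(tokens)]).to(device)

    # a handful of SGD steps on the CTC loss adapt the aligner to this one speaker and recording
    optim_asr = torch.optim.SGD(acoustic_model.parameters(), lr=lr)
    acoustic_model.train()
    for _ in tqdm(range(steps)):
        pred = acoustic_model(mel)
        loss = acoustic_model.ctc_loss(pred.transpose(0, 1).log_softmax(2), tokens, mel_len, tokens_len)
        optim_asr.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(acoustic_model.parameters(), 1.0)
        optim_asr.step()
    acoustic_model.eval()
    return acoustic_model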