Flux9665 committed
Commit 52c413f
1 Parent(s): 464096d

Update app.py

Files changed (1)
  1. app.py  +56 -23
app.py CHANGED
@@ -2,8 +2,10 @@ import os
 
 import gradio as gr
 import numpy as np
+import soundfile
 import soundfile as sf
 import torch
+from tqdm import tqdm
 
 os.system("git clone --branch v2.3 https://github.com/DigitalPhonetics/IMS-Toucan.git toucan_codebase")
 os.system("mv toucan_codebase/* .")
@@ -17,6 +19,7 @@ from Preprocessing.AudioPreprocessor import AudioPreprocessor
 from TrainingInterfaces.Text_to_Spectrogram.AutoAligner.Aligner import Aligner
 from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.DurationCalculator import DurationCalculator
 from InferenceInterfaces.UtteranceCloner import UtteranceCloner
+from Preprocessing.articulatory_features import get_feature_to_index_lookup
 
 
 def float2pcm(sig, dtype='int16'):
@@ -39,6 +42,7 @@ class TTS_Interface:
 
     def __init__(self):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
         self.utterance_cloner = UtteranceCloner(model_id="Meta", device=self.device)
         self.speaker_path_lookup = {
             "Voice 1": "reference_audios/voice_1.flac",
@@ -50,25 +54,54 @@ class TTS_Interface:
         self.acoustic_model = self.acoustic_model.to(self.device)
         self.dc = DurationCalculator(reduction_factor=1)
         self.tf = ArticulatoryCombinedTextFrontend(language="en")
-
-    def read(self, prompt, speaker_1, speaker_2, speaker_3):
-        if prompt == "Hello, here is the first sentence. And here comes the second one. I think three sentences is enough to get the point across, right?":
-            reference_audio = "reference_audios/clone_me_1.wav"
-        elif prompt == "I am excited! And my prosody is rather flat. And this sentence is shocking!":
-            reference_audio = "reference_audios/clone_me_2.wav"
-        elif prompt == "Don't do it! But I want to! Then go ahead.":
-            reference_audio = "reference_audios/clone_me_3.wav"
-        elif prompt == "How many examples do I realistically need? How about five? That should do it!":
-            reference_audio = "reference_audios/clone_me_4.wav"
-        elif prompt == "Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better.":
-            reference_audio = "reference_audios/clone_me_5.wav"
-
+        example_audio, sr = soundfile.read("reference_audios/clone_me_5.wav")
+        self.ap = AudioPreprocessor(input_sr=sr, output_sr=16000)
+
+        ## finetune aligner
+        steps = 10
+        tokens = list()  # we need an ID sequence for training rather than a sequence of phonological features
+        for vector in self.tf.string_to_tensor(
+                "Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better."):
+            if vector[get_feature_to_index_lookup()["word-boundary"]] == 0:
+                # we don't include word boundaries when performing alignment, since they are not always present in audio.
+                for phone in self.tf.phone_to_vector:
+                    if vector.numpy().tolist()[13:] == self.tf.phone_to_vector[phone][13:]:
+                        # the first 12 dimensions are for modifiers, so we ignore those when trying to find the phoneme in the ID lookup
+                        tokens.append(self.tf.phone_to_id[phone])
+                        # this is terribly inefficient, but it's fine
+                        break
+        tokens = torch.LongTensor(tokens).squeeze().to(self.device)
+        tokens_len = torch.LongTensor([len(tokens)]).to(self.device)
+        mel = self.ap.audio_to_mel_spec_tensor(example_audio, normalize=True).transpose(0, 1).unsqueeze(0).to(self.device)
+        mel.requires_grad = True
+        mel_len = torch.LongTensor([len(mel[0])]).to(self.device)
+        # actual fine-tuning starts here
+        optim_asr = torch.optim.SGD(self.acoustic_model.parameters(), lr=0.1)
+        self.acoustic_model.train()
+        for _ in tqdm(list(range(steps))):
+            pred = self.acoustic_model(mel)
+            loss = self.acoustic_model.ctc_loss(pred.transpose(0, 1).log_softmax(2), tokens, mel_len, tokens_len)
+            optim_asr.zero_grad()
+            loss.backward()
+            torch.nn.utils.clip_grad_norm_(self.acoustic_model.parameters(), 1.0)
+            optim_asr.step()
+        self.acoustic_model.eval()
+        ## done finetuning
+
+        reference_audio = "reference_audios/clone_me_5.wav"
+        prompt = "Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better."
         text_list = prompt.replace(".", ".|").replace("?", "?|").replace("!", "!|").split("|")
         # we don't split on the punctuation marks because we want to retain them.
 
         self.split_audio(reference_audio, text_list)
         # at this point, split_1.wav, split_2.wav and split_3.wav should exist.
 
+    def read(self, _, speaker_1, speaker_2, speaker_3):
+        reference_audio = "reference_audios/clone_me_5.wav"
+        prompt = "Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better."
+        text_list = prompt.replace(".", ".|").replace("?", "?|").replace("!", "!|").split("|")
+        # we don't split on the punctuation marks because we want to retain them.
+
         self.utterance_cloner.tts.set_utterance_embedding(self.speaker_path_lookup[speaker_1])
         part_1 = self.utterance_cloner.clone_utterance(path_to_reference_audio="split_1.wav",
                                                        reference_transcription=text_list[0],
@@ -103,9 +136,12 @@ class TTS_Interface:
 
         # extract phonemes
         lines = list()
+        self.tf.use_word_boundaries = False  # this causes problems when splitting otherwise
         for segment in text_list:
             if segment.strip() != "":
                 lines.append(self.tf.string_to_tensor(segment, handle_missing=False).squeeze())
+        self.tf.use_word_boundaries = True
+
         # postprocess phonemes: [~ sentence ~ #] --> [sentence ~] except for the first one, which is [~ sentence ~]
         processed_lines = list()
         for index, line in enumerate(lines):
@@ -119,10 +155,10 @@
         # get durations of each phone in audio as average of an ensemble
         alignment_paths = list()
         ensemble_of_durations = list()
-        for ensemble in range(2):
+        for ensemble in range(1):
            alignment_paths.append(self.acoustic_model.inference(mel=melspec.to(self.device),
                                                                 tokens=joined_phonemes.to(self.device),
-                                                                save_img_for_debug="alignment.png" if ensemble == 1 else None,
+                                                                save_img_for_debug="alignment.png" if ensemble == 0 else None,
                                                                 return_ctc=False))
        for alignment_path in alignment_paths:
            ensemble_of_durations.append(self.dc(torch.LongTensor(alignment_path), vis=None).squeeze())
@@ -161,13 +197,10 @@ article = "<p style='text-align: left'>This is still a work in progress, models
 
 iface = gr.Interface(fn=meta_model.read,
                     inputs=[gr.inputs.Dropdown(
-                        ["Hello, here is the first sentence. And here comes the second one. I think three sentences is enough to get the point across, right?",
-                         "I am excited! And my prosody is rather flat. And this sentence is shocking!",
-                         "Don't do it! But I want to! Then go ahead.",
-                         "How many examples do I realistically need? How about five? That should do it!",
-                         "Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better."],
+                        [
+                            "Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better."],
                        type="value",
-                       default="Hello, here is the first sentence. And here comes the second one. I think three sentences is enough to get the point across, right?",
+                       default="Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better.",
                        label="Select which utterance should be customized"),
                        gr.inputs.Dropdown(["Voice 1",
                                            "Voice 2",
@@ -185,11 +218,11 @@ iface = gr.Interface(fn=meta_model.read,
                        gr.outputs.Audio(type="file", label="Reference-Voice 3"),
                        gr.outputs.Audio(type="numpy", label="Customized Audio")],
                    layout="vertical",
-                   title="Speech Customization through Prosody Cloning",
+                   title="Speech Customization",
                    thumbnail="Utility/toucan.png",
                    theme="default",
                    allow_flagging="never",
                    allow_screenshot=False,
                    description="In this demo, an audio is split automatically into individual sentences. Then each of the sentences is re-synthesized into speech with the exact same prosody, but with a voice that you can choose. This allows customizing any existing read speech while retaining as much from the original reading as possible. Unfortunately, we cannot show you the reference audio and the reference voices ahead of time, so they will be displayed together with the resulting cloned speech.",
                    article=article)
-iface.launch(enable_queue=True)
+iface.launch(enable_queue=True)
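
The substantive change in this commit is that the aligner is fine-tuned for a few CTC steps on the single reference recording before it is used to extract phone durations. A minimal standalone sketch of that step, using only the Toucan calls that appear in the diff above (the helper name finetune_aligner and its argument list are illustrative, not part of the repository):

# Sketch of the aligner fine-tuning step added in this commit. The function name and
# signature are hypothetical; all calls on the Toucan objects mirror the diff above.
import soundfile
import torch
from tqdm import tqdm

from Preprocessing.AudioPreprocessor import AudioPreprocessor
from Preprocessing.articulatory_features import get_feature_to_index_lookup


def finetune_aligner(acoustic_model, text_frontend, transcript, wav_path, device="cpu", steps=10, lr=0.1):
    # load the single reference recording and build its spectrogram
    audio, sr = soundfile.read(wav_path)
    ap = AudioPreprocessor(input_sr=sr, output_sr=16000)
    mel = ap.audio_to_mel_spec_tensor(audio, normalize=True).transpose(0, 1).unsqueeze(0).to(device)
    mel_len = torch.LongTensor([len(mel[0])]).to(device)

    # convert articulatory feature vectors back into phone IDs to serve as CTC targets,
    # skipping word boundaries, which are not audible in the recording
    tokens = []
    for vector in text_frontend.string_to_tensor(transcript):
        if vector[get_feature_to_index_lookup()["word-boundary"]] == 0:
            for phone in text_frontend.phone_to_vector:
                if vector.numpy().tolist()[13:] == text_frontend.phone_to_vector[phone][13:]:
                    tokens.append(text_frontend.phone_to_id[phone])
                    break
    tokens = torch.LongTensor(tokens).to(device)
    tokens_len = torch.LongTensor([len(tokens)]).to(device)

    # a handful of SGD steps on the CTC loss adapt the aligner to this one speaker and recording
    optim_asr = torch.optim.SGD(acoustic_model.parameters(), lr=lr)
    acoustic_model.train()
    for _ in tqdm(range(steps)):
        pred = acoustic_model(mel)
        loss = acoustic_model.ctc_loss(pred.transpose(0, 1).log_softmax(2), tokens, mel_len, tokens_len)
        optim_asr.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(acoustic_model.parameters(), 1.0)
        optim_asr.step()
    acoustic_model.eval()
    return acoustic_model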