Florian Lux committed on
Commit
e8958d3
1 Parent(s): 562e6d0

implement enjambements demo

InferenceInterfaces/Meta_FastSpeech2.py CHANGED
@@ -40,9 +40,13 @@ class Meta_FastSpeech2(torch.nn.Module):
         self.text2phone = ArticulatoryCombinedTextFrontend(language=lang_id, add_silence_to_end=True, silent=True)
         self.lang_id = get_language_id(lang_id).to(self.device)
 
-    def forward(self, text, view=False, durations=None, pitch=None, energy=None):
-        with torch.no_grad():
-            phones = self.text2phone.string_to_tensor(text).to(torch.device(self.device))
+    def forward(self, text, view=False, durations=None, pitch=None, energy=None, phones = False):
+        with torch.inference_mode():
+            if phones is False:
+                phones = self.text2phone.string_to_tensor(text).to(torch.device(self.device))
+            else:
+                phones = self.text2phone.string_to_tensor(text, input_phonemes=True).to(torch.device(self.device))
+
             mel, durations, pitch, energy = self.phone2mel(phones,
                                                            return_duration_pitch_energy=True,
                                                            utterance_embedding=self.default_utterance_embedding,
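For orientation: the new phones flag gives forward two input modes. By default it phonemizes the raw text itself; with phones=True it accepts a phone string that was computed beforehand with the text frontend, which is what lets the demo reuse one set of cloned durations, pitch and energy across different voices. Below is a minimal usage sketch, not the demo itself, assuming the UtteranceCloner wiring shown in app.py further down; the poem line and the audio paths are placeholders.

from run_utterance_cloner import UtteranceCloner

# device and file paths are placeholders, adjust to your setup
cloner = UtteranceCloner(device="cpu")
cloner.tts.set_language("de")

poem_text = "Quellen hattest du ihm, hattest dem Flüchtigen, kühle Schatten geschenkt."
reference_audio = "reference_audios/2.wav"

# clone duration, pitch and energy from a reference reading of the same text
duration, pitch, energy, _, _ = cloner.extract_prosody(poem_text, reference_audio, lang="de", on_line_fine_tune=False)

# feed the pre-computed phone string back in with phones=True, so the cloned
# prosody stays aligned to the input sequence; the voice comes from the embedding
phone_string = cloner.tts.text2phone.get_phone_string(poem_text)
cloner.tts.set_utterance_embedding("reference_audios/german_female.wav")
wave = cloner.tts(phone_string, durations=duration, pitch=pitch, energy=energy, phones=True).cpu().numpy()

Passing the same phone string that the prosody was extracted for keeps the duration, pitch and energy vectors index-aligned with the input, which is what the manipulation code in app.py relies on.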
app.py CHANGED
@@ -1,18 +1,15 @@
-import os
-
 import gradio as gr
 import numpy as np
-import soundfile as sf
 import torch
-
+import math
 from Preprocessing.ArticulatoryCombinedTextFrontend import ArticulatoryCombinedTextFrontend
-from Preprocessing.AudioPreprocessor import AudioPreprocessor
 from TrainingInterfaces.Text_to_Spectrogram.AutoAligner.Aligner import Aligner
 from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.DurationCalculator import DurationCalculator
 from run_utterance_cloner import UtteranceCloner
 
-os.system("pip uninstall -y gradio")
-os.system("pip install gradio==2.7.5.2")
+
+# os.system("pip uninstall -y gradio")
+# os.system("pip install gradio==2.7.5.2")
 
 
 def float2pcm(sig, dtype='int16'):
@@ -36,148 +33,173 @@ class TTS_Interface:
     def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.utterance_cloner = UtteranceCloner(device=self.device)
-       self.speaker_path_lookup = {
-           "Voice 1": "reference_audios/voice_1.flac",
-           "Voice 2": "reference_audios/voice_2.wav",
-           "Voice 3": "reference_audios/voice_3.wav",
-       }
+       self.utterance_cloner.tts.set_language("de")
        self.acoustic_model = Aligner()
        self.acoustic_model.load_state_dict(torch.load("Models/Aligner/aligner.pt", map_location='cpu')["asr_model"])
        self.acoustic_model = self.acoustic_model.to(self.device)
        self.dc = DurationCalculator(reduction_factor=1)
        self.tf = ArticulatoryCombinedTextFrontend(language="en")
-
-    def read(self, prompt, speaker_1, speaker_2, speaker_3):
-       if prompt == "Hello, here is the first sentence. And here comes the second one. I think three sentences is enough to get the point across, right?":
-           reference_audio = "reference_audios/clone_me_1.wav"
-       elif prompt == "I am excited! And my prosody is rather flat. And this sentence is shocking!":
-           reference_audio = "reference_audios/clone_me_2.wav"
-       elif prompt == "Don't do it! But I want to! Then go ahead.":
-           reference_audio = "reference_audios/clone_me_3.wav"
-       elif prompt == "How many examples do I realistically need? How about five? That should do it!":
-           reference_audio = "reference_audios/clone_me_4.wav"
-       elif prompt == "Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better.":
-           reference_audio = "reference_audios/clone_me_5.wav"
-
-       text_list = prompt.replace(".", ".|").replace("?", "?|").replace("!", "!|").split("|")
-       # we don't split on the punctuation marks because we want to retain them.
-
-       self.split_audio(reference_audio, text_list)
-       # at this point, split_1.wav, split_2.wav and split_3.wav should exist.
-
-       self.utterance_cloner.tts.set_utterance_embedding(self.speaker_path_lookup[speaker_1])
-       part_1 = self.utterance_cloner.clone_utterance(path_to_reference_audio="split_1.wav",
-                                                      reference_transcription=text_list[0],
-                                                      clone_speaker_identity=False,
-                                                      lang="en")
-
-       self.utterance_cloner.tts.set_utterance_embedding(self.speaker_path_lookup[speaker_2])
-       part_2 = self.utterance_cloner.clone_utterance(path_to_reference_audio="split_2.wav",
-                                                      reference_transcription=text_list[1],
-                                                      clone_speaker_identity=False,
-                                                      lang="en")
-
-       self.utterance_cloner.tts.set_utterance_embedding(self.speaker_path_lookup[speaker_3])
-       part_3 = self.utterance_cloner.clone_utterance(path_to_reference_audio="split_3.wav",
-                                                      reference_transcription=text_list[2],
-                                                      clone_speaker_identity=False,
-                                                      lang="en")
-
-       return "alignment.png", reference_audio, (48000, float2pcm(torch.cat([part_1, part_2, part_3], dim=0).numpy()))
-
-    def split_audio(self, path_to_audio, text_list):
-       # extract audio
-       audio, sr = sf.read(path_to_audio)
-       ap = AudioPreprocessor(input_sr=sr, output_sr=16000, melspec_buckets=80, hop_length=256, n_fft=1024, cut_silence=False)
-       norm_wave = ap.audio_to_wave_tensor(normalize=True, audio=audio)
-       melspec = ap.audio_to_mel_spec_tensor(audio=norm_wave, normalize=False, explicit_sampling_rate=16000).transpose(0, 1)
-
-       # extract phonemes
-       lines = list()
-       for segment in text_list:
-           if segment.strip() != "":
-               lines.append(self.tf.string_to_tensor(segment, handle_missing=False).squeeze())
-       # postprocess phonemes: [~ sentence ~ #] --> [sentence ~] except for the first one, which is [~ sentence ~]
-       processed_lines = list()
-       for index, line in enumerate(lines):
-           if index == 0:
-               processed_lines.append(line[:-1])
-           else:
-               processed_lines.append(line[1:-1])
-       lines = processed_lines
-       joined_phonemes = torch.cat(lines, dim=0)
-
-       # get durations of each phone in audio as average of an ensemble
-       alignment_paths = list()
-       ensemble_of_durations = list()
-       for ensemble in range(2):
-           alignment_paths.append(self.acoustic_model.inference(mel=melspec.to(self.device),
-                                                                tokens=joined_phonemes.to(self.device),
-                                                                save_img_for_debug="alignment.png" if ensemble == 1 else None,
-                                                                return_ctc=False))
-       for alignment_path in alignment_paths:
-           ensemble_of_durations.append(self.dc(torch.LongTensor(alignment_path), vis=None).squeeze())
-       durations = list()
-       for i, _ in enumerate(ensemble_of_durations[0]):
-           duration_of_phone = list()
-           for ensemble_member in ensemble_of_durations:
-               duration_of_phone.append(ensemble_member.squeeze()[i])
-           durations.append(sum(duration_of_phone) / len(duration_of_phone))
-
-       # cut audio according to duration sum of each line in transcript
-       line_lens = [len(x) for x in lines]
-       index = 0
-       segment_durations = list()
-       for num_phones in line_lens:
-           segment_durations.append(sum(durations[index: index + num_phones]))
-           index += num_phones
-       spec_to_wave_factor = len(norm_wave) / sum(segment_durations)
-       wave_segment_lens = [int(x * spec_to_wave_factor) for x in segment_durations]
-       start_index = 0
-       wave_segments = list()
-       for index, segment_len in enumerate(wave_segment_lens):
-           if index == len(wave_segment_lens) - 1:
-               wave_segments.append(norm_wave[start_index:])
-           else:
-               wave_segments.append(norm_wave[start_index: start_index + segment_len])
-               start_index += segment_len
-
-       # write the audio segments into new files
-       for index, wave_segment in enumerate(wave_segments):
-           sf.write(f"split_{index + 1}.wav", wave_segment, 16000)
-
-
-meta_model = TTS_Interface()
-article = "<p style='text-align: left'>This is still a work in progress, models will be exchanged for better ones as soon as they are done. More diverse training data can help with more exact cloning. For example we are still trying to incorporate more singing data. </p><p style='text-align: center'><a href='https://github.com/DigitalPhonetics/IMS-Toucan' target='_blank'>Click here to learn more about the IMS Toucan Speech Synthesis Toolkit</a></p>"
-
-iface = gr.Interface(fn=meta_model.read,
-                     inputs=[gr.inputs.Dropdown(
-                         ["Hello, here is the first sentence. And here comes the second one. I think three sentences is enough to get the point across, right?",
-                          "I am excited! And my prosody is rather flat. And this sentence is shocking!",
-                          "Don't do it! But I want to! Then go ahead.",
-                          "How many examples do I realistically need? How about five? That should do it!",
-                          "Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better."],
+       self.text = "Quellen hattest du ihm, hattest dem Flüchtigen, kühle Schatten geschenkt, und die Gestade sahen, all ihm nach, und es bebte, aus den Wellen ihr lieblich Bild."
+       reference_audio = "reference_audios/2.wav"
+       self.duration, self.pitch, self.energy, _, _ = self.utterance_cloner.extract_prosody(self.text, reference_audio, lang="de", on_line_fine_tune=False)
+       self.phones = self.utterance_cloner.tts.text2phone.get_phone_string(self.text)
+
+       #######
+       self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_male.wav")
+       self.current_voice = "male"
+       self.cloned_speech_male = self.utterance_cloner.tts(self.phones,
+                                                           view=False,
+                                                           durations=self.duration,
+                                                           pitch=self.pitch,
+                                                           energy=self.energy,
+                                                           phones=True).cpu().numpy()
+       self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_female.wav")
+       self.current_voice = "female"
+       self.cloned_speech_female = self.utterance_cloner.tts(self.phones,
+                                                             view=False,
+                                                             durations=self.duration,
+                                                             pitch=self.pitch,
+                                                             energy=self.energy,
+                                                             phones=True).cpu().numpy()
+
+       #######
+       self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_male.wav")
+       self.current_voice = "male"
+       self.reg_speech_male = self.utterance_cloner.tts(
+           "Quellen hattest du ihm, hattest dem Flüchtigen kühle Schatten geschenkt, und die Gestade sahen all ihm nach, und es bebte aus den Wellen ihr lieblich Bild.",
+           view=False).cpu().numpy()
+       self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_female.wav")
+       self.current_voice = "female"
+       self.reg_speech_female = self.utterance_cloner.tts(
+           "Quellen hattest du ihm, hattest dem Flüchtigen kühle Schatten geschenkt, und die Gestade sahen all ihm nach, und es bebte aus den Wellen ihr lieblich Bild.",
+           view=False).cpu().numpy()
+
+    def read(self, _, speaker, lengthening, pause_dur, pitch_up):
+
+       if speaker == "Female Voice" and self.current_voice != "female":
+           self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_female.wav")
+           self.current_voice = "female"
+       elif speaker == "Male Voice" and self.current_voice != "male":
+           self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_male.wav")
+           self.current_voice = "male"
+
+       duration = self.duration.clone()
+       # lengthening
+       lenghtening_candidates = [ # ('f', 27),
+           # ('l', 28),
+           ('ʏ', 29),
+           ('ç', 30),
+           # ('t', 31),
+           ('ɪ', 32),
+           # ('ɡ', 33),
+           ('ə', 34),
+           ('n', 35),
+
+           # ('z', 66),
+           ('ɑ', 67),
+           # ('ə', 68),
+           ('n', 69),
+
+           # ('b', 84),
+           ('e', 85),
+           # ('p', 86),
+           # ('t', 87),
+           ('ə', 88)
+       ]
+
+       for lenghtening_candidate in lenghtening_candidates:
+           duration[lenghtening_candidate[1]] = duration[lenghtening_candidate[1]] + lengthening
+
+       # pauses
+       pause_candidates = [('~', 36),
+                           ('~', 70),
+                           ('~', 89)]
+
+       for pause_candidate in pause_candidates:
+           duration[pause_candidate[1]] = duration[pause_candidate[1]] + pause_dur
+
+       pitch = self.pitch.clone()
+       # pitch raise
+
+       pitch_candidates = [ # ('k', 37),
+           ('y', 38),
+           ('l', 39),
+           ('ə', 40),
+           ('ʃ', 41),
+           ('a', 42),
+           ('t', 43),
+           # ('ə', 44),
+           # ('n', 45),
+
+           ('a', 71),
+           ('l', 72),
+
+           ('v', 96),
+           ('ɛ', 97),
+           ('l', 98),
+           # ('ə', 99),
+           # ('n', 100)
+       ]
+
+       for pitch_candidate in pitch_candidates:
+           pitch[pitch_candidate[1]] = pitch[pitch_candidate[1]] + pitch_up
+
+       fixme = [('f', 27),
+                ('l', 28),
+                ('ʏ', 29),
+                ('ç', 30),
+                ('t', 31),
+                ('ɪ', 32),
+                ('ɡ', 33),
+                ('ə', 34),
+                ('n', 35)
+                ]
+       for pitch_candidate in fixme:
+           pitch[pitch_candidate[1]] = pitch[pitch_candidate[1]] - abs(pitch_up)
+
+       manipulated_speech = self.utterance_cloner.tts(self.phones,
+                                                      view=False,
+                                                      durations=duration,
+                                                      pitch=pitch,
+                                                      energy=self.energy,
+                                                      phones=True).cpu()
+
+       if self.current_voice == "female":
+           cloned_speech = self.cloned_speech_female
+           reg_speech = self.reg_speech_female
+       else:
+           cloned_speech = self.cloned_speech_male
+           reg_speech = self.reg_speech_male
+
+       return (48000, float2pcm(reg_speech)), (48000, float2pcm(cloned_speech)), (48000, float2pcm(manipulated_speech.numpy()))
+
+
+poem_model = TTS_Interface()
+article = "<p style='text-align: left'>This is still a work in progress, models will be exchanged for better ones as soon as they are done. More diverse training data can help with more exact cloning and more controllability. For example we are still trying to incorporate more singing data. </p><p style='text-align: center'><a href='https://github.com/DigitalPhonetics/IMS-Toucan' target='_blank'>Click here to learn more about the IMS Toucan Speech Synthesis Toolkit</a></p>"
+
+iface = gr.Interface(fn=poem_model.read,
+                     inputs=[gr.inputs.Dropdown([
+                         "Quellen hattest du ihm, hattest dem Flüchtigen // kühle Schatten geschenkt, und die Gestade sahn // all ihm nach, und es bebte // aus den Wellen ihr lieblich Bild."],
                          type="value",
-                         default="Hello, here is the first sentence. And here comes the second one. I think three sentences is enough to get the point across, right?",
-                         label="Select which utterance should be customized"),
-                     gr.inputs.Dropdown(["Voice 1",
-                                         "Voice 2",
-                                         "Voice 3"], type="value", default="Voice 1", label="Speaker selection for the first sentence"),
-                     gr.inputs.Dropdown(["Voice 1",
-                                         "Voice 2",
-                                         "Voice 3"], type="value", default="Voice 2", label="Speaker selection for the second sentence"),
-                     gr.inputs.Dropdown(["Voice 1",
-                                         "Voice 2",
-                                         "Voice 3"], type="value", default="Voice 3", label="Speaker selection for the third sentence")],
-                     outputs=[gr.outputs.Image(label="Alignment of Phonemes to Audio"),
-                              gr.outputs.Audio(type="file", label="Original Audio"),
-                              gr.outputs.Audio(type="numpy", label="Customized Audio")],
+                         default="Quellen hattest du ihm, hattest dem Flüchtigen // kühle Schatten geschenkt, und die Gestade sahn // all ihm nach, und es bebte // aus den Wellen ihr lieblich Bild.",
+                         label="Poem Transcript"),
+                     gr.inputs.Dropdown(["Female Voice", "Male Voice"],
+                                        type="value",
+                                        default="Female Voice",
+                                        label="Select a Speaker"),
+                     gr.inputs.Slider(minimum=0, maximum=4, step=1, default=2, label="Lengthening on verse end"),
+                     gr.inputs.Slider(minimum=0, maximum=20, step=1, default=8, label="Length of Pause after verse end"),
+                     gr.inputs.Slider(minimum=-0.4, maximum=0.4, step=0.01, default=0.2, label="Raise Pitch on new verse")
+                     ],
+                     outputs=[gr.outputs.Audio(type="numpy", label="Poem read with prose reading"),
+                              gr.outputs.Audio(type="numpy", label="Poem cloned from a reference"),
+                              gr.outputs.Audio(type="numpy", label="Poem after human-in-the-loop adjustments")],
                      layout="vertical",
-                     title="IMS Toucan Speech Customization through Voice Cloning Demo",
+                     title="PoeticTTS - Customizing Poetry for Literary Studies",
                      thumbnail="Utility/toucan.png",
                      theme="default",
                      allow_flagging="never",
                      allow_screenshot=False,
-                     description="In this demo, an audio is split automatically into individual sentences. Then each of the sentences is re-synthesized into speech with the exact same prosody, but with a voice that you can choose. This allows customizing any existing read speech while retaining as much from the original reading as possible.",
+                     description="Customize how a poem is read by a text-to-speech system with intuitive high-level controls. You can control markers of syntactic phrasing ",
                      article=article)
iface.launch(enable_queue=True)
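Note on the lengthening, pause and pitch candidate tables in read(): they are (phone, index) pairs that point into self.phones, the phone string produced by get_phone_string for this particular poem, so they only fit this one text. A rough sketch of how such indices could be looked up for a different poem, assuming the same German text frontend and that the indices of the phone string line up with the duration and pitch vectors; the poem line below is a placeholder.

from Preprocessing.ArticulatoryCombinedTextFrontend import ArticulatoryCombinedTextFrontend

tf = ArticulatoryCombinedTextFrontend(language="de")
poem_line = "Quellen hattest du ihm, hattest dem Flüchtigen, kühle Schatten geschenkt."  # placeholder

# print each phone with its position; verse-final phones would go into the
# lengthening and pitch tables, pause symbols ('~') into the pause table
for index, phone in enumerate(tf.get_phone_string(poem_line)):
    print(index, phone)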
run_utterance_cloner.py CHANGED
@@ -82,8 +82,6 @@ class UtteranceCloner:
         torch.nn.utils.clip_grad_norm_(acoustic_model.parameters(), 1.0)
         optim_asr.step()
         acoustic_model.eval()
-        torch.save({"asr_model": acoustic_model.state_dict()},
-                   os.path.join(os.path.join("Models", "Aligner", "aligner.pt")))
 
         alignment_path = acoustic_model.inference(mel=melspec.to(self.device),
                                                   tokens=text.to(self.device),