Ahsen Khaliq committed on
Commit
86ac495
1 Parent(s): 46b8fc3

Update demo_cli.py

Browse files
Files changed (1) hide show
  1. demo_cli.py +74 -80
demo_cli.py CHANGED
@@ -132,89 +132,83 @@ if __name__ == '__main__':
132
 
133
  print("Interactive generation loop")
134
  # while True:
135
- try:
136
- # Get the reference audio filepath
137
- message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " "wav, m4a, flac, ...):\n"
138
- in_fpath = args.audio_path
139
 
140
- if in_fpath.suffix.lower() == ".mp3" and args.no_mp3_support:
141
- print("Can't Use mp3 files please try again:")
142
- ## Computing the embedding
143
- # First, we load the wav using the function that the speaker encoder provides. This is
144
- # important: there is preprocessing that must be applied.
145
-
146
- # The following two methods are equivalent:
147
- # - Directly load from the filepath:
148
- preprocessed_wav = encoder.preprocess_wav(in_fpath)
149
- # - If the wav is already loaded:
150
- original_wav, sampling_rate = librosa.load(str(in_fpath))
151
- preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
152
- print("Loaded file succesfully")
153
-
154
- # Then we derive the embedding. There are many functions and parameters that the
155
- # speaker encoder interfaces. These are mostly for in-depth research. You will typically
156
- # only use this function (with its default parameters):
157
- embed = encoder.embed_utterance(preprocessed_wav)
158
- print("Created the embedding")
159
-
160
-
161
- ## Generating the spectrogram
162
- text = args.text
163
-
164
- # If seed is specified, reset torch seed and force synthesizer reload
165
- if args.seed is not None:
166
- torch.manual_seed(args.seed)
167
- synthesizer = Synthesizer(args.syn_model_fpath)
168
 
169
- # The synthesizer works in batch, so you need to put your data in a list or numpy array
170
- texts = [text]
171
- embeds = [embed]
172
- # If you know what the attention layer alignments are, you can retrieve them here by
173
- # passing return_alignments=True
174
- specs = synthesizer.synthesize_spectrograms(texts, embeds)
175
- spec = specs[0]
176
- print("Created the mel spectrogram")
177
-
178
-
179
- ## Generating the waveform
180
- print("Synthesizing the waveform:")
181
 
182
- # If seed is specified, reset torch seed and reload vocoder
183
- if args.seed is not None:
184
- torch.manual_seed(args.seed)
185
- vocoder.load_model(args.voc_model_fpath)
186
 
187
- # Synthesizing the waveform is fairly straightforward. Remember that the longer the
188
- # spectrogram, the more time-efficient the vocoder.
189
- generated_wav = vocoder.infer_waveform(spec)
190
-
191
-
192
- ## Post-generation
193
- # There's a bug with sounddevice that makes the audio cut one second earlier, so we
194
- # pad it.
195
- generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
196
 
197
- # Trim excess silences to compensate for gaps in spectrograms (issue #53)
198
- generated_wav = encoder.preprocess_wav(generated_wav)
199
-
200
- # Play the audio (non-blocking)
201
- if not args.no_sound:
202
- try:
203
- sd.stop()
204
- sd.play(generated_wav, synthesizer.sample_rate)
205
- except sd.PortAudioError as e:
206
- print("\nCaught exception: %s" % repr(e))
207
- print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")
208
- except:
209
- raise
210
-
211
- # Save it on the disk
212
- filename = "demo_output_1.wav"
213
- print(generated_wav.dtype)
214
- sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
215
- print("\nSaved output as %s\n\n" % filename)
216
-
217
 
218
- except Exception as e:
219
- print("Caught exception: %s" % repr(e))
220
- print("Restarting\n")
 
 
132
 
133
  print("Interactive generation loop")
134
  # while True:
135
+ # Get the reference audio filepath
136
+ message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " "wav, m4a, flac, ...):\n"
137
+ in_fpath = args.audio_path
 
138
 
139
+ if in_fpath.suffix.lower() == ".mp3" and args.no_mp3_support:
140
+ print("Can't Use mp3 files please try again:")
141
+ ## Computing the embedding
142
+ # First, we load the wav using the function that the speaker encoder provides. This is
143
+ # important: there is preprocessing that must be applied.
144
+
145
+ # The following two methods are equivalent:
146
+ # - Directly load from the filepath:
147
+ preprocessed_wav = encoder.preprocess_wav(in_fpath)
148
+ # - If the wav is already loaded:
149
+ original_wav, sampling_rate = librosa.load(str(in_fpath))
150
+ preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
151
+ print("Loaded file succesfully")
152
+
153
+ # Then we derive the embedding. There are many functions and parameters that the
154
+ # speaker encoder interfaces. These are mostly for in-depth research. You will typically
155
+ # only use this function (with its default parameters):
156
+ embed = encoder.embed_utterance(preprocessed_wav)
157
+ print("Created the embedding")
158
+
159
+
160
+ ## Generating the spectrogram
161
+ text = args.text
162
+
163
+ # If seed is specified, reset torch seed and force synthesizer reload
164
+ if args.seed is not None:
165
+ torch.manual_seed(args.seed)
166
+ synthesizer = Synthesizer(args.syn_model_fpath)
167
 
168
+ # The synthesizer works in batch, so you need to put your data in a list or numpy array
169
+ texts = [text]
170
+ embeds = [embed]
171
+ # If you know what the attention layer alignments are, you can retrieve them here by
172
+ # passing return_alignments=True
173
+ specs = synthesizer.synthesize_spectrograms(texts, embeds)
174
+ spec = specs[0]
175
+ print("Created the mel spectrogram")
176
+
177
+
178
+ ## Generating the waveform
179
+ print("Synthesizing the waveform:")
180
 
181
+ # If seed is specified, reset torch seed and reload vocoder
182
+ if args.seed is not None:
183
+ torch.manual_seed(args.seed)
184
+ vocoder.load_model(args.voc_model_fpath)
185
 
186
+ # Synthesizing the waveform is fairly straightforward. Remember that the longer the
187
+ # spectrogram, the more time-efficient the vocoder.
188
+ generated_wav = vocoder.infer_waveform(spec)
189
+
190
+
191
+ ## Post-generation
192
+ # There's a bug with sounddevice that makes the audio cut one second earlier, so we
193
+ # pad it.
194
+ generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
195
 
196
+ # Trim excess silences to compensate for gaps in spectrograms (issue #53)
197
+ generated_wav = encoder.preprocess_wav(generated_wav)
198
+
199
+ # Play the audio (non-blocking)
200
+ if not args.no_sound:
201
+ try:
202
+ sd.stop()
203
+ sd.play(generated_wav, synthesizer.sample_rate)
204
+ except sd.PortAudioError as e:
205
+ print("\nCaught exception: %s" % repr(e))
206
+ print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")
207
+ except:
208
+ raise
 
 
 
 
 
 
 
209
 
210
+ # Save it on the disk
211
+ filename = "demo_output_1.wav"
212
+ print(generated_wav.dtype)
213
+ sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
214
+ print("\nSaved output as %s\n\n" % filename)