Ahsen Khaliq committed on
Commit c8546f3
1 Parent(s): 24829a1

Update demo_cli.py

Files changed (1)
  1. demo_cli.py +86 -85
demo_cli.py CHANGED
@@ -36,6 +36,10 @@ if __name__ == '__main__':
         "Optional random number seed value to make toolbox deterministic.")
     parser.add_argument("--no_mp3_support", action="store_true", help=\
         "If True, disallows loading mp3 files to prevent audioread errors when ffmpeg is not installed.")
+    parser.add_argument("-audio", "--audio_path", type=Path, required = True,
+                        help="Path to a audio file")
+    parser.add_argument("--text", type=str, required = True, help=\
+        "Text Input")
     args = parser.parse_args()
     print_args(args, parser)
     if not args.no_sound:
@@ -132,94 +136,91 @@ if __name__ == '__main__':
         "an explanation of what is happening.\n")
 
     print("Interactive generation loop")
-    num_generated = 0
-    while True:
-        try:
-            # Get the reference audio filepath
-            message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \
-                      "wav, m4a, flac, ...):\n"
-            in_fpath = Path(input(message).replace("\"", "").replace("\'", ""))
+    # while True:
+    try:
+        # Get the reference audio filepath
+        message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \
+                  "wav, m4a, flac, ...):\n"
+        in_fpath = args.audio_path
 
-            if in_fpath.suffix.lower() == ".mp3" and args.no_mp3_support:
-                print("Can't Use mp3 files please try again:")
-                continue
-            ## Computing the embedding
-            # First, we load the wav using the function that the speaker encoder provides. This is
-            # important: there is preprocessing that must be applied.
-
-            # The following two methods are equivalent:
-            # - Directly load from the filepath:
-            preprocessed_wav = encoder.preprocess_wav(in_fpath)
-            # - If the wav is already loaded:
-            original_wav, sampling_rate = librosa.load(str(in_fpath))
-            preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
-            print("Loaded file succesfully")
-
-            # Then we derive the embedding. There are many functions and parameters that the
-            # speaker encoder interfaces. These are mostly for in-depth research. You will typically
-            # only use this function (with its default parameters):
-            embed = encoder.embed_utterance(preprocessed_wav)
-            print("Created the embedding")
-
-
-            ## Generating the spectrogram
-            text = input("Write a sentence (+-20 words) to be synthesized:\n")
-
-            # If seed is specified, reset torch seed and force synthesizer reload
-            if args.seed is not None:
-                torch.manual_seed(args.seed)
-                synthesizer = Synthesizer(args.syn_model_fpath)
+        if in_fpath.suffix.lower() == ".mp3" and args.no_mp3_support:
+            print("Can't Use mp3 files please try again:")
+        ## Computing the embedding
+        # First, we load the wav using the function that the speaker encoder provides. This is
+        # important: there is preprocessing that must be applied.
+
+        # The following two methods are equivalent:
+        # - Directly load from the filepath:
+        preprocessed_wav = encoder.preprocess_wav(in_fpath)
+        # - If the wav is already loaded:
+        original_wav, sampling_rate = librosa.load(str(in_fpath))
+        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
+        print("Loaded file succesfully")
+
+        # Then we derive the embedding. There are many functions and parameters that the
+        # speaker encoder interfaces. These are mostly for in-depth research. You will typically
+        # only use this function (with its default parameters):
+        embed = encoder.embed_utterance(preprocessed_wav)
+        print("Created the embedding")
+
+
+        ## Generating the spectrogram
+        text = args.text
+
+        # If seed is specified, reset torch seed and force synthesizer reload
+        if args.seed is not None:
+            torch.manual_seed(args.seed)
+            synthesizer = Synthesizer(args.syn_model_fpath)
 
-            # The synthesizer works in batch, so you need to put your data in a list or numpy array
-            texts = [text]
-            embeds = [embed]
-            # If you know what the attention layer alignments are, you can retrieve them here by
-            # passing return_alignments=True
-            specs = synthesizer.synthesize_spectrograms(texts, embeds)
-            spec = specs[0]
-            print("Created the mel spectrogram")
-
-
-            ## Generating the waveform
-            print("Synthesizing the waveform:")
+        # The synthesizer works in batch, so you need to put your data in a list or numpy array
+        texts = [text]
+        embeds = [embed]
+        # If you know what the attention layer alignments are, you can retrieve them here by
+        # passing return_alignments=True
+        specs = synthesizer.synthesize_spectrograms(texts, embeds)
+        spec = specs[0]
+        print("Created the mel spectrogram")
+
+
+        ## Generating the waveform
+        print("Synthesizing the waveform:")
 
-            # If seed is specified, reset torch seed and reload vocoder
-            if args.seed is not None:
-                torch.manual_seed(args.seed)
-                vocoder.load_model(args.voc_model_fpath)
+        # If seed is specified, reset torch seed and reload vocoder
+        if args.seed is not None:
+            torch.manual_seed(args.seed)
+            vocoder.load_model(args.voc_model_fpath)
 
-            # Synthesizing the waveform is fairly straightforward. Remember that the longer the
-            # spectrogram, the more time-efficient the vocoder.
-            generated_wav = vocoder.infer_waveform(spec)
-
-
-            ## Post-generation
-            # There's a bug with sounddevice that makes the audio cut one second earlier, so we
-            # pad it.
-            generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
+        # Synthesizing the waveform is fairly straightforward. Remember that the longer the
+        # spectrogram, the more time-efficient the vocoder.
+        generated_wav = vocoder.infer_waveform(spec)
+
+
+        ## Post-generation
+        # There's a bug with sounddevice that makes the audio cut one second earlier, so we
+        # pad it.
+        generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
 
-            # Trim excess silences to compensate for gaps in spectrograms (issue #53)
-            generated_wav = encoder.preprocess_wav(generated_wav)
-
-            # Play the audio (non-blocking)
-            if not args.no_sound:
-                try:
-                    sd.stop()
-                    sd.play(generated_wav, synthesizer.sample_rate)
-                except sd.PortAudioError as e:
-                    print("\nCaught exception: %s" % repr(e))
-                    print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")
-                except:
-                    raise
-
-            # Save it on the disk
-            filename = "demo_output_%02d.wav" % num_generated
-            print(generated_wav.dtype)
-            sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
-            num_generated += 1
-            print("\nSaved output as %s\n\n" % filename)
-
+        # Trim excess silences to compensate for gaps in spectrograms (issue #53)
+        generated_wav = encoder.preprocess_wav(generated_wav)
+
+        # Play the audio (non-blocking)
+        if not args.no_sound:
+            try:
+                sd.stop()
+                sd.play(generated_wav, synthesizer.sample_rate)
+            except sd.PortAudioError as e:
+                print("\nCaught exception: %s" % repr(e))
+                print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")
+            except:
+                raise
 
-        except Exception as e:
-            print("Caught exception: %s" % repr(e))
-            print("Restarting\n")
+        # Save it on the disk
+        filename = "demo_output_1.wav"
+        print(generated_wav.dtype)
+        sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
+        print("\nSaved output as %s\n\n" % filename)
+
+
+    except Exception as e:
+        print("Caught exception: %s" % repr(e))
+        print("Restarting\n")