Spaces:

akhaliq
/

Real-Time-Voice-Cloning

Runtime error

App Files Files Community

Ahsen Khaliq committed on Sep 30, 2021

Commit

86ac495

•

1 Parent(s): 46b8fc3

Update demo_cli.py

Browse files

Files changed (1) hide show

demo_cli.py +74 -80

demo_cli.py CHANGED Viewed

@@ -132,89 +132,83 @@ if __name__ == '__main__':
     print("Interactive generation loop")
     # while True:
-    try:
-        # Get the reference audio filepath
-        message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, "                   "wav, m4a, flac, ...):\n"
-        in_fpath = args.audio_path
-        if in_fpath.suffix.lower() == ".mp3" and args.no_mp3_support:
-            print("Can't Use mp3 files please try again:")
-        ## Computing the embedding
-        # First, we load the wav using the function that the speaker encoder provides. This is
-        # important: there is preprocessing that must be applied.
-        # The following two methods are equivalent:
-        # - Directly load from the filepath:
-        preprocessed_wav = encoder.preprocess_wav(in_fpath)
-        # - If the wav is already loaded:
-        original_wav, sampling_rate = librosa.load(str(in_fpath))
-        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
-        print("Loaded file succesfully")
-        # Then we derive the embedding. There are many functions and parameters that the
-        # speaker encoder interfaces. These are mostly for in-depth research. You will typically
-        # only use this function (with its default parameters):
-        embed = encoder.embed_utterance(preprocessed_wav)
-        print("Created the embedding")
-        ## Generating the spectrogram
-        text = args.text
-        # If seed is specified, reset torch seed and force synthesizer reload
-        if args.seed is not None:
-            torch.manual_seed(args.seed)
-            synthesizer = Synthesizer(args.syn_model_fpath)
-        # The synthesizer works in batch, so you need to put your data in a list or numpy array
-        texts = [text]
-        embeds = [embed]
-        # If you know what the attention layer alignments are, you can retrieve them here by
-        # passing return_alignments=True
-        specs = synthesizer.synthesize_spectrograms(texts, embeds)
-        spec = specs[0]
-        print("Created the mel spectrogram")
-        ## Generating the waveform
-        print("Synthesizing the waveform:")
-        # If seed is specified, reset torch seed and reload vocoder
-        if args.seed is not None:
-            torch.manual_seed(args.seed)
-            vocoder.load_model(args.voc_model_fpath)
-        # Synthesizing the waveform is fairly straightforward. Remember that the longer the
-        # spectrogram, the more time-efficient the vocoder.
-        generated_wav = vocoder.infer_waveform(spec)
-        ## Post-generation
-        # There's a bug with sounddevice that makes the audio cut one second earlier, so we
-        # pad it.
-        generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
-        # Trim excess silences to compensate for gaps in spectrograms (issue #53)
-        generated_wav = encoder.preprocess_wav(generated_wav)
-        # Play the audio (non-blocking)
-        if not args.no_sound:
-            try:
-                sd.stop()
-                sd.play(generated_wav, synthesizer.sample_rate)
-            except sd.PortAudioError as e:
-                print("\nCaught exception: %s" % repr(e))
-                print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")
-            except:
-                raise
-        # Save it on the disk
-        filename = "demo_output_1.wav"
-        print(generated_wav.dtype)
-        sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
-        print("\nSaved output as %s\n\n" % filename)
-    except Exception as e:
-        print("Caught exception: %s" % repr(e))
-        print("Restarting\n")

     print("Interactive generation loop")
     # while True:
+    # Get the reference audio filepath
+    message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, "                   "wav, m4a, flac, ...):\n"
+    in_fpath = args.audio_path
+    if in_fpath.suffix.lower() == ".mp3" and args.no_mp3_support:
+        print("Can't Use mp3 files please try again:")
+    ## Computing the embedding
+    # First, we load the wav using the function that the speaker encoder provides. This is
+    # important: there is preprocessing that must be applied.
+    # The following two methods are equivalent:
+    # - Directly load from the filepath:
+    preprocessed_wav = encoder.preprocess_wav(in_fpath)
+    # - If the wav is already loaded:
+    original_wav, sampling_rate = librosa.load(str(in_fpath))
+    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
+    print("Loaded file succesfully")
+    # Then we derive the embedding. There are many functions and parameters that the
+    # speaker encoder interfaces. These are mostly for in-depth research. You will typically
+    # only use this function (with its default parameters):
+    embed = encoder.embed_utterance(preprocessed_wav)
+    print("Created the embedding")
+    ## Generating the spectrogram
+    text = args.text
+    # If seed is specified, reset torch seed and force synthesizer reload
+    if args.seed is not None:
+        torch.manual_seed(args.seed)
+        synthesizer = Synthesizer(args.syn_model_fpath)
+    # The synthesizer works in batch, so you need to put your data in a list or numpy array
+    texts = [text]
+    embeds = [embed]
+    # If you know what the attention layer alignments are, you can retrieve them here by
+    # passing return_alignments=True
+    specs = synthesizer.synthesize_spectrograms(texts, embeds)
+    spec = specs[0]
+    print("Created the mel spectrogram")
+    ## Generating the waveform
+    print("Synthesizing the waveform:")
+    # If seed is specified, reset torch seed and reload vocoder
+    if args.seed is not None:
+        torch.manual_seed(args.seed)
+        vocoder.load_model(args.voc_model_fpath)
+    # Synthesizing the waveform is fairly straightforward. Remember that the longer the
+    # spectrogram, the more time-efficient the vocoder.
+    generated_wav = vocoder.infer_waveform(spec)
+    ## Post-generation
+    # There's a bug with sounddevice that makes the audio cut one second earlier, so we
+    # pad it.
+    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
+    # Trim excess silences to compensate for gaps in spectrograms (issue #53)
+    generated_wav = encoder.preprocess_wav(generated_wav)
+    # Play the audio (non-blocking)
+    if not args.no_sound:
+        try:
+            sd.stop()
+            sd.play(generated_wav, synthesizer.sample_rate)
+        except sd.PortAudioError as e:
+            print("\nCaught exception: %s" % repr(e))
+            print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")
+        except:
+            raise
+    # Save it on the disk
+    filename = "demo_output_1.wav"
+    print(generated_wav.dtype)
+    sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
+    print("\nSaved output as %s\n\n" % filename)