Spaces:

Flux9665
/

SpeechCloning

Running

App Files Files

Flux9665 committed on Feb 23, 2022

Commit

42616ca

•

1 Parent(s): 6c51fe8

Update app.py

Browse files

Files changed (1) hide show

app.py +10 -2

app.py CHANGED Viewed

@@ -83,7 +83,12 @@ class TTS_Interface:
                                                        clone_speaker_identity=False,
                                                        lang="en")
-        return "alignment.png", reference_audio, (48000, float2pcm(torch.cat([part_1, part_2, part_3], dim=0).numpy()))
     def split_audio(self, path_to_audio, text_list):
         # extract audio
@@ -171,6 +176,9 @@ iface = gr.Interface(fn=meta_model.read,
                                              "Voice 3"], type="value", default="Voice 3", label="Speaker selection for the third sentence")],
                      outputs=[gr.outputs.Image(label="Alignment of Phonemes to Audio"),
                               gr.outputs.Audio(type="file", label="Original Audio"),
                               gr.outputs.Audio(type="numpy", label="Customized Audio")],
                      layout="vertical",
                      title="IMS Toucan Speech Customization through Voice Cloning Demo",
@@ -178,6 +186,6 @@ iface = gr.Interface(fn=meta_model.read,
                      theme="default",
                      allow_flagging="never",
                      allow_screenshot=False,
-                     description="In this demo, an audio is split automatically into individual sentences. Then each of the sentences is re-synthesized into speech with the exact same prosody, but with a voice that you can choose. This allows customizing any existing read speech while retaining as much from the original reading as possible.",
                      article=article)
 iface.launch(enable_queue=True)

                                                        clone_speaker_identity=False,
                                                        lang="en")
+        return "alignment.png", \
+               reference_audio, \
+               self.speaker_path_lookup["Voice 1"], \
+               self.speaker_path_lookup["Voice 2"], \
+               self.speaker_path_lookup["Voice 3"], \
+               (48000, float2pcm(torch.cat([part_1, part_2, part_3], dim=0).numpy()))
     def split_audio(self, path_to_audio, text_list):
         # extract audio
                                              "Voice 3"], type="value", default="Voice 3", label="Speaker selection for the third sentence")],
                      outputs=[gr.outputs.Image(label="Alignment of Phonemes to Audio"),
                               gr.outputs.Audio(type="file", label="Original Audio"),
+                              gr.outputs.Audio(type="file", label="Reference-Voice 1"),
+                              gr.outputs.Audio(type="file", label="Reference-Voice 2"),
+                              gr.outputs.Audio(type="file", label="Reference-Voice 3"),
                               gr.outputs.Audio(type="numpy", label="Customized Audio")],
                      layout="vertical",
                      title="IMS Toucan Speech Customization through Voice Cloning Demo",
                      theme="default",
                      allow_flagging="never",
                      allow_screenshot=False,
+                     description="In this demo, an audio is split automatically into individual sentences. Then each of the sentences is re-synthesized into speech with the exact same prosody, but with a voice that you can choose. This allows customizing any existing read speech while retaining as much from the original reading as possible. Unfortunately, we cannot show you the reference audio and the reference voices ahead of time, so they will be displayed together with the resulting cloned speech.",
                      article=article)
 iface.launch(enable_queue=True)