Flux9665 committed on
Commit ab12c36 · 1 Parent(s): 791a0ff

use explicit code instead of relying on release download

Files changed (2)
  1. Architectures/ControllabilityGAN/GAN.py +3 -2
  2. app.py +43 -35
Architectures/ControllabilityGAN/GAN.py CHANGED
@@ -3,9 +3,10 @@ import torch
 from Architectures.ControllabilityGAN.wgan.init_wgan import create_wgan


-class GanWrapper:
+class GanWrapper(torch.nn.Module):

-    def __init__(self, path_wgan, device):
+    def __init__(self, path_wgan, device, *args, **kwargs):
+        super().__init__(*args, **kwargs)
         self.device = device
         self.path_wgan = path_wgan

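Note on this change: subclassing torch.nn.Module matters for the app.py changes below, which call self.wgan.to("cpu") after construction. A plain Python class has no .to() method, while nn.Module propagates device moves to every registered sub-module and parameter. A minimal sketch of that behaviour, using a hypothetical generator sub-module in place of the network the real wrapper loads from path_wgan via create_wgan:

import torch

class GanWrapper(torch.nn.Module):

    def __init__(self, path_wgan, device, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.device = device
        self.path_wgan = path_wgan
        # hypothetical stand-in for the WGAN generator; because it is registered
        # on an nn.Module, wrapper.to(device) moves its weights as well
        self.generator = torch.nn.Linear(16, 16)

wrapper = GanWrapper("embedding_gan.pt", device="cpu")
wrapper.to("cpu")                         # only works because of the nn.Module base class
print(next(wrapper.parameters()).device)  # -> cpu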
app.py CHANGED
@@ -7,7 +7,6 @@ from run_model_downloader import download_models
 download_models()

 import gradio as gr
-import torch.cuda
 from Preprocessing.multilinguality.SimilaritySolver import load_json_from_path
 from Utility.utils import float2pcm

@@ -22,14 +21,20 @@ from Utility.storage_config import MODELS_DIR

 class ControllableInterface(torch.nn.Module):

+    @spaces.GPU
     def __init__(self, available_artificial_voices=1000):
         super().__init__()
-        self.model = ToucanTTSInterface(device="cpu", tts_model_path="Meta")
-        self.wgan = GanWrapper(os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"), device="cpu")
+        self.model = ToucanTTSInterface(device="cuda", tts_model_path="Meta")
+        self.wgan = GanWrapper(os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"), device="cuda")
         self.generated_speaker_embeds = list()
         self.available_artificial_voices = available_artificial_voices
         self.current_language = ""
         self.current_accent = ""
+        self.device = "cpu"
+        self.model.to("cpu")
+        self.model.device = "cpu"
+        self.wgan.to("cpu")
+        self.wgan.device = "cpu"

     def read(self,
              prompt,
@@ -114,13 +119,7 @@ class ControllableInterface(torch.nn.Module):
         return sr, wav, fig


-title = "Controllable Text-to-Speech for over 7000 Languages"
-article = "Check out the IMS Toucan TTS Toolkit at https://github.com/DigitalPhonetics/IMS-Toucan"
-available_artificial_voices = 1000
-path_to_iso_list = "Preprocessing/multilinguality/iso_to_fullname.json"
-iso_to_name = load_json_from_path(path_to_iso_list)
-text_selection = [f"{iso_to_name[iso_code]} Text ({iso_code})" for iso_code in iso_to_name]
-controllable_ui = ControllableInterface(available_artificial_voices=available_artificial_voices)
+


 @spaces.GPU
@@ -162,28 +161,37 @@ def read(prompt,
     return (sr, float2pcm(wav)), fig


-iface = gr.Interface(fn=read,
-                     inputs=[gr.Textbox(lines=2,
-                                        placeholder="write what you want the synthesis to read here...",
-                                        value="The woods are lovely, dark and deep, but I have promises to keep, and miles to go, before I sleep.",
-                                        label="Text input"),
-                             gr.Dropdown(text_selection,
-                                         type="value",
-                                         value='English Text (eng)',
-                                         label="Select the Language of the Text (type on your keyboard to find it quickly)"),
-                             gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
-                                       value=279,
-                                       label="Random Seed for the artificial Voice"),
-                             gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Duration Scale"),
-                             gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
-                             gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
-                             gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Femininity / Masculinity"),
-                             gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth")
-                             ],
-                     outputs=[gr.Audio(type="numpy", label="Speech"),
-                              gr.Image(label="Visualization")],
-                     title=title,
-                     theme="default",
-                     allow_flagging="never",
-                     article=article)
-iface.launch()
+if __name__ == '__main__':
+    title = "Controllable Text-to-Speech for over 7000 Languages"
+    article = "Check out the IMS Toucan TTS Toolkit at https://github.com/DigitalPhonetics/IMS-Toucan"
+    available_artificial_voices = 1000
+    path_to_iso_list = "Preprocessing/multilinguality/iso_to_fullname.json"
+    iso_to_name = load_json_from_path(path_to_iso_list)
+    text_selection = [f"{iso_to_name[iso_code]} Text ({iso_code})" for iso_code in iso_to_name]
+    controllable_ui = ControllableInterface(available_artificial_voices=available_artificial_voices)
+
+    iface = gr.Interface(fn=read,
+                         inputs=[gr.Textbox(lines=2,
+                                            placeholder="write what you want the synthesis to read here...",
+                                            value="The woods are lovely, dark and deep, but I have promises to keep, and miles to go, before I sleep.",
+                                            label="Text input"),
+                                 gr.Dropdown(text_selection,
+                                             type="value",
+                                             value='English Text (eng)',
+                                             label="Select the Language of the Text (type on your keyboard to find it quickly)"),
+                                 gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
+                                           value=279,
+                                           label="Random Seed for the artificial Voice"),
+                                 gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Duration Scale"),
+                                 gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
+                                 gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
+                                 gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Femininity / Masculinity"),
+                                 gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth")
+                                 ],
+                         outputs=[gr.Audio(type="numpy", label="Speech"),
+                                  gr.Image(label="Visualization")],
+                         title=title,
+                         theme="default",
+                         allow_flagging="never",
+                         article=article)
+    iface.launch()
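Note on the __init__ changes: they follow the ZeroGPU pattern of Hugging Face Spaces, where the @spaces.GPU decorator requests a GPU only while the decorated call runs. The models are therefore built with device="cuda" inside that window and then parked on the CPU, so the Space does not hold a GPU while idle. A minimal sketch of the idea, with load_model as a hypothetical stand-in for ToucanTTSInterface and GanWrapper:

import spaces   # Hugging Face Spaces package providing the ZeroGPU decorator
import torch

def load_model(device):
    # hypothetical stand-in for ToucanTTSInterface / GanWrapper
    return torch.nn.Linear(16, 16).to(device)

class ControllableInterface(torch.nn.Module):

    @spaces.GPU                                 # a GPU is only attached while __init__ runs
    def __init__(self):
        super().__init__()
        self.model = load_model(device="cuda")  # build on the GPU while it is available
        self.model.to("cpu")                    # park on CPU so the idle Space releases the GPU
        self.device = "cpu"

The extra self.model.device = "cpu" and self.wgan.device = "cpu" assignments in the diff presumably keep the toolkit's own device bookkeeping in sync with where the tensors actually live after the move.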
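The Gradio setup is also moved under an if __name__ == '__main__': guard, so importing app.py from another module only runs the model download and class definitions, while the interface is built and launched only when the file is executed directly. A stripped-down sketch of that structure (the echo-style read function here is purely illustrative):

import gradio as gr

def read(prompt):
    # placeholder for the real synthesis call
    return f"would synthesize: {prompt}"

if __name__ == '__main__':
    iface = gr.Interface(fn=read,
                         inputs=gr.Textbox(lines=2, label="Text input"),
                         outputs=gr.Textbox(label="Result"))
    iface.launch()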