Spaces: Running on T4
use explicit code instead of relying on release download
Browse files
- Architectures/ControllabilityGAN/GAN.py +3 -2
- app.py +43 -35
Architectures/ControllabilityGAN/GAN.py
CHANGED
@@ -3,9 +3,10 @@ import torch
|
|
3 |
from Architectures.ControllabilityGAN.wgan.init_wgan import create_wgan
|
4 |
|
5 |
|
6 |
-
class GanWrapper:
|
7 |
|
8 |
-
def __init__(self, path_wgan, device):
|
|
|
9 |
self.device = device
|
10 |
self.path_wgan = path_wgan
|
11 |
|
|
|
3 |
from Architectures.ControllabilityGAN.wgan.init_wgan import create_wgan
|
4 |
|
5 |
|
6 |
+
class GanWrapper(torch.nn.Module):
|
7 |
|
8 |
+
def __init__(self, path_wgan, device, *args, **kwargs):
|
9 |
+
super().__init__(*args, **kwargs)
|
10 |
self.device = device
|
11 |
self.path_wgan = path_wgan
|
12 |
|
app.py
CHANGED
@@ -7,7 +7,6 @@ from run_model_downloader import download_models
|
|
7 |
download_models()
|
8 |
|
9 |
import gradio as gr
|
10 |
-
import torch.cuda
|
11 |
from Preprocessing.multilinguality.SimilaritySolver import load_json_from_path
|
12 |
from Utility.utils import float2pcm
|
13 |
|
@@ -22,14 +21,20 @@ from Utility.storage_config import MODELS_DIR
|
|
22 |
|
23 |
class ControllableInterface(torch.nn.Module):
|
24 |
|
|
|
25 |
def __init__(self, available_artificial_voices=1000):
|
26 |
super().__init__()
|
27 |
-
self.model = ToucanTTSInterface(device="
|
28 |
-
self.wgan = GanWrapper(os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"), device="
|
29 |
self.generated_speaker_embeds = list()
|
30 |
self.available_artificial_voices = available_artificial_voices
|
31 |
self.current_language = ""
|
32 |
self.current_accent = ""
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
def read(self,
|
35 |
prompt,
|
@@ -114,13 +119,7 @@ class ControllableInterface(torch.nn.Module):
|
|
114 |
return sr, wav, fig
|
115 |
|
116 |
|
117 |
-
|
118 |
-
article = "Check out the IMS Toucan TTS Toolkit at https://github.com/DigitalPhonetics/IMS-Toucan"
|
119 |
-
available_artificial_voices = 1000
|
120 |
-
path_to_iso_list = "Preprocessing/multilinguality/iso_to_fullname.json"
|
121 |
-
iso_to_name = load_json_from_path(path_to_iso_list)
|
122 |
-
text_selection = [f"{iso_to_name[iso_code]} Text ({iso_code})" for iso_code in iso_to_name]
|
123 |
-
controllable_ui = ControllableInterface(available_artificial_voices=available_artificial_voices)
|
124 |
|
125 |
|
126 |
@spaces.GPU
|
@@ -162,28 +161,37 @@ def read(prompt,
|
|
162 |
return (sr, float2pcm(wav)), fig
|
163 |
|
164 |
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
download_models()
|
8 |
|
9 |
import gradio as gr
|
|
|
10 |
from Preprocessing.multilinguality.SimilaritySolver import load_json_from_path
|
11 |
from Utility.utils import float2pcm
|
12 |
|
|
|
21 |
|
22 |
class ControllableInterface(torch.nn.Module):
|
23 |
|
24 |
+
@spaces.GPU
|
25 |
def __init__(self, available_artificial_voices=1000):
|
26 |
super().__init__()
|
27 |
+
self.model = ToucanTTSInterface(device="cuda", tts_model_path="Meta")
|
28 |
+
self.wgan = GanWrapper(os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"), device="cuda")
|
29 |
self.generated_speaker_embeds = list()
|
30 |
self.available_artificial_voices = available_artificial_voices
|
31 |
self.current_language = ""
|
32 |
self.current_accent = ""
|
33 |
+
self.device="cpu"
|
34 |
+
self.model.to("cpu")
|
35 |
+
self.model.device = "cpu"
|
36 |
+
self.wgan.to("cpu")
|
37 |
+
self.wgan.device = "cpu"
|
38 |
|
39 |
def read(self,
|
40 |
prompt,
|
|
|
119 |
return sr, wav, fig
|
120 |
|
121 |
|
122 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
|
124 |
|
125 |
@spaces.GPU
|
|
|
161 |
return (sr, float2pcm(wav)), fig
|
162 |
|
163 |
|
164 |
+
if __name__ == '__main__':
|
165 |
+
title = "Controllable Text-to-Speech for over 7000 Languages"
|
166 |
+
article = "Check out the IMS Toucan TTS Toolkit at https://github.com/DigitalPhonetics/IMS-Toucan"
|
167 |
+
available_artificial_voices = 1000
|
168 |
+
path_to_iso_list = "Preprocessing/multilinguality/iso_to_fullname.json"
|
169 |
+
iso_to_name = load_json_from_path(path_to_iso_list)
|
170 |
+
text_selection = [f"{iso_to_name[iso_code]} Text ({iso_code})" for iso_code in iso_to_name]
|
171 |
+
controllable_ui = ControllableInterface(available_artificial_voices=available_artificial_voices)
|
172 |
+
|
173 |
+
iface = gr.Interface(fn=read,
|
174 |
+
inputs=[gr.Textbox(lines=2,
|
175 |
+
placeholder="write what you want the synthesis to read here...",
|
176 |
+
value="The woods are lovely, dark and deep, but I have promises to keep, and miles to go, before I sleep.",
|
177 |
+
label="Text input"),
|
178 |
+
gr.Dropdown(text_selection,
|
179 |
+
type="value",
|
180 |
+
value='English Text (eng)',
|
181 |
+
label="Select the Language of the Text (type on your keyboard to find it quickly)"),
|
182 |
+
gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
|
183 |
+
value=279,
|
184 |
+
label="Random Seed for the artificial Voice"),
|
185 |
+
gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Duration Scale"),
|
186 |
+
gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
|
187 |
+
gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
|
188 |
+
gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Femininity / Masculinity"),
|
189 |
+
gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth")
|
190 |
+
],
|
191 |
+
outputs=[gr.Audio(type="numpy", label="Speech"),
|
192 |
+
gr.Image(label="Visualization")],
|
193 |
+
title=title,
|
194 |
+
theme="default",
|
195 |
+
allow_flagging="never",
|
196 |
+
article=article)
|
197 |
+
iface.launch()
|