Flux9665 committed on
Commit ab12c36 · 1 Parent(s): 791a0ff

use explicit code instead of relying on release download

Files changed (2)
  1. Architectures/ControllabilityGAN/GAN.py +3 -2
  2. app.py +43 -35
Architectures/ControllabilityGAN/GAN.py CHANGED
@@ -3,9 +3,10 @@ import torch
 from Architectures.ControllabilityGAN.wgan.init_wgan import create_wgan


-class GanWrapper:
+class GanWrapper(torch.nn.Module):

-    def __init__(self, path_wgan, device):
+    def __init__(self, path_wgan, device, *args, **kwargs):
+        super().__init__(*args, **kwargs)
         self.device = device
         self.path_wgan = path_wgan

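Note on this change: subclassing torch.nn.Module matters for the app.py changes below, which call self.wgan.to("cpu") after construction. A plain Python class has no .to() method, while nn.Module propagates device moves to every registered sub-module and parameter. A minimal sketch of that behaviour, using a hypothetical generator sub-module in place of the network the real wrapper loads from path_wgan via create_wgan:

import torch

class GanWrapper(torch.nn.Module):

    def __init__(self, path_wgan, device, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.device = device
        self.path_wgan = path_wgan
        # hypothetical stand-in for the WGAN generator; because it is registered
        # on an nn.Module, wrapper.to(device) moves its weights as well
        self.generator = torch.nn.Linear(16, 16)

wrapper = GanWrapper("embedding_gan.pt", device="cpu")
wrapper.to("cpu")                         # only works because of the nn.Module base class
print(next(wrapper.parameters()).device)  # -> cpu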
app.py CHANGED
@@ -7,7 +7,6 @@ from run_model_downloader import download_models
 download_models()

 import gradio as gr
-import torch.cuda
 from Preprocessing.multilinguality.SimilaritySolver import load_json_from_path
 from Utility.utils import float2pcm

@@ -22,14 +21,20 @@ from Utility.storage_config import MODELS_DIR

 class ControllableInterface(torch.nn.Module):

+    @spaces.GPU
     def __init__(self, available_artificial_voices=1000):
         super().__init__()
-        self.model = ToucanTTSInterface(device="cpu", tts_model_path="Meta")
-        self.wgan = GanWrapper(os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"), device="cpu")
+        self.model = ToucanTTSInterface(device="cuda", tts_model_path="Meta")
+        self.wgan = GanWrapper(os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"), device="cuda")
         self.generated_speaker_embeds = list()
         self.available_artificial_voices = available_artificial_voices
         self.current_language = ""
         self.current_accent = ""
+        self.device = "cpu"
+        self.model.to("cpu")
+        self.model.device = "cpu"
+        self.wgan.to("cpu")
+        self.wgan.device = "cpu"

     def read(self,
              prompt,
@@ -114,13 +119,7 @@ class ControllableInterface(torch.nn.Module):
         return sr, wav, fig


-title = "Controllable Text-to-Speech for over 7000 Languages"
-article = "Check out the IMS Toucan TTS Toolkit at https://github.com/DigitalPhonetics/IMS-Toucan"
-available_artificial_voices = 1000
-path_to_iso_list = "Preprocessing/multilinguality/iso_to_fullname.json"
-iso_to_name = load_json_from_path(path_to_iso_list)
-text_selection = [f"{iso_to_name[iso_code]} Text ({iso_code})" for iso_code in iso_to_name]
-controllable_ui = ControllableInterface(available_artificial_voices=available_artificial_voices)
+


 @spaces.GPU
@@ -162,28 +161,37 @@ def read(prompt,
     return (sr, float2pcm(wav)), fig


-iface = gr.Interface(fn=read,
-                     inputs=[gr.Textbox(lines=2,
-                                        placeholder="write what you want the synthesis to read here...",
-                                        value="The woods are lovely, dark and deep, but I have promises to keep, and miles to go, before I sleep.",
-                                        label="Text input"),
-                             gr.Dropdown(text_selection,
-                                         type="value",
-                                         value='English Text (eng)',
-                                         label="Select the Language of the Text (type on your keyboard to find it quickly)"),
-                             gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
-                                       value=279,
-                                       label="Random Seed for the artificial Voice"),
-                             gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Duration Scale"),
-                             gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
-                             gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
-                             gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Femininity / Masculinity"),
-                             gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth")
-                             ],
-                     outputs=[gr.Audio(type="numpy", label="Speech"),
-                              gr.Image(label="Visualization")],
-                     title=title,
-                     theme="default",
-                     allow_flagging="never",
-                     article=article)
-iface.launch()
+if __name__ == '__main__':
+    title = "Controllable Text-to-Speech for over 7000 Languages"
+    article = "Check out the IMS Toucan TTS Toolkit at https://github.com/DigitalPhonetics/IMS-Toucan"
+    available_artificial_voices = 1000
+    path_to_iso_list = "Preprocessing/multilinguality/iso_to_fullname.json"
+    iso_to_name = load_json_from_path(path_to_iso_list)
+    text_selection = [f"{iso_to_name[iso_code]} Text ({iso_code})" for iso_code in iso_to_name]
+    controllable_ui = ControllableInterface(available_artificial_voices=available_artificial_voices)
+
+    iface = gr.Interface(fn=read,
+                         inputs=[gr.Textbox(lines=2,
+                                            placeholder="write what you want the synthesis to read here...",
+                                            value="The woods are lovely, dark and deep, but I have promises to keep, and miles to go, before I sleep.",
+                                            label="Text input"),
+                                 gr.Dropdown(text_selection,
+                                             type="value",
+                                             value='English Text (eng)',
+                                             label="Select the Language of the Text (type on your keyboard to find it quickly)"),
+                                 gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
+                                           value=279,
+                                           label="Random Seed for the artificial Voice"),
+                                 gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Duration Scale"),
+                                 gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
+                                 gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
+                                 gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Femininity / Masculinity"),
+                                 gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth")
+                                 ],
+                         outputs=[gr.Audio(type="numpy", label="Speech"),
+                                  gr.Image(label="Visualization")],
+                         title=title,
+                         theme="default",
+                         allow_flagging="never",
+                         article=article)
+    iface.launch()
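Note on the __init__ changes: they follow the ZeroGPU pattern of Hugging Face Spaces, where the @spaces.GPU decorator requests a GPU only while the decorated call runs. The models are therefore built with device="cuda" inside that window and then parked on the CPU, so the Space does not hold a GPU while idle. A minimal sketch of the idea, with load_model as a hypothetical stand-in for ToucanTTSInterface and GanWrapper:

import spaces   # Hugging Face Spaces package providing the ZeroGPU decorator
import torch

def load_model(device):
    # hypothetical stand-in for ToucanTTSInterface / GanWrapper
    return torch.nn.Linear(16, 16).to(device)

class ControllableInterface(torch.nn.Module):

    @spaces.GPU                                 # a GPU is only attached while __init__ runs
    def __init__(self):
        super().__init__()
        self.model = load_model(device="cuda")  # build on the GPU while it is available
        self.model.to("cpu")                    # park on CPU so the idle Space releases the GPU
        self.device = "cpu"

The extra self.model.device = "cpu" and self.wgan.device = "cpu" assignments in the diff presumably keep the toolkit's own device bookkeeping in sync with where the tensors actually live after the move.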
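The Gradio setup is also moved under an if __name__ == '__main__': guard, so importing app.py from another module only runs the model download and class definitions, while the interface is built and launched only when the file is executed directly. A stripped-down sketch of that structure (the echo-style read function here is purely illustrative):

import gradio as gr

def read(prompt):
    # placeholder for the real synthesis call
    return f"would synthesize: {prompt}"

if __name__ == '__main__':
    iface = gr.Interface(fn=read,
                         inputs=gr.Textbox(lines=2, label="Text input"),
                         outputs=gr.Textbox(label="Result"))
    iface.launch()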