ntt123 committed on
Commit
1e2a90a
·
1 Parent(s): 3fbb4b8
Files changed (4) hide show
  1. app.py +4 -2
  2. gaussian_diffusion.py +16 -8
  3. sample.py +21 -13
  4. synthesize.py +3 -2
app.py CHANGED
@@ -1,9 +1,11 @@
1
- import spaces
2
  import gradio as gr
3
- import torch
4
  import numpy as np
 
 
 
5
  from synthesize import synthesize
6
 
 
7
  @spaces.GPU
8
  def text_to_speech(text, speaker_id, cfg_scale, num_sampling_steps):
9
  audio, sample_rate = synthesize(
 
 
1
  import gradio as gr
 
2
  import numpy as np
3
+ import spaces
4
+ import torch
5
+
6
  from synthesize import synthesize
7
 
8
+
9
  @spaces.GPU
10
  def text_to_speech(text, speaker_id, cfg_scale, num_sampling_steps):
11
  audio, sample_rate = synthesize(
gaussian_diffusion.py CHANGED
@@ -202,22 +202,30 @@ class GaussianDiffusion:
202
  )
203
 
204
  # convert all numpy arrays to torch tensors
205
- DEVICE = th.device("cuda") if th.cuda.is_available() else th.device("cpu")
206
  self.betas = th.from_numpy(self.betas).to(DEVICE)
207
  self.alphas_cumprod = th.from_numpy(self.alphas_cumprod).to(DEVICE)
208
  self.alphas_cumprod_prev = th.from_numpy(self.alphas_cumprod_prev).to(DEVICE)
209
  self.alphas_cumprod_next = th.from_numpy(self.alphas_cumprod_next).to(DEVICE)
210
  self.sqrt_alphas_cumprod = th.from_numpy(self.sqrt_alphas_cumprod).to(DEVICE)
211
- self.sqrt_one_minus_alphas_cumprod = th.from_numpy(self.sqrt_one_minus_alphas_cumprod).to(DEVICE)
212
- self.log_one_minus_alphas_cumprod = th.from_numpy(self.log_one_minus_alphas_cumprod).to(DEVICE)
213
- self.sqrt_recip_alphas_cumprod = th.from_numpy(self.sqrt_recip_alphas_cumprod).to(DEVICE)
214
- self.sqrt_recipm1_alphas_cumprod = th.from_numpy(self.sqrt_recipm1_alphas_cumprod).to(DEVICE)
 
 
 
 
 
 
 
 
215
  self.posterior_variance = th.from_numpy(self.posterior_variance).to(DEVICE)
216
- self.posterior_log_variance_clipped = th.from_numpy(self.posterior_log_variance_clipped).to(DEVICE)
 
 
217
  self.posterior_mean_coef1 = th.from_numpy(self.posterior_mean_coef1).to(DEVICE)
218
  self.posterior_mean_coef2 = th.from_numpy(self.posterior_mean_coef2).to(DEVICE)
219
-
220
-
221
 
222
  def q_mean_variance(self, x_start, t):
223
  """
 
202
  )
203
 
204
  # convert all numpy arrays to torch tensors
205
+ DEVICE = th.device("cuda") # if th.cuda.is_available() else th.device("cpu")
206
  self.betas = th.from_numpy(self.betas).to(DEVICE)
207
  self.alphas_cumprod = th.from_numpy(self.alphas_cumprod).to(DEVICE)
208
  self.alphas_cumprod_prev = th.from_numpy(self.alphas_cumprod_prev).to(DEVICE)
209
  self.alphas_cumprod_next = th.from_numpy(self.alphas_cumprod_next).to(DEVICE)
210
  self.sqrt_alphas_cumprod = th.from_numpy(self.sqrt_alphas_cumprod).to(DEVICE)
211
+ self.sqrt_one_minus_alphas_cumprod = th.from_numpy(
212
+ self.sqrt_one_minus_alphas_cumprod
213
+ ).to(DEVICE)
214
+ self.log_one_minus_alphas_cumprod = th.from_numpy(
215
+ self.log_one_minus_alphas_cumprod
216
+ ).to(DEVICE)
217
+ self.sqrt_recip_alphas_cumprod = th.from_numpy(
218
+ self.sqrt_recip_alphas_cumprod
219
+ ).to(DEVICE)
220
+ self.sqrt_recipm1_alphas_cumprod = th.from_numpy(
221
+ self.sqrt_recipm1_alphas_cumprod
222
+ ).to(DEVICE)
223
  self.posterior_variance = th.from_numpy(self.posterior_variance).to(DEVICE)
224
+ self.posterior_log_variance_clipped = th.from_numpy(
225
+ self.posterior_log_variance_clipped
226
+ ).to(DEVICE)
227
  self.posterior_mean_coef1 = th.from_numpy(self.posterior_mean_coef1).to(DEVICE)
228
  self.posterior_mean_coef2 = th.from_numpy(self.posterior_mean_coef2).to(DEVICE)
 
 
229
 
230
  def q_mean_variance(self, x_start, t):
231
  """
sample.py CHANGED
@@ -81,7 +81,7 @@ def get_data(config_path, seed=0):
81
 
82
  data_config = config["data"]
83
  model_config = config["model"]
84
- device = "cuda" if torch.cuda.is_available() else "cpu"
85
 
86
  x, speaker_id, phone, phone_kind = get_batch(
87
  seed,
@@ -143,6 +143,9 @@ def plot_samples(samples, x):
143
  plt.close()
144
 
145
 
 
 
 
146
  def sample(
147
  config_path,
148
  ckpt_path,
@@ -153,9 +156,10 @@ def sample(
153
  phone=None,
154
  phone_kind=None,
155
  ):
 
156
  torch.manual_seed(seed)
157
  torch.set_grad_enabled(False)
158
- device = "cuda" if torch.cuda.is_available() else "cpu"
159
 
160
  with open(config_path, "r") as f:
161
  config = yaml.safe_load(f)
@@ -163,17 +167,21 @@ def sample(
163
  data_config = config["data"]
164
  model_config = config["model"]
165
 
166
- # Load model:
167
- model = DiT_models[model_config["name"]](
168
- input_size=model_config["input_size"],
169
- embedding_vocab_size=model_config["embedding_vocab_size"],
170
- learn_sigma=model_config["learn_sigma"],
171
- in_channels=data_config["data_dim"],
172
- ).to(device)
173
-
174
- state_dict = find_model(ckpt_path)
175
- model.load_state_dict(state_dict)
176
- model.eval() # important!
 
 
 
 
177
  diffusion = create_diffusion(str(num_sampling_steps))
178
  n = 1
179
  z = torch.randn(n, data_config["data_dim"], speaker_id.shape[1], device=device)
 
81
 
82
  data_config = config["data"]
83
  model_config = config["model"]
84
+ device = "cuda" # if torch.cuda.is_available() else "cpu"
85
 
86
  x, speaker_id, phone, phone_kind = get_batch(
87
  seed,
 
143
  plt.close()
144
 
145
 
146
+ model_cache = {}
147
+
148
+
149
  def sample(
150
  config_path,
151
  ckpt_path,
 
156
  phone=None,
157
  phone_kind=None,
158
  ):
159
+ global model_cache
160
  torch.manual_seed(seed)
161
  torch.set_grad_enabled(False)
162
+ device = "cuda" # if torch.cuda.is_available() else "cpu"
163
 
164
  with open(config_path, "r") as f:
165
  config = yaml.safe_load(f)
 
167
  data_config = config["data"]
168
  model_config = config["model"]
169
 
170
+ if ckpt_path not in model_cache:
171
+ # Load model:
172
+ model = DiT_models[model_config["name"]](
173
+ input_size=model_config["input_size"],
174
+ embedding_vocab_size=model_config["embedding_vocab_size"],
175
+ learn_sigma=model_config["learn_sigma"],
176
+ in_channels=data_config["data_dim"],
177
+ ).to(device)
178
+
179
+ state_dict = find_model(ckpt_path)
180
+ model.load_state_dict(state_dict)
181
+ model.eval() # important!
182
+ model_cache[ckpt_path] = model
183
+ else:
184
+ model = model_cache[ckpt_path]
185
  diffusion = create_diffusion(str(num_sampling_steps))
186
  n = 1
187
  z = torch.randn(n, data_config["data_dim"], speaker_id.shape[1], device=device)
synthesize.py CHANGED
@@ -6,11 +6,12 @@ import json
6
  import os
7
 
8
  os.environ["NLTK_DATA"] = "nltk_data"
 
9
  import torch
10
  import yaml
11
  from g2p_en import G2p
12
- import soundfile as sf
13
  from vocos import Vocos
 
14
  from sample import sample
15
 
16
 
@@ -116,7 +117,7 @@ def synthesize(
116
  print("Phonemes:", phonemes)
117
 
118
  # Step 2: Duration prediction
119
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
120
  torch_phoneme_indices = torch.tensor(phoneme_indices)[None, :].long().to(device)
121
  torch_speaker_id = torch.full_like(torch_phoneme_indices, int(speaker_id))
122
  torch_phone_kind_indices = (
 
6
  import os
7
 
8
  os.environ["NLTK_DATA"] = "nltk_data"
9
+ import soundfile as sf
10
  import torch
11
  import yaml
12
  from g2p_en import G2p
 
13
  from vocos import Vocos
14
+
15
  from sample import sample
16
 
17
 
 
117
  print("Phonemes:", phonemes)
118
 
119
  # Step 2: Duration prediction
120
+ device = torch.device("cuda") # if torch.cuda.is_available() else "cpu")
121
  torch_phoneme_indices = torch.tensor(phoneme_indices)[None, :].long().to(device)
122
  torch_speaker_id = torch.full_like(torch_phoneme_indices, int(speaker_id))
123
  torch_phone_kind_indices = (