use gpu

Files changed:
- app.py +4 -2
- gaussian_diffusion.py +16 -8
- sample.py +21 -13
- synthesize.py +3 -2
app.py
CHANGED
@@ -1,9 +1,11 @@
-import spaces
 import gradio as gr
-import torch
 import numpy as np
+import spaces
+import torch
+
 from synthesize import synthesize
 
+
 @spaces.GPU
 def text_to_speech(text, speaker_id, cfg_scale, num_sampling_steps):
     audio, sample_rate = synthesize(
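Note: on ZeroGPU Spaces, the `@spaces.GPU` decorator is what actually attaches a GPU to the decorated call, and the docs recommend importing `spaces` near the top of the file, before CUDA is initialized. A minimal standalone sketch of the pattern this app now follows (the `duration` argument and the toy interface are illustrative assumptions, not part of this commit):

import spaces  # ZeroGPU helper; import early, before CUDA is initialized
import gradio as gr
import torch


@spaces.GPU(duration=60)  # illustrative: request a GPU for up to 60 s per call
def describe(text: str) -> str:
    # CUDA is only guaranteed to be attached inside the decorated function.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return f"[{device}] {text}"


gr.Interface(fn=describe, inputs="text", outputs="text").launch()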
gaussian_diffusion.py
CHANGED
@@ -202,22 +202,30 @@ class GaussianDiffusion:
         )
 
         # convert all numpy arrays to torch tensors
-        DEVICE = th.device("cuda") if th.cuda.is_available() else th.device("cpu")
+        DEVICE = th.device("cuda") # if th.cuda.is_available() else th.device("cpu")
         self.betas = th.from_numpy(self.betas).to(DEVICE)
         self.alphas_cumprod = th.from_numpy(self.alphas_cumprod).to(DEVICE)
         self.alphas_cumprod_prev = th.from_numpy(self.alphas_cumprod_prev).to(DEVICE)
         self.alphas_cumprod_next = th.from_numpy(self.alphas_cumprod_next).to(DEVICE)
         self.sqrt_alphas_cumprod = th.from_numpy(self.sqrt_alphas_cumprod).to(DEVICE)
-        self.sqrt_one_minus_alphas_cumprod = th.from_numpy(self.sqrt_one_minus_alphas_cumprod).to(DEVICE)
-        self.log_one_minus_alphas_cumprod = th.from_numpy(self.log_one_minus_alphas_cumprod).to(DEVICE)
-        self.sqrt_recip_alphas_cumprod = th.from_numpy(self.sqrt_recip_alphas_cumprod).to(DEVICE)
-        self.sqrt_recipm1_alphas_cumprod = th.from_numpy(self.sqrt_recipm1_alphas_cumprod).to(DEVICE)
+        self.sqrt_one_minus_alphas_cumprod = th.from_numpy(
+            self.sqrt_one_minus_alphas_cumprod
+        ).to(DEVICE)
+        self.log_one_minus_alphas_cumprod = th.from_numpy(
+            self.log_one_minus_alphas_cumprod
+        ).to(DEVICE)
+        self.sqrt_recip_alphas_cumprod = th.from_numpy(
+            self.sqrt_recip_alphas_cumprod
+        ).to(DEVICE)
+        self.sqrt_recipm1_alphas_cumprod = th.from_numpy(
+            self.sqrt_recipm1_alphas_cumprod
+        ).to(DEVICE)
         self.posterior_variance = th.from_numpy(self.posterior_variance).to(DEVICE)
-        self.posterior_log_variance_clipped = th.from_numpy(self.posterior_log_variance_clipped).to(DEVICE)
+        self.posterior_log_variance_clipped = th.from_numpy(
+            self.posterior_log_variance_clipped
+        ).to(DEVICE)
         self.posterior_mean_coef1 = th.from_numpy(self.posterior_mean_coef1).to(DEVICE)
         self.posterior_mean_coef2 = th.from_numpy(self.posterior_mean_coef2).to(DEVICE)
-
-
 
     def q_mean_variance(self, x_start, t):
         """
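Note: pre-converting the whole noise schedule to device tensors in `__init__` means every later per-step lookup indexes GPU memory directly instead of round-tripping through numpy on each sampling step. A minimal sketch of the kind of gather that benefits, modeled on the guided-diffusion-style `_extract_into_tensor` helper (the exact helper in this file may differ):

import torch as th

def extract_into_tensor(arr: th.Tensor, timesteps: th.Tensor, broadcast_shape) -> th.Tensor:
    # arr: 1-D schedule tensor already moved to DEVICE in __init__;
    # timesteps: (batch,) int64 tensor of step indices on the same device.
    res = arr[timesteps].float()
    while res.dim() < len(broadcast_shape):
        res = res[..., None]  # add trailing singleton dims for broadcasting
    return res.expand(broadcast_shape)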
sample.py
CHANGED
@@ -81,7 +81,7 @@ def get_data(config_path, seed=0):
 
     data_config = config["data"]
     model_config = config["model"]
-    device = "cuda" if torch.cuda.is_available() else "cpu"
+    device = "cuda" # if torch.cuda.is_available() else "cpu"
 
     x, speaker_id, phone, phone_kind = get_batch(
         seed,
@@ -143,6 +143,9 @@ def plot_samples(samples, x):
     plt.close()
 
 
+model_cache = {}
+
+
 def sample(
     config_path,
     ckpt_path,
@@ -153,9 +156,10 @@ def sample(
     phone=None,
     phone_kind=None,
 ):
+    global model_cache
    torch.manual_seed(seed)
    torch.set_grad_enabled(False)
-    device = "cuda" if torch.cuda.is_available() else "cpu"
+    device = "cuda" # if torch.cuda.is_available() else "cpu"
 
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
@@ -163,17 +167,21 @@ def sample(
     data_config = config["data"]
     model_config = config["model"]
 
-    # Load model:
-    model = DiT_models[model_config["name"]](
-        input_size=model_config["input_size"],
-        embedding_vocab_size=model_config["embedding_vocab_size"],
-        learn_sigma=model_config["learn_sigma"],
-        in_channels=data_config["data_dim"],
-    ).to(device)
-
-    state_dict = find_model(ckpt_path)
-    model.load_state_dict(state_dict)
-    model.eval()  # important!
+    if ckpt_path not in model_cache:
+        # Load model:
+        model = DiT_models[model_config["name"]](
+            input_size=model_config["input_size"],
+            embedding_vocab_size=model_config["embedding_vocab_size"],
+            learn_sigma=model_config["learn_sigma"],
+            in_channels=data_config["data_dim"],
+        ).to(device)
+
+        state_dict = find_model(ckpt_path)
+        model.load_state_dict(state_dict)
+        model.eval()  # important!
+        model_cache[ckpt_path] = model
+    else:
+        model = model_cache[ckpt_path]
     diffusion = create_diffusion(str(num_sampling_steps))
     n = 1
     z = torch.randn(n, data_config["data_dim"], speaker_id.shape[1], device=device)
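Note: the new `model_cache` keyed by checkpoint path means only the first request pays for building the DiT and loading weights; later requests with the same checkpoint reuse the resident model. Since the dict is mutated in place and never reassigned, the `global model_cache` statement is harmless but not strictly required. A minimal sketch of the same memoization, with hypothetical names:

import torch

_model_cache: dict[str, torch.nn.Module] = {}


def load_cached(ckpt_path: str, build_model, device: str = "cuda") -> torch.nn.Module:
    # Build, load, and freeze the model once per checkpoint; reuse it afterwards.
    if ckpt_path not in _model_cache:
        model = build_model().to(device)
        model.load_state_dict(torch.load(ckpt_path, map_location=device))
        model.eval()  # disable dropout / use running batch-norm stats
        _model_cache[ckpt_path] = model
    return _model_cache[ckpt_path]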
synthesize.py
CHANGED
@@ -6,11 +6,12 @@ import json
 import os
 
 os.environ["NLTK_DATA"] = "nltk_data"
+import soundfile as sf
 import torch
 import yaml
 from g2p_en import G2p
-import soundfile as sf
 from vocos import Vocos
+
 from sample import sample
 
 
@@ -116,7 +117,7 @@ def synthesize(
     print("Phonemes:", phonemes)
 
     # Step 2: Duration prediction
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    device = torch.device("cuda") # if torch.cuda.is_available() else "cpu")
     torch_phoneme_indices = torch.tensor(phoneme_indices)[None, :].long().to(device)
     torch_speaker_id = torch.full_like(torch_phoneme_indices, int(speaker_id))
     torch_phone_kind_indices = (
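Note: the same edit, hardcoding "cuda" and commenting out the `torch.cuda.is_available()` fallback, appears in all four files, presumably so everything targets the GPU that `@spaces.GPU` attaches rather than silently falling back to CPU. If local CPU debugging matters again later, one hypothetical replacement for the four copies is a single helper with an explicit override (illustrative, not in this commit):

import os
import torch


def pick_device() -> torch.device:
    # Default to the Space's GPU; allow e.g. `FORCE_CPU=1 python synthesize.py`
    # to opt back into CPU for local debugging.
    if os.environ.get("FORCE_CPU") == "1":
        return torch.device("cpu")
    return torch.device("cuda")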