Spaces:

ntt123
/

diffusion-speech-360h

Sleeping

App Files Community

ntt123 committed on Dec 18, 2024

Commit

1e2a90a

1 Parent(s): 3fbb4b8

use gpu

Browse files

Files changed (4) hide show

app.py +4 -2
gaussian_diffusion.py +16 -8
sample.py +21 -13
synthesize.py +3 -2

app.py CHANGED Viewed

@@ -1,9 +1,11 @@
-import spaces
 import gradio as gr
-import torch
 import numpy as np
 from synthesize import synthesize
 @spaces.GPU
 def text_to_speech(text, speaker_id, cfg_scale, num_sampling_steps):
     audio, sample_rate = synthesize(

 import gradio as gr
 import numpy as np
+import spaces
+import torch
 from synthesize import synthesize
 @spaces.GPU
 def text_to_speech(text, speaker_id, cfg_scale, num_sampling_steps):
     audio, sample_rate = synthesize(

gaussian_diffusion.py CHANGED Viewed

@@ -202,22 +202,30 @@ class GaussianDiffusion:
         )
         # convert all numpy arrays to torch tensors
-        DEVICE = th.device("cuda") if th.cuda.is_available() else th.device("cpu")
         self.betas = th.from_numpy(self.betas).to(DEVICE)
         self.alphas_cumprod = th.from_numpy(self.alphas_cumprod).to(DEVICE)
         self.alphas_cumprod_prev = th.from_numpy(self.alphas_cumprod_prev).to(DEVICE)
         self.alphas_cumprod_next = th.from_numpy(self.alphas_cumprod_next).to(DEVICE)
         self.sqrt_alphas_cumprod = th.from_numpy(self.sqrt_alphas_cumprod).to(DEVICE)
-        self.sqrt_one_minus_alphas_cumprod = th.from_numpy(self.sqrt_one_minus_alphas_cumprod).to(DEVICE)
-        self.log_one_minus_alphas_cumprod = th.from_numpy(self.log_one_minus_alphas_cumprod).to(DEVICE)
-        self.sqrt_recip_alphas_cumprod = th.from_numpy(self.sqrt_recip_alphas_cumprod).to(DEVICE)
-        self.sqrt_recipm1_alphas_cumprod = th.from_numpy(self.sqrt_recipm1_alphas_cumprod).to(DEVICE)
         self.posterior_variance = th.from_numpy(self.posterior_variance).to(DEVICE)
-        self.posterior_log_variance_clipped = th.from_numpy(self.posterior_log_variance_clipped).to(DEVICE)
         self.posterior_mean_coef1 = th.from_numpy(self.posterior_mean_coef1).to(DEVICE)
         self.posterior_mean_coef2 = th.from_numpy(self.posterior_mean_coef2).to(DEVICE)
     def q_mean_variance(self, x_start, t):
         """

         )
         # convert all numpy arrays to torch tensors
+        DEVICE = th.device("cuda")  # if th.cuda.is_available() else th.device("cpu")
         self.betas = th.from_numpy(self.betas).to(DEVICE)
         self.alphas_cumprod = th.from_numpy(self.alphas_cumprod).to(DEVICE)
         self.alphas_cumprod_prev = th.from_numpy(self.alphas_cumprod_prev).to(DEVICE)
         self.alphas_cumprod_next = th.from_numpy(self.alphas_cumprod_next).to(DEVICE)
         self.sqrt_alphas_cumprod = th.from_numpy(self.sqrt_alphas_cumprod).to(DEVICE)
+        self.sqrt_one_minus_alphas_cumprod = th.from_numpy(
+            self.sqrt_one_minus_alphas_cumprod
+        ).to(DEVICE)
+        self.log_one_minus_alphas_cumprod = th.from_numpy(
+            self.log_one_minus_alphas_cumprod
+        ).to(DEVICE)
+        self.sqrt_recip_alphas_cumprod = th.from_numpy(
+            self.sqrt_recip_alphas_cumprod
+        ).to(DEVICE)
+        self.sqrt_recipm1_alphas_cumprod = th.from_numpy(
+            self.sqrt_recipm1_alphas_cumprod
+        ).to(DEVICE)
         self.posterior_variance = th.from_numpy(self.posterior_variance).to(DEVICE)
+        self.posterior_log_variance_clipped = th.from_numpy(
+            self.posterior_log_variance_clipped
+        ).to(DEVICE)
         self.posterior_mean_coef1 = th.from_numpy(self.posterior_mean_coef1).to(DEVICE)
         self.posterior_mean_coef2 = th.from_numpy(self.posterior_mean_coef2).to(DEVICE)
     def q_mean_variance(self, x_start, t):
         """

sample.py CHANGED Viewed

@@ -81,7 +81,7 @@ def get_data(config_path, seed=0):
     data_config = config["data"]
     model_config = config["model"]
-    device = "cuda" if torch.cuda.is_available() else "cpu"
     x, speaker_id, phone, phone_kind = get_batch(
         seed,
@@ -143,6 +143,9 @@ def plot_samples(samples, x):
     plt.close()
 def sample(
     config_path,
     ckpt_path,
@@ -153,9 +156,10 @@ def sample(
     phone=None,
     phone_kind=None,
 ):
     torch.manual_seed(seed)
     torch.set_grad_enabled(False)
-    device = "cuda" if torch.cuda.is_available() else "cpu"
     with open(config_path, "r") as f:
         config = yaml.safe_load(f)
@@ -163,17 +167,21 @@ def sample(
     data_config = config["data"]
     model_config = config["model"]
-    # Load model:
-    model = DiT_models[model_config["name"]](
-        input_size=model_config["input_size"],
-        embedding_vocab_size=model_config["embedding_vocab_size"],
-        learn_sigma=model_config["learn_sigma"],
-        in_channels=data_config["data_dim"],
-    ).to(device)
-    state_dict = find_model(ckpt_path)
-    model.load_state_dict(state_dict)
-    model.eval()  # important!
     diffusion = create_diffusion(str(num_sampling_steps))
     n = 1
     z = torch.randn(n, data_config["data_dim"], speaker_id.shape[1], device=device)

     data_config = config["data"]
     model_config = config["model"]
+    device = "cuda"  # if torch.cuda.is_available() else "cpu"
     x, speaker_id, phone, phone_kind = get_batch(
         seed,
     plt.close()
+model_cache = {}
 def sample(
     config_path,
     ckpt_path,
     phone=None,
     phone_kind=None,
 ):
+    global model_cache
     torch.manual_seed(seed)
     torch.set_grad_enabled(False)
+    device = "cuda"  # if torch.cuda.is_available() else "cpu"
     with open(config_path, "r") as f:
         config = yaml.safe_load(f)
     data_config = config["data"]
     model_config = config["model"]
+    if ckpt_path not in model_cache:
+        # Load model:
+        model = DiT_models[model_config["name"]](
+            input_size=model_config["input_size"],
+            embedding_vocab_size=model_config["embedding_vocab_size"],
+            learn_sigma=model_config["learn_sigma"],
+            in_channels=data_config["data_dim"],
+        ).to(device)
+        state_dict = find_model(ckpt_path)
+        model.load_state_dict(state_dict)
+        model.eval()  # important!
+        model_cache[ckpt_path] = model
+    else:
+        model = model_cache[ckpt_path]
     diffusion = create_diffusion(str(num_sampling_steps))
     n = 1
     z = torch.randn(n, data_config["data_dim"], speaker_id.shape[1], device=device)

synthesize.py CHANGED Viewed

@@ -6,11 +6,12 @@ import json
 import os
 os.environ["NLTK_DATA"] = "nltk_data"
 import torch
 import yaml
 from g2p_en import G2p
-import soundfile as sf
 from vocos import Vocos
 from sample import sample
@@ -116,7 +117,7 @@ def synthesize(
     print("Phonemes:", phonemes)
     # Step 2: Duration prediction
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     torch_phoneme_indices = torch.tensor(phoneme_indices)[None, :].long().to(device)
     torch_speaker_id = torch.full_like(torch_phoneme_indices, int(speaker_id))
     torch_phone_kind_indices = (

 import os
 os.environ["NLTK_DATA"] = "nltk_data"
+import soundfile as sf
 import torch
 import yaml
 from g2p_en import G2p
 from vocos import Vocos
 from sample import sample
     print("Phonemes:", phonemes)
     # Step 2: Duration prediction
+    device = torch.device("cuda")  #  if torch.cuda.is_available() else "cpu")
     torch_phoneme_indices = torch.tensor(phoneme_indices)[None, :].long().to(device)
     torch_speaker_id = torch.full_like(torch_phoneme_indices, int(speaker_id))
     torch_phone_kind_indices = (