VoiceCraft_gradio

Build error

App Files Files Community

cocktailpeanut committed on Apr 21, 2024

Commit

cb6da82

1 Parent(s): 579d79b

update

Browse files

Files changed (4) hide show

app.py +16 -11
inference_speech_editing_scale.py +9 -4
inference_tts_scale.py +11 -5
requirements.txt +3 -2

app.py CHANGED Viewed

@@ -11,31 +11,36 @@ import io
 import numpy as np
 import random
 import uuid
-import spaces
 DEMO_PATH = os.getenv("DEMO_PATH", "./demo")
 TMP_PATH = os.getenv("TMP_PATH", "./demo/temp")
 MODELS_PATH = os.getenv("MODELS_PATH", "./pretrained_models")
-device = "cuda" if torch.cuda.is_available() else "cpu"
 whisper_model, align_model, voicecraft_model = None, None, None
 def get_random_string():
     return "".join(str(uuid.uuid4()).split("-"))
-@spaces.GPU(duration=30)
 def seed_everything(seed):
     if seed != -1:
         os.environ['PYTHONHASHSEED'] = str(seed)
         random.seed(seed)
         np.random.seed(seed)
         torch.manual_seed(seed)
-        torch.cuda.manual_seed(seed)
         torch.backends.cudnn.benchmark = False
         torch.backends.cudnn.deterministic = True
-@spaces.GPU(duration=120)
 class WhisperxAlignModel:
     def __init__(self):
         from whisperx import load_align_model
@@ -46,7 +51,7 @@ class WhisperxAlignModel:
         audio = load_audio(audio_path)
         return align(segments, self.model, self.metadata, audio, device, return_char_alignments=False)["segments"]
-@spaces.GPU(duration=120)
 class WhisperModel:
     def __init__(self, model_name):
         from whisper import load_model
@@ -63,7 +68,7 @@ class WhisperModel:
     def transcribe(self, audio_path):
         return self.model.transcribe(audio_path, suppress_tokens=self.supress_tokens, word_timestamps=True)["segments"]
-@spaces.GPU(duration=120)
 class WhisperxModel:
     def __init__(self, model_name, align_model: WhisperxAlignModel):
         from whisperx import load_model
@@ -74,7 +79,7 @@ class WhisperxModel:
         segments = self.model.transcribe(audio_path, batch_size=8)["segments"]
         return self.align_model.align(segments, audio_path)
-@spaces.GPU(duration=120)
 def load_models(whisper_backend_name, whisper_model_name, alignment_model_name, voicecraft_model_name):
     global transcribe_model, align_model, voicecraft_model
@@ -123,7 +128,7 @@ def get_transcribe_state(segments):
         "word_bounds": [f"{word['start']} {word['word']} {word['end']}" for word in words_info]
     }
-@spaces.GPU(duration=60)
 def transcribe(seed, audio_path):
     if transcribe_model is None:
         raise gr.Error("Transcription model not loaded")
@@ -162,7 +167,7 @@ def align_segments(transcript, audio_path):
     with open(tmp_sync_map_path, "r") as f:
         return json.load(f)
-@spaces.GPU(duration=90)
 def align(seed, transcript, audio_path):
     if align_model is None:
         raise gr.Error("Align model not loaded")
@@ -193,7 +198,7 @@ def get_output_audio(audio_tensors, codec_audio_sr):
     buffer.seek(0)
     return buffer.read()
-@spaces.GPU(duration=90)
 def run(seed, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p, temperature,
         stop_repetition, sample_batch_size, kvcache, silence_tokens,
         audio_path, transcribe_state, transcript, smart_transcript,

 import numpy as np
 import random
 import uuid
+#import spaces
+import devicetorch
 DEMO_PATH = os.getenv("DEMO_PATH", "./demo")
 TMP_PATH = os.getenv("TMP_PATH", "./demo/temp")
 MODELS_PATH = os.getenv("MODELS_PATH", "./pretrained_models")
+#device = "cuda" if torch.cuda.is_available() else "cpu"
+device = devicetorch(torch)
 whisper_model, align_model, voicecraft_model = None, None, None
 def get_random_string():
     return "".join(str(uuid.uuid4()).split("-"))
+#@spaces.GPU(duration=30)
 def seed_everything(seed):
     if seed != -1:
         os.environ['PYTHONHASHSEED'] = str(seed)
         random.seed(seed)
         np.random.seed(seed)
         torch.manual_seed(seed)
+        if device == "cuda":
+            torch.cuda.manual_seed(seed)
+        elif device == "mps":
+            torch.mps.manual_seed(seed)
         torch.backends.cudnn.benchmark = False
         torch.backends.cudnn.deterministic = True
+#@spaces.GPU(duration=120)
 class WhisperxAlignModel:
     def __init__(self):
         from whisperx import load_align_model
         audio = load_audio(audio_path)
         return align(segments, self.model, self.metadata, audio, device, return_char_alignments=False)["segments"]
+#@spaces.GPU(duration=120)
 class WhisperModel:
     def __init__(self, model_name):
         from whisper import load_model
     def transcribe(self, audio_path):
         return self.model.transcribe(audio_path, suppress_tokens=self.supress_tokens, word_timestamps=True)["segments"]
+#@spaces.GPU(duration=120)
 class WhisperxModel:
     def __init__(self, model_name, align_model: WhisperxAlignModel):
         from whisperx import load_model
         segments = self.model.transcribe(audio_path, batch_size=8)["segments"]
         return self.align_model.align(segments, audio_path)
+#@spaces.GPU(duration=120)
 def load_models(whisper_backend_name, whisper_model_name, alignment_model_name, voicecraft_model_name):
     global transcribe_model, align_model, voicecraft_model
         "word_bounds": [f"{word['start']} {word['word']} {word['end']}" for word in words_info]
     }
+#@spaces.GPU(duration=60)
 def transcribe(seed, audio_path):
     if transcribe_model is None:
         raise gr.Error("Transcription model not loaded")
     with open(tmp_sync_map_path, "r") as f:
         return json.load(f)
+#@spaces.GPU(duration=90)
 def align(seed, transcript, audio_path):
     if align_model is None:
         raise gr.Error("Align model not loaded")
     buffer.seek(0)
     return buffer.read()
+#@spaces.GPU(duration=90)
 def run(seed, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p, temperature,
         stop_repetition, sample_batch_size, kvcache, silence_tokens,
         audio_path, transcribe_state, transcript, smart_transcript,

inference_speech_editing_scale.py CHANGED Viewed

@@ -4,6 +4,7 @@ import os, random
 import numpy as np
 import torch
 import torchaudio
 from data.tokenizer import (
     AudioTokenizer,
@@ -96,9 +97,10 @@ def get_model(exp_dir, device=None):
     del ckpt
     logging.info("done loading weights...")
     if device == None:
-        device = torch.device("cpu")
-        if torch.cuda.is_available():
-            device = torch.device("cuda:0")
     model.to(device)
     model.eval()
     return model, model_args, phn2num
@@ -132,7 +134,10 @@ if __name__ == "__main__":
         random.seed(seed)
         np.random.seed(seed)
         torch.manual_seed(seed)
-        torch.cuda.manual_seed(seed)
         torch.backends.cudnn.benchmark = False
         torch.backends.cudnn.deterministic = True
     formatter = (

 import numpy as np
 import torch
 import torchaudio
+import devicetorch
 from data.tokenizer import (
     AudioTokenizer,
     del ckpt
     logging.info("done loading weights...")
     if device == None:
+        device = devicetorch(torch)
+#        device = torch.device("cpu")
+#        if torch.cuda.is_available():
+#            device = torch.device("cuda:0")
     model.to(device)
     model.eval()
     return model, model_args, phn2num
         random.seed(seed)
         np.random.seed(seed)
         torch.manual_seed(seed)
+        if device == "cuda":
+            torch.cuda.manual_seed(seed)
+        elif device == "mps":
+            torch.mps.manual_seed(seed)
         torch.backends.cudnn.benchmark = False
         torch.backends.cudnn.deterministic = True
     formatter = (

inference_tts_scale.py CHANGED Viewed

@@ -4,6 +4,7 @@ import os, random
 import numpy as np
 import torch
 import torchaudio
 from data.tokenizer import (
     AudioTokenizer,
@@ -115,9 +116,10 @@ def get_model(exp_dir, device=None):
     del ckpt
     logging.info("done loading weights...")
     if device == None:
-        device = torch.device("cpu")
-        if torch.cuda.is_available():
-            device = torch.device("cuda:0")
     model.to(device)
     model.eval()
     return model, model_args, phn2num
@@ -128,7 +130,11 @@ if __name__ == "__main__":
         random.seed(seed)
         np.random.seed(seed)
         torch.manual_seed(seed)
-        torch.cuda.manual_seed(seed)
         torch.backends.cudnn.benchmark = False
         torch.backends.cudnn.deterministic = True
     formatter = (
@@ -187,4 +193,4 @@ if __name__ == "__main__":
         seg_save_fn_concat = f"{args.output_dir}/concat_{new_audio_fn[:-4]}_{i}_seed{args.seed}.wav"
         torchaudio.save(seg_save_fn_gen, gen_audio, args.codec_audio_sr)
-        torchaudio.save(seg_save_fn_concat, concated_audio, args.codec_audio_sr)

 import numpy as np
 import torch
 import torchaudio
+import devicetorch
 from data.tokenizer import (
     AudioTokenizer,
     del ckpt
     logging.info("done loading weights...")
     if device == None:
+        device = devicetorch.get(torch)
+#        device = torch.device("cpu")
+#        if torch.cuda.is_available():
+#            device = torch.device("cuda:0")
     model.to(device)
     model.eval()
     return model, model_args, phn2num
         random.seed(seed)
         np.random.seed(seed)
         torch.manual_seed(seed)
+        device = devicetorch.get(torch)
+        if device == "cuda":
+            torch.cuda.manual_seed(seed)
+        elif device == "mps":
+            torch.mps.manual_seed(seed)
         torch.backends.cudnn.benchmark = False
         torch.backends.cudnn.deterministic = True
     formatter = (
         seg_save_fn_concat = f"{args.output_dir}/concat_{new_audio_fn[:-4]}_{i}_seed{args.seed}.wav"
         torchaudio.save(seg_save_fn_gen, gen_audio, args.codec_audio_sr)
+        torchaudio.save(seg_save_fn_concat, concated_audio, args.codec_audio_sr)

requirements.txt CHANGED Viewed

@@ -3,7 +3,8 @@ phonemizer==3.2.1
 gradio
 nltk>=3.8.1
 openai-whisper>=20231117
-spaces
 aeneas==1.7.3.0
 whisperx==3.1.1
-huggingface-hub==0.22.2

 gradio
 nltk>=3.8.1
 openai-whisper>=20231117
+#spaces
 aeneas==1.7.3.0
 whisperx==3.1.1
+huggingface-hub==0.22.2
+devicetorch