Spaces:
Build error
Build error
cocktailpeanut
committed on
Commit
·
cb6da82
1
Parent(s):
579d79b
update
Browse files- app.py +16 -11
- inference_speech_editing_scale.py +9 -4
- inference_tts_scale.py +11 -5
- requirements.txt +3 -2
app.py
CHANGED
@@ -11,31 +11,36 @@ import io
|
|
11 |
import numpy as np
|
12 |
import random
|
13 |
import uuid
|
14 |
-
import spaces
|
|
|
15 |
|
16 |
|
17 |
DEMO_PATH = os.getenv("DEMO_PATH", "./demo")
|
18 |
TMP_PATH = os.getenv("TMP_PATH", "./demo/temp")
|
19 |
MODELS_PATH = os.getenv("MODELS_PATH", "./pretrained_models")
|
20 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
21 |
whisper_model, align_model, voicecraft_model = None, None, None
|
22 |
|
23 |
|
24 |
def get_random_string():
|
25 |
return "".join(str(uuid.uuid4()).split("-"))
|
26 |
|
27 |
-
|
28 |
def seed_everything(seed):
|
29 |
if seed != -1:
|
30 |
os.environ['PYTHONHASHSEED'] = str(seed)
|
31 |
random.seed(seed)
|
32 |
np.random.seed(seed)
|
33 |
torch.manual_seed(seed)
|
34 |
-
|
|
|
|
|
|
|
35 |
torch.backends.cudnn.benchmark = False
|
36 |
torch.backends.cudnn.deterministic = True
|
37 |
|
38 |
-
|
39 |
class WhisperxAlignModel:
|
40 |
def __init__(self):
|
41 |
from whisperx import load_align_model
|
@@ -46,7 +51,7 @@ class WhisperxAlignModel:
|
|
46 |
audio = load_audio(audio_path)
|
47 |
return align(segments, self.model, self.metadata, audio, device, return_char_alignments=False)["segments"]
|
48 |
|
49 |
-
|
50 |
class WhisperModel:
|
51 |
def __init__(self, model_name):
|
52 |
from whisper import load_model
|
@@ -63,7 +68,7 @@ class WhisperModel:
|
|
63 |
def transcribe(self, audio_path):
|
64 |
return self.model.transcribe(audio_path, suppress_tokens=self.supress_tokens, word_timestamps=True)["segments"]
|
65 |
|
66 |
-
|
67 |
class WhisperxModel:
|
68 |
def __init__(self, model_name, align_model: WhisperxAlignModel):
|
69 |
from whisperx import load_model
|
@@ -74,7 +79,7 @@ class WhisperxModel:
|
|
74 |
segments = self.model.transcribe(audio_path, batch_size=8)["segments"]
|
75 |
return self.align_model.align(segments, audio_path)
|
76 |
|
77 |
-
|
78 |
def load_models(whisper_backend_name, whisper_model_name, alignment_model_name, voicecraft_model_name):
|
79 |
global transcribe_model, align_model, voicecraft_model
|
80 |
|
@@ -123,7 +128,7 @@ def get_transcribe_state(segments):
|
|
123 |
"word_bounds": [f"{word['start']} {word['word']} {word['end']}" for word in words_info]
|
124 |
}
|
125 |
|
126 |
-
|
127 |
def transcribe(seed, audio_path):
|
128 |
if transcribe_model is None:
|
129 |
raise gr.Error("Transcription model not loaded")
|
@@ -162,7 +167,7 @@ def align_segments(transcript, audio_path):
|
|
162 |
with open(tmp_sync_map_path, "r") as f:
|
163 |
return json.load(f)
|
164 |
|
165 |
-
|
166 |
def align(seed, transcript, audio_path):
|
167 |
if align_model is None:
|
168 |
raise gr.Error("Align model not loaded")
|
@@ -193,7 +198,7 @@ def get_output_audio(audio_tensors, codec_audio_sr):
|
|
193 |
buffer.seek(0)
|
194 |
return buffer.read()
|
195 |
|
196 |
-
|
197 |
def run(seed, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p, temperature,
|
198 |
stop_repetition, sample_batch_size, kvcache, silence_tokens,
|
199 |
audio_path, transcribe_state, transcript, smart_transcript,
|
|
|
11 |
import numpy as np
|
12 |
import random
|
13 |
import uuid
|
14 |
+
#import spaces
|
15 |
+
import devicetorch
|
16 |
|
17 |
|
18 |
DEMO_PATH = os.getenv("DEMO_PATH", "./demo")
|
19 |
TMP_PATH = os.getenv("TMP_PATH", "./demo/temp")
|
20 |
MODELS_PATH = os.getenv("MODELS_PATH", "./pretrained_models")
|
21 |
+
#device = "cuda" if torch.cuda.is_available() else "cpu"
|
22 |
+
# devicetorch is imported as a module, so it cannot be called directly;
# query the backend through its get() helper. The result is compared against
# the strings "cuda" and "mps" in seed_everything below.
device = devicetorch.get(torch)
|
23 |
whisper_model, align_model, voicecraft_model = None, None, None
|
24 |
|
25 |
|
26 |
def get_random_string():
|
27 |
return "".join(str(uuid.uuid4()).split("-"))
|
28 |
|
29 |
+
#@spaces.GPU(duration=30)
|
30 |
def seed_everything(seed):
|
31 |
if seed != -1:
|
32 |
os.environ['PYTHONHASHSEED'] = str(seed)
|
33 |
random.seed(seed)
|
34 |
np.random.seed(seed)
|
35 |
torch.manual_seed(seed)
|
36 |
+
if device == "cuda":
|
37 |
+
torch.cuda.manual_seed(seed)
|
38 |
+
elif device == "mps":
|
39 |
+
torch.mps.manual_seed(seed)
|
40 |
torch.backends.cudnn.benchmark = False
|
41 |
torch.backends.cudnn.deterministic = True
|
42 |
|
43 |
+
#@spaces.GPU(duration=120)
|
44 |
class WhisperxAlignModel:
|
45 |
def __init__(self):
|
46 |
from whisperx import load_align_model
|
|
|
51 |
audio = load_audio(audio_path)
|
52 |
return align(segments, self.model, self.metadata, audio, device, return_char_alignments=False)["segments"]
|
53 |
|
54 |
+
#@spaces.GPU(duration=120)
|
55 |
class WhisperModel:
|
56 |
def __init__(self, model_name):
|
57 |
from whisper import load_model
|
|
|
68 |
def transcribe(self, audio_path):
|
69 |
return self.model.transcribe(audio_path, suppress_tokens=self.supress_tokens, word_timestamps=True)["segments"]
|
70 |
|
71 |
+
#@spaces.GPU(duration=120)
|
72 |
class WhisperxModel:
|
73 |
def __init__(self, model_name, align_model: WhisperxAlignModel):
|
74 |
from whisperx import load_model
|
|
|
79 |
segments = self.model.transcribe(audio_path, batch_size=8)["segments"]
|
80 |
return self.align_model.align(segments, audio_path)
|
81 |
|
82 |
+
#@spaces.GPU(duration=120)
|
83 |
def load_models(whisper_backend_name, whisper_model_name, alignment_model_name, voicecraft_model_name):
|
84 |
global transcribe_model, align_model, voicecraft_model
|
85 |
|
|
|
128 |
"word_bounds": [f"{word['start']} {word['word']} {word['end']}" for word in words_info]
|
129 |
}
|
130 |
|
131 |
+
#@spaces.GPU(duration=60)
|
132 |
def transcribe(seed, audio_path):
|
133 |
if transcribe_model is None:
|
134 |
raise gr.Error("Transcription model not loaded")
|
|
|
167 |
with open(tmp_sync_map_path, "r") as f:
|
168 |
return json.load(f)
|
169 |
|
170 |
+
#@spaces.GPU(duration=90)
|
171 |
def align(seed, transcript, audio_path):
|
172 |
if align_model is None:
|
173 |
raise gr.Error("Align model not loaded")
|
|
|
198 |
buffer.seek(0)
|
199 |
return buffer.read()
|
200 |
|
201 |
+
#@spaces.GPU(duration=90)
|
202 |
def run(seed, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p, temperature,
|
203 |
stop_repetition, sample_batch_size, kvcache, silence_tokens,
|
204 |
audio_path, transcribe_state, transcript, smart_transcript,
|
inference_speech_editing_scale.py
CHANGED
@@ -4,6 +4,7 @@ import os, random
|
|
4 |
import numpy as np
|
5 |
import torch
|
6 |
import torchaudio
|
|
|
7 |
|
8 |
from data.tokenizer import (
|
9 |
AudioTokenizer,
|
@@ -96,9 +97,10 @@ def get_model(exp_dir, device=None):
|
|
96 |
del ckpt
|
97 |
logging.info("done loading weights...")
|
98 |
if device == None:
|
99 |
-
device = torch
|
100 |
-
|
101 |
-
|
|
|
102 |
model.to(device)
|
103 |
model.eval()
|
104 |
return model, model_args, phn2num
|
@@ -132,7 +134,10 @@ if __name__ == "__main__":
|
|
132 |
random.seed(seed)
|
133 |
np.random.seed(seed)
|
134 |
torch.manual_seed(seed)
|
135 |
-
|
|
|
|
|
|
|
136 |
torch.backends.cudnn.benchmark = False
|
137 |
torch.backends.cudnn.deterministic = True
|
138 |
formatter = (
|
|
|
4 |
import numpy as np
|
5 |
import torch
|
6 |
import torchaudio
|
7 |
+
import devicetorch
|
8 |
|
9 |
from data.tokenizer import (
|
10 |
AudioTokenizer,
|
|
|
97 |
del ckpt
|
98 |
logging.info("done loading weights...")
|
99 |
if device == None:
|
100 |
+
# devicetorch is a module, not a callable: use devicetorch.get(torch) to pick the backend.
device = devicetorch.get(torch)
|
101 |
+
# device = torch.device("cpu")
|
102 |
+
# if torch.cuda.is_available():
|
103 |
+
# device = torch.device("cuda:0")
|
104 |
model.to(device)
|
105 |
model.eval()
|
106 |
return model, model_args, phn2num
|
|
|
134 |
random.seed(seed)
|
135 |
np.random.seed(seed)
|
136 |
torch.manual_seed(seed)
|
137 |
+
if device == "cuda":
|
138 |
+
torch.cuda.manual_seed(seed)
|
139 |
+
elif device == "mps":
|
140 |
+
torch.mps.manual_seed(seed)
|
141 |
torch.backends.cudnn.benchmark = False
|
142 |
torch.backends.cudnn.deterministic = True
|
143 |
formatter = (
|
inference_tts_scale.py
CHANGED
@@ -4,6 +4,7 @@ import os, random
|
|
4 |
import numpy as np
|
5 |
import torch
|
6 |
import torchaudio
|
|
|
7 |
|
8 |
from data.tokenizer import (
|
9 |
AudioTokenizer,
|
@@ -115,9 +116,10 @@ def get_model(exp_dir, device=None):
|
|
115 |
del ckpt
|
116 |
logging.info("done loading weights...")
|
117 |
if device == None:
|
118 |
-
device =
|
119 |
-
|
120 |
-
|
|
|
121 |
model.to(device)
|
122 |
model.eval()
|
123 |
return model, model_args, phn2num
|
@@ -128,7 +130,11 @@ if __name__ == "__main__":
|
|
128 |
random.seed(seed)
|
129 |
np.random.seed(seed)
|
130 |
torch.manual_seed(seed)
|
131 |
-
|
|
|
|
|
|
|
|
|
132 |
torch.backends.cudnn.benchmark = False
|
133 |
torch.backends.cudnn.deterministic = True
|
134 |
formatter = (
|
@@ -187,4 +193,4 @@ if __name__ == "__main__":
|
|
187 |
seg_save_fn_concat = f"{args.output_dir}/concat_{new_audio_fn[:-4]}_{i}_seed{args.seed}.wav"
|
188 |
|
189 |
torchaudio.save(seg_save_fn_gen, gen_audio, args.codec_audio_sr)
|
190 |
-
torchaudio.save(seg_save_fn_concat, concated_audio, args.codec_audio_sr)
|
|
|
4 |
import numpy as np
|
5 |
import torch
|
6 |
import torchaudio
|
7 |
+
import devicetorch
|
8 |
|
9 |
from data.tokenizer import (
|
10 |
AudioTokenizer,
|
|
|
116 |
del ckpt
|
117 |
logging.info("done loading weights...")
|
118 |
if device == None:
|
119 |
+
device = devicetorch.get(torch)
|
120 |
+
# device = torch.device("cpu")
|
121 |
+
# if torch.cuda.is_available():
|
122 |
+
# device = torch.device("cuda:0")
|
123 |
model.to(device)
|
124 |
model.eval()
|
125 |
return model, model_args, phn2num
|
|
|
130 |
random.seed(seed)
|
131 |
np.random.seed(seed)
|
132 |
torch.manual_seed(seed)
|
133 |
+
device = devicetorch.get(torch)
|
134 |
+
if device == "cuda":
|
135 |
+
torch.cuda.manual_seed(seed)
|
136 |
+
elif device == "mps":
|
137 |
+
torch.mps.manual_seed(seed)
|
138 |
torch.backends.cudnn.benchmark = False
|
139 |
torch.backends.cudnn.deterministic = True
|
140 |
formatter = (
|
|
|
193 |
seg_save_fn_concat = f"{args.output_dir}/concat_{new_audio_fn[:-4]}_{i}_seed{args.seed}.wav"
|
194 |
|
195 |
torchaudio.save(seg_save_fn_gen, gen_audio, args.codec_audio_sr)
|
196 |
+
torchaudio.save(seg_save_fn_concat, concated_audio, args.codec_audio_sr)
|
requirements.txt
CHANGED
@@ -3,7 +3,8 @@ phonemizer==3.2.1
|
|
3 |
gradio
|
4 |
nltk>=3.8.1
|
5 |
openai-whisper>=20231117
|
6 |
-
spaces
|
7 |
aeneas==1.7.3.0
|
8 |
whisperx==3.1.1
|
9 |
-
huggingface-hub==0.22.2
|
|
|
|
3 |
gradio
|
4 |
nltk>=3.8.1
|
5 |
openai-whisper>=20231117
|
6 |
+
#spaces
|
7 |
aeneas==1.7.3.0
|
8 |
whisperx==3.1.1
|
9 |
+
huggingface-hub==0.22.2
|
10 |
+
devicetorch
|