tonychenxyz committed · Commit e9585f6 · 1 parent: abbdb85
added gpu detection

app.py CHANGED
@@ -2,6 +2,7 @@
 import os
 import subprocess
 import sys
+import spaces
 
 def install(package):
     if '=' in package:
@@ -41,6 +42,7 @@ if not is_prod:
     os.environ['PATH'] += os.pathsep + ffmpeg_path
 
 
+
 import shutil
 import tempfile
 import time
@@ -71,45 +73,50 @@ from fam.llm.utils import (
 )
 
 debug = False
-if not debug:
-    model_name = "metavoiceio/metavoice-1B-v0.1"
-    seed = 1337
-    output_dir = "outputs"
-    _dtype = get_default_dtype()
-    _device = 'cuda:0'
-    _model_dir = snapshot_download(repo_id=model_name)
-    first_stage_adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=1024)
-    output_dir = output_dir
-    os.makedirs(output_dir, exist_ok=True)
-
-    second_stage_ckpt_path = f"{_model_dir}/second_stage.pt"
-    config_second_stage = InferenceConfig(
-        ckpt_path=second_stage_ckpt_path,
-        num_samples=1,
-        seed=seed,
-        device=_device,
-        dtype=_dtype,
-        compile=False,
-        init_from="resume",
-        output_dir=output_dir,
-    )
-    data_adapter_second_stage = TiltedEncodec(end_of_audio_token=1024)
-    llm_second_stage = Model(
-        config_second_stage, TrainedBPETokeniser, EncodecDecoder, data_adapter_fn=data_adapter_second_stage.decode
-    )
-    enhancer = get_enhancer("df")
-
-    precision = {"float16": torch.float16, "bfloat16": torch.bfloat16}[_dtype]
-    model, tokenizer, smodel, model_size = build_model(
-        precision=precision,
-        checkpoint_path=Path(f"{_model_dir}/first_stage.pt"),
-        spk_emb_ckpt_path=Path(f"{_model_dir}/speaker_encoder.pt"),
-        device=_device,
-        compile=True,
-        compile_prefill=True,
-    )
 
+DESCRIPTION = ""
+if not torch.cuda.is_available():
+    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
+if torch.cuda.is_available():
+    if not debug:
+        model_name = "metavoiceio/metavoice-1B-v0.1"
+        seed = 1337
+        output_dir = "outputs"
+        _dtype = get_default_dtype()
+        _device = 'cuda:0'
+        _model_dir = snapshot_download(repo_id=model_name)
+        first_stage_adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=1024)
+        output_dir = output_dir
+        os.makedirs(output_dir, exist_ok=True)
+
+        second_stage_ckpt_path = f"{_model_dir}/second_stage.pt"
+        config_second_stage = InferenceConfig(
+            ckpt_path=second_stage_ckpt_path,
+            num_samples=1,
+            seed=seed,
+            device=_device,
+            dtype=_dtype,
+            compile=False,
+            init_from="resume",
+            output_dir=output_dir,
+        )
+        data_adapter_second_stage = TiltedEncodec(end_of_audio_token=1024)
+        llm_second_stage = Model(
+            config_second_stage, TrainedBPETokeniser, EncodecDecoder, data_adapter_fn=data_adapter_second_stage.decode
+        )
+        enhancer = get_enhancer("df")
+
+        precision = {"float16": torch.float16, "bfloat16": torch.bfloat16}[_dtype]
+        model, tokenizer, smodel, model_size = build_model(
+            precision=precision,
+            checkpoint_path=Path(f"{_model_dir}/first_stage.pt"),
+            spk_emb_ckpt_path=Path(f"{_model_dir}/speaker_encoder.pt"),
+            device=_device,
+            compile=True,
+            compile_prefill=True,
+        )
 
+@spaces.GPU
 def generate_sample(text, emo_dir = None, source_path = None, emo_path = None, neutral_path = None, strength = 0.1, top_p = 0.95, guidance_scale = 3.0, preset_dropdown = None, toggle = None):
 
     print('text', text)
@@ -284,6 +291,7 @@ EmoKnob is uses [MetaVoice](https://github.com/metavoiceio/metavoice-src) as voi
 
 with gr.Blocks(title="EmoKnob Demo") as demo:
     gr.Markdown(title)
+    gr.Markdown(description)
     gr.Image("emo-knob-teaser-1.svg", show_label=False, container=False)
 
     with gr.Row():
@@ -383,4 +391,5 @@ with gr.Blocks(title="EmoKnob Demo") as demo:
         outputs=speech,
     )
 
+
 demo.launch()
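Taken together, the commit applies the usual pattern for heavy models on Hugging Face Spaces: import spaces, report CPU-only hosts in the UI and guard model loading with torch.cuda.is_available(), and mark the inference entry point with @spaces.GPU so ZeroGPU attaches a GPU only while generate_sample runs. Below is a minimal self-contained sketch of the same structure; the try/except import, the conditional wrapping, and the sine-wave stand-in for the MetaVoice pipeline are illustrative assumptions, not part of this commit.

import numpy as np
import torch
import gradio as gr

try:
    import spaces  # provided on Hugging Face Spaces; exposes the GPU decorator
    HAS_SPACES = True
except ImportError:
    HAS_SPACES = False

DESCRIPTION = ""
if not torch.cuda.is_available():
    # Surface the limitation in the UI instead of failing on first click.
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"

def _generate(text: str):
    # Stand-in for the real two-stage TTS pipeline: emit one second of a
    # tone whose pitch depends on the input length.
    sr = 24_000
    t = np.linspace(0, 1, sr, endpoint=False)
    freq = 440 + 2 * len(text)
    return sr, (0.1 * np.sin(2 * np.pi * freq * t)).astype(np.float32)

# spaces.GPU attaches a GPU to the worker only for the duration of the call;
# wrapping conditionally keeps the sketch runnable outside Spaces too.
generate = spaces.GPU(_generate) if HAS_SPACES else _generate

with gr.Blocks(title="GPU-detection sketch") as demo:
    gr.Markdown(DESCRIPTION or "Running with CUDA available.")
    text = gr.Textbox(label="Text")
    speech = gr.Audio(label="Speech")
    gr.Button("Generate").click(generate, inputs=text, outputs=speech)

demo.launch()

On a ZeroGPU Space the decorated call is queued until a GPU is free; elsewhere the function is left untouched, which mirrors why the commit loads the full model stack only when CUDA is present.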