tonychenxyz committed
Commit
e9585f6
1 Parent(s): abbdb85

added gpu detection

Files changed (1)
  app.py +46 -37
app.py CHANGED
@@ -2,6 +2,7 @@
 import os
 import subprocess
 import sys
+import spaces
 
 def install(package):
     if '=' in package:
@@ -41,6 +42,7 @@ if not is_prod:
     os.environ['PATH'] += os.pathsep + ffmpeg_path
 
 
+
 import shutil
 import tempfile
 import time
@@ -71,45 +73,50 @@ from fam.llm.utils import (
 )
 
 debug = False
-if not debug:
-    model_name = "metavoiceio/metavoice-1B-v0.1"
-    seed = 1337
-    output_dir = "outputs"
-    _dtype = get_default_dtype()
-    _device = 'cuda:0'
-    _model_dir = snapshot_download(repo_id=model_name)
-    first_stage_adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=1024)
-    output_dir = output_dir
-    os.makedirs(output_dir, exist_ok=True)
-
-    second_stage_ckpt_path = f"{_model_dir}/second_stage.pt"
-    config_second_stage = InferenceConfig(
-        ckpt_path=second_stage_ckpt_path,
-        num_samples=1,
-        seed=seed,
-        device=_device,
-        dtype=_dtype,
-        compile=False,
-        init_from="resume",
-        output_dir=output_dir,
-    )
-    data_adapter_second_stage = TiltedEncodec(end_of_audio_token=1024)
-    llm_second_stage = Model(
-        config_second_stage, TrainedBPETokeniser, EncodecDecoder, data_adapter_fn=data_adapter_second_stage.decode
-    )
-    enhancer = get_enhancer("df")
-
-    precision = {"float16": torch.float16, "bfloat16": torch.bfloat16}[_dtype]
-    model, tokenizer, smodel, model_size = build_model(
-        precision=precision,
-        checkpoint_path=Path(f"{_model_dir}/first_stage.pt"),
-        spk_emb_ckpt_path=Path(f"{_model_dir}/speaker_encoder.pt"),
-        device=_device,
-        compile=True,
-        compile_prefill=True,
-    )
 
+DESCRIPTION = ""
+if not torch.cuda.is_available():
+    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
+if torch.cuda.is_available():
+    if not debug:
+        model_name = "metavoiceio/metavoice-1B-v0.1"
+        seed = 1337
+        output_dir = "outputs"
+        _dtype = get_default_dtype()
+        _device = 'cuda:0'
+        _model_dir = snapshot_download(repo_id=model_name)
+        first_stage_adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=1024)
+        output_dir = output_dir
+        os.makedirs(output_dir, exist_ok=True)
+
+        second_stage_ckpt_path = f"{_model_dir}/second_stage.pt"
+        config_second_stage = InferenceConfig(
+            ckpt_path=second_stage_ckpt_path,
+            num_samples=1,
+            seed=seed,
+            device=_device,
+            dtype=_dtype,
+            compile=False,
+            init_from="resume",
+            output_dir=output_dir,
+        )
+        data_adapter_second_stage = TiltedEncodec(end_of_audio_token=1024)
+        llm_second_stage = Model(
+            config_second_stage, TrainedBPETokeniser, EncodecDecoder, data_adapter_fn=data_adapter_second_stage.decode
+        )
+        enhancer = get_enhancer("df")
+
+        precision = {"float16": torch.float16, "bfloat16": torch.bfloat16}[_dtype]
+        model, tokenizer, smodel, model_size = build_model(
+            precision=precision,
+            checkpoint_path=Path(f"{_model_dir}/first_stage.pt"),
+            spk_emb_ckpt_path=Path(f"{_model_dir}/speaker_encoder.pt"),
+            device=_device,
+            compile=True,
+            compile_prefill=True,
+        )
 
+@spaces.GPU
 def generate_sample(text, emo_dir = None, source_path = None, emo_path = None, neutral_path = None, strength = 0.1, top_p = 0.95, guidance_scale = 3.0, preset_dropdown = None, toggle = None):
 
     print('text', text)
@@ -284,6 +291,7 @@ EmoKnob is uses [MetaVoice](https://github.com/metavoiceio/metavoice-src) as voi
 
 with gr.Blocks(title="EmoKnob Demo") as demo:
     gr.Markdown(title)
+    gr.Markdown(description)
     gr.Image("emo-knob-teaser-1.svg", show_label=False, container=False)
 
     with gr.Row():
@@ -383,4 +391,5 @@ with gr.Blocks(title="EmoKnob Demo") as demo:
         outputs=speech,
     )
 
+
 demo.launch()
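
For context, the pattern this commit adopts is the standard Hugging Face Spaces ZeroGPU setup: `torch.cuda.is_available()` gates the expensive model construction so the app still imports cleanly on CPU, and the handler that needs a GPU is wrapped in `@spaces.GPU` so a device is attached for the duration of each call. A minimal, self-contained sketch of that pattern (the `generate` handler and the dummy model are illustrative stand-ins, not the MetaVoice pipeline from app.py):

```python
import gradio as gr
import torch
import spaces  # provided on Hugging Face Spaces; backs the ZeroGPU decorator

DESCRIPTION = ""
if not torch.cuda.is_available():
    # Surface the limitation in the UI instead of failing at import time.
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"

model = None
if torch.cuda.is_available():
    # Stand-in for the two-stage MetaVoice load that app.py guards the same way.
    model = torch.nn.Identity().to("cuda:0")

@spaces.GPU  # ZeroGPU attaches a GPU for the duration of each call
def generate(text: str) -> str:
    if model is None:
        raise gr.Error("No GPU available; this demo does not run on CPU.")
    return f"would synthesize: {text}"

with gr.Blocks(title="ZeroGPU pattern sketch") as demo:
    gr.Markdown(DESCRIPTION)
    inp = gr.Textbox(label="Text")
    out = gr.Textbox(label="Result")
    inp.submit(generate, inputs=inp, outputs=out)

demo.launch()
```

Note that, as in the commit, model construction stays at module scope so weights load once at startup and only inference runs under the GPU lease. The commit renders `description` (lowercase) in the Blocks UI while defining `DESCRIPTION`, so `description` is presumably defined elsewhere in app.py.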