tonychenxyz committed
Commit 9bfcf61
1 Parent(s): e9585f6

fixed demo

all_emo_dirs.pkl CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:beadd1f3c7eada0fa99dbdecc5c370036c1c044955a02f019f879bdc6f5fefcb
- size 20343
+ oid sha256:3160074617894c8a0fb888fac217b3c4ae0a647e4b218aa498d2ff356e040f9e
+ size 21612
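
all_emo_dirs.pkl (presumably the demo's precomputed emotion directions) is stored with Git LFS, so this diff only records a new oid and size in the pointer file; the binary itself changes out of band. After `git lfs pull`, the local file can be checked against that oid with a plain hash comparison. A sketch, not part of the repo:

import hashlib

# Recompute the sha256 of the pulled file; it should equal the oid recorded
# in the LFS pointer after this commit.
with open("all_emo_dirs.pkl", "rb") as f:
    print(hashlib.sha256(f.read()).hexdigest())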
app.py CHANGED
@@ -2,7 +2,7 @@
  import os
  import subprocess
  import sys
- import spaces
+

  def install(package):
      if '=' in package:
@@ -21,9 +21,12 @@ def install(package):
  # install('gradio==4.44.0')
  # install('spacy==3.7')

+ debug = False
  is_prod = True
  if os.environ.get('PROD_MODE') == 'local':
      is_prod = False
+ else:
+     debug = False

  import pickle

@@ -42,37 +45,38 @@ if not is_prod:
      os.environ['PATH'] += os.pathsep + ffmpeg_path


- import shutil
- import tempfile
- import time
- from pathlib import Path
-
- import librosa
- import torch
- from huggingface_hub import snapshot_download
-
- from fam.llm.adapters import FlattenedInterleavedEncodec2Codebook
- from fam.llm.decoders import EncodecDecoder
- from fam.llm.fast_inference_utils import build_model, main
- from fam.llm.inference import (
-     EncodecDecoder,
-     InferenceConfig,
-     Model,
-     TiltedEncodec,
-     TrainedBPETokeniser,
-     get_cached_embedding,
-     get_cached_file,
-     get_enhancer,
- )
- from fam.llm.utils import (
-     check_audio_file,
-     get_default_dtype,
-     get_device,
-     normalize_text,
- )
-
- debug = False
+ import torch
+ if not debug:
+     import shutil
+     import tempfile
+     import time
+     from pathlib import Path
+
+     import librosa
+
+     from huggingface_hub import snapshot_download
+
+     from fam.llm.adapters import FlattenedInterleavedEncodec2Codebook
+     from fam.llm.decoders import EncodecDecoder
+     from fam.llm.fast_inference_utils import build_model, main
+     from fam.llm.inference import (
+         EncodecDecoder,
+         InferenceConfig,
+         Model,
+         TiltedEncodec,
+         TrainedBPETokeniser,
+         get_cached_embedding,
+         get_cached_file,
+         get_enhancer,
+     )
+     from fam.llm.utils import (
+         check_audio_file,
+         get_default_dtype,
+         get_device,
+         normalize_text,
+     )

  DESCRIPTION = ""
  if not torch.cuda.is_available():
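
This hunk keeps torch as an unconditional import and moves the heavy inference imports (librosa, huggingface_hub, the fam.* modules) behind `if not debug:`, so a lightweight session can start the UI without loading the model stack. A minimal sketch of that gating pattern, using a stand-in module and a hypothetical flag name rather than app.py's own:

import importlib
import os

# Hypothetical flag; app.py derives its own `debug` from PROD_MODE instead.
DEBUG = os.environ.get("APP_DEBUG") == "1"

if not DEBUG:
    heavy = importlib.import_module("math")  # stand-in for torch/librosa/fam imports
    print("model stack loaded:", heavy.sqrt(2.0))
else:
    print("debug mode: UI only, inference stack skipped")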
@@ -83,7 +87,8 @@ if torch.cuda.is_available():
      seed = 1337
      output_dir = "outputs"
      _dtype = get_default_dtype()
-     _device = 'cuda:0'
+     # _device = 'cuda:0'
+
      _model_dir = snapshot_download(repo_id=model_name)
      first_stage_adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=1024)
      output_dir = output_dir
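
The hard-coded `_device = 'cuda:0'` is commented out here; with `get_device()` already imported from fam.llm.utils, device selection presumably falls back to that helper. A generic autodetection sketch in the same spirit, not the fam.llm.utils implementation:

import torch

def pick_device() -> str:
    # Prefer the first CUDA GPU, otherwise run on CPU.
    return "cuda:0" if torch.cuda.is_available() else "cpu"

print(pick_device())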
@@ -116,7 +121,6 @@ if torch.cuda.is_available():
          compile_prefill=True,
      )

- @spaces.GPU
  def generate_sample(text, emo_dir = None, source_path = None, emo_path = None, neutral_path = None, strength = 0.1, top_p = 0.95, guidance_scale = 3.0, preset_dropdown = None, toggle = None):

      print('text', text)
@@ -270,32 +274,46 @@ def change_voice_selection_layout(choice):

  def change_emotion_selection_layout(choice):
      if choice == EMO_NAMES[0]:
-         return [gr.update(visible=True)]
-
-     return [gr.update(visible=False)]
+         return [gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)]
+     else:
+         return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)]

  title = """
+ <!-- Google Tag Manager -->
+ <script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
+ new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
+ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
+ 'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
+ })(window,document,'script','dataLayer','GTM-5N27BQH8');</script>
+ <!-- End Google Tag Manager -->
+
  </style>
  <h1 style="margin-top: 10px;" class="page-title">Demo for <span style="margin-left: 10px;background-color: #E0FEE4;padding: 15px;border-radius: 10px;">🎛️ EmoKnob</span></h1>
+
+ <!-- Google Tag Manager (noscript) -->
+ <noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-5N27BQH8"
+ height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
+ <!-- End Google Tag Manager (noscript) -->
+
  """

  description = """
- - While existing TTS services do not allow fine-grained control over emotions, EmoKnob allows users to control emotion in speech with few-shot samples.
+
+ - EmoKnob applies control of emotion over arbitrary speakers.
+ - EmoKnob <b>extracts emotion from a pair of emotional and neutral audio from the same speaker.</b>
  - In this demo, you can select from a few preset voices and upload your own emotional samples to clone.
- - You can then use preset emotion or upload your own emotional-neutral sample pair to control emotions.
+ - You can then apply control of a preset emotion or extract emotion from your own pair of emotional and neutral audio.
  - You can adjust the strength of the emotion by using the slider.

+ Check out our [project page](https://emoknob.cs.columbia.edu/) for more details.

  EmoKnob uses [MetaVoice](https://github.com/metavoiceio/metavoice-src) as its voice cloning backbone.
  """

- with gr.Blocks(title="EmoKnob Demo") as demo:
+ with gr.Blocks(title="EmoKnob: Enhance Voice Cloning with Fine-Grained Emotion Control") as demo:
      gr.Markdown(title)
      gr.Markdown(description)
-     gr.Image("emo-knob-teaser-1.svg", show_label=False, container=False)
-
-     with gr.Row():
-         gr.Markdown(description)
+     gr.Image("https://raw.githubusercontent.com/tonychenxyz/emoknob/main/docs/assets/emo-knob-teaser-1.svg", show_label=False, container=False)

      with gr.Row():
          with gr.Column():
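
The rewritten callback returns one `gr.update(...)` per component that the corresponding `.change` event lists in `outputs` (the row plus the two upload widgets, wired up in a later hunk). A self-contained sketch of this Gradio visibility pattern, with illustrative names rather than the demo's exact components:

import gradio as gr

CHOICES = ["Custom (upload a pair)", "Preset emotion"]

def toggle_custom(choice):
    show = choice == CHOICES[0]
    # One update per output component, in the same order as `outputs=[...]`.
    return [gr.update(visible=show), gr.update(visible=show), gr.update(visible=show)]

with gr.Blocks() as sketch:
    mode = gr.Radio(choices=CHOICES, value=CHOICES[1], label="Emotion")
    with gr.Row(visible=False) as pair_row:
        neutral = gr.Audio(type="filepath", label="Neutral sample")
        emotional = gr.Audio(type="filepath", label="Emotional sample")
    mode.change(toggle_custom, inputs=mode, outputs=[pair_row, neutral, emotional])

# sketch.launch()  # uncomment to try it locally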
@@ -305,7 +323,57 @@ with gr.Blocks(title="EmoKnob Demo") as demo:
                  value="To be or not to be, that is the question.",
              )

-
+
+
+             # voice select
+
+             with gr.Row(), gr.Column():
+                 toggle = gr.Radio(choices=RADIO_CHOICES, label="Choose voice", value=RADIO_CHOICES[0])
+
+
+             with gr.Row() as row_1:
+                 preset_dropdown = gr.Dropdown(
+                     PRESET_VOICES.keys(), label="Preset voices", value=list(PRESET_VOICES.keys())[0]
+                 )
+
+                 with gr.Accordion("Preview: Preset voices", open=False):
+                     for label, path in PRESET_VOICES.items():
+                         gr.Audio(value=path, label=label)
+
+             with gr.Row(visible=False) as row_2:
+                 upload_target = gr.Audio(
+                     sources=["upload"],
+                     type="filepath",
+                     label="Upload a clean sample to clone.",
+                 )
+
+
+             with gr.Row(), gr.Column():
+                 strength = gr.Slider(
+                     value=0.1,
+                     minimum=0.0,
+                     maximum=1.0,
+                     step=0.01,
+                     label="Strength - how strong the emotion is. Recommended value is between 0.0 and 0.6.",
+                 )
+
+             with gr.Row():
+                 emotion_name = gr.Radio(choices=EMO_NAMES, label="Emotion", value=EMO_NAMES[1]) # Set default to second option
+
+
+
+             with gr.Row(visible=False) as row_3:
+                 upload_neutral = gr.Audio(
+                     sources=["upload"],
+                     type="filepath",
+                     label="Neutral sample for emotion extraction.",
+                 )
+
+                 upload_emo = gr.Audio(
+                     sources=["upload"],
+                     type="filepath",
+                     label="Emotional sample for emotion extraction.",
+                 )

          with gr.Row(), gr.Column():
              # voice settings
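
The strength slider and the description reflect the demo's basic recipe: an emotion direction is extracted from an emotional/neutral pair of the same speaker and applied to the target voice, scaled by the slider value. A schematic sketch of that arithmetic, where array shapes and helper names are illustrative rather than the fam.* API:

import numpy as np

def apply_emotion(target_emb: np.ndarray,
                  emo_emb: np.ndarray,
                  neutral_emb: np.ndarray,
                  strength: float = 0.1) -> np.ndarray:
    # Emotion direction = emotional minus neutral embedding of the same speaker.
    direction = emo_emb - neutral_emb
    # Nudge the target speaker embedding along that direction.
    return target_emb + strength * direction

rng = np.random.default_rng(0)
target, emo, neutral = (rng.normal(size=16) for _ in range(3))
print(apply_emotion(target, emo, neutral, strength=0.4).shape)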
@@ -324,47 +392,11 @@ with gr.Blocks(title="EmoKnob Demo") as demo:
                      label="Speaker similarity - How closely to match speaker identity and speech style.",
                  )

-                 strength = gr.Slider(
-                     value=0.1,
-                     minimum=0.0,
-                     maximum=5.0,
-                     step=0.01,
-                     label="Strength - how strong the emotion is. Setting it to too large a value may result in unstable output.",
-                 )
-
-
-
-                 # voice select
-                 toggle = gr.Radio(choices=RADIO_CHOICES, label="Choose voice", value=RADIO_CHOICES[0])
-
-             with gr.Row(visible=True) as row_1:
-                 preset_dropdown = gr.Dropdown(
-                     PRESET_VOICES.keys(), label="Preset voices", value=list(PRESET_VOICES.keys())[0]
-                 )
-                 with gr.Accordion("Preview: Preset voices", open=False):
-                     for label, path in PRESET_VOICES.items():
-                         gr.Audio(value=path, label=label)
-
-             with gr.Row(visible=False) as row_2:
-                 upload_target = gr.Audio(
-                     sources=["upload"],
-                     type="filepath",
-                     label="Upload a clean sample to clone.",
-                 )
-             with gr.Row():
-                 emotion_name = gr.Radio(choices=EMO_NAMES, label="Emotion", value=EMO_NAMES[0])
-             with gr.Row(visible=True) as row_3:
-                 upload_neutral = gr.Audio(
-                     sources=["upload"],
-                     type="filepath",
-                     label="Upload a neutral sample to compute the emotion direction. Should be same speaker as the emotional sample.",
-                 )
-
-                 upload_emo = gr.Audio(
-                     sources=["upload"],
-                     type="filepath",
-                     label="Upload an emotional sample to compute the emotion direction. Should be same speaker as the neutral sample.",
-                 )
+             emotion_name.change(
+                 change_emotion_selection_layout,
+                 inputs=emotion_name,
+                 outputs=[row_3, upload_neutral, upload_emo],
+             )

          toggle.change(
              change_voice_selection_layout,
@@ -372,12 +404,6 @@ with gr.Blocks(title="EmoKnob Demo") as demo:
              outputs=[row_1, row_2],
          )

-         # emotion_name.change(
-         # change_emotion_selection_layout,
-         # inputs=emotion_name,
-         # outputs=[row_3],
-         # )
-
          with gr.Column():
              speech = gr.Audio(
                  type="filepath",
 
fam/llm/__pycache__/fast_inference_utils.cpython-39.pyc CHANGED
Binary files a/fam/llm/__pycache__/fast_inference_utils.cpython-39.pyc and b/fam/llm/__pycache__/fast_inference_utils.cpython-39.pyc differ