hilamanor committed on
Commit
511e6ea
·
1 Parent(s): c5c715d

swap to gradio 4.44 & add adaptive duration

Browse files
Files changed (4) hide show
  1. README.md +2 -2
  2. app.py +25 -1
  3. requirements.txt +1 -0
  4. utils.py +2 -1
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🎧
4
  colorFrom: indigo
5
  colorTo: gray
6
  sdk: gradio
7
- sdk_version: 4.21.0
8
  app_file: app.py
9
  pinned: false
10
  license: cc-by-sa-4.0
@@ -12,4 +12,4 @@ short_description: Edit audios with text prompts
12
  ---
13
 
14
  The 30-second limit was introduced to ensure that queue wait times remain reasonable, especially when there are a lot of users.
15
- For that reason pull-requests that change this limit will not be merged. Please clone or duplicate the space to work locally without limits.
 
4
  colorFrom: indigo
5
  colorTo: gray
6
  sdk: gradio
7
+ sdk_version: 4.44.0
8
  app_file: app.py
9
  pinned: false
10
  license: cc-by-sa-4.0
 
12
  ---
13
 
14
  The 30-second limit was introduced to ensure that queue wait times remain reasonable, especially when there are a lot of users.
15
+ For that reason pull-requests that change this limit will not be merged. Please clone or duplicate the space to work locally without limits.
app.py CHANGED
@@ -73,7 +73,31 @@ def sample(ldm_stable, zs, wts, steps, prompt_tar, tstart, cfg_scale_tar): # ,
73
 
74
  return (16000, audio.squeeze().cpu().numpy())
75
 
76
- @spaces.GPU(duration=200)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  def edit(
78
  # cache_dir,
79
  input_audio,
 
73
 
74
  return (16000, audio.squeeze().cpu().numpy())
75
 
76
def get_duration(input_audio, model_id: str, do_inversion: bool,
                 wts: Optional[torch.Tensor], zs: Optional[torch.Tensor],
                 saved_inv_model: str, source_prompt="", target_prompt="",
                 steps=200, cfg_scale_src=3.5, cfg_scale_tar=12, t_start=45, randomize_seed=True):
    """Estimate the GPU time to reserve for one call to ``edit``.

    This function is handed to ``spaces.GPU(duration=...)`` on ``edit``, so it
    is given the request's arguments and returns the expected running time
    (presumably in seconds -- confirm against the ``spaces`` documentation).

    Only ``input_audio``, ``model_id``, ``do_inversion``, ``steps``,
    ``t_start`` and ``randomize_seed`` influence the estimate. The remaining
    parameters are accepted and ignored; they are presumably present so the
    signature lines up with ``edit`` -- TODO confirm against its full
    parameter list.

    Raises:
        gr.Error: if ``input_audio`` is ``None``.
    """
    # Guard clause: without an input clip there is nothing to estimate.
    if input_audio is None:
        raise gr.Error('Input audio missing!')

    # Relative cost of the selected model; any other model id (the music
    # checkpoint, per the original comment) is the baseline of 1.
    model_factor = 0.8 if model_id == LDM2 else (1.5 if model_id == LDM2_LARGE else 1)

    # A full extra pass of `steps` iterations is budgeted when an inversion
    # will run or when the seed is randomized.
    extra_iters = steps if (do_inversion or randomize_seed) else 0

    # Clip length as reported by utils.get_duration, capped at 30 to match the
    # 30-second limit applied when the audio is loaded.
    clip_length = min(utils.get_duration(input_audio), 30)

    # t_start is treated as a percentage of `steps`; that share is counted
    # twice. The 0.2 multiplier is presumably a measured per-iteration cost
    # -- TODO confirm.
    full_clip_estimate = factor_scaled = model_factor * ((t_start / 100 * steps) * 2 + extra_iters) * 0.2

    # The figure above is for a full 30-unit clip; scale linearly for shorter ones.
    expected = full_clip_estimate / 30 * clip_length
    print('expected time:', expected)
    return expected
98
+
99
+
100
+ @spaces.GPU(duration=get_duration)
101
  def edit(
102
  # cache_dir,
103
  input_audio,
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  torch
 
2
  torchaudio
3
  diffusers
4
  accelerate
 
1
  torch
2
+ numpy<2
3
  torchaudio
4
  diffusers
5
  accelerate
utils.py CHANGED
@@ -2,6 +2,7 @@ import numpy as np
2
  import torch
3
  from typing import Optional, List, Tuple, NamedTuple, Union
4
  from models import PipelineWrapper
 
5
 
6
 
7
  class PromptEmbeddings(NamedTuple):
@@ -16,7 +17,7 @@ def load_audio(audio_path: Union[str, np.array], fn_STFT, left: int = 0, right:
16
  import audioldm
17
  import audioldm.audio
18
 
19
- duration = min(audioldm.utils.get_duration(audio_path), 30)
20
 
21
  mel, _, _ = audioldm.audio.wav_to_fbank(audio_path, target_length=int(duration * 102.4), fn_STFT=fn_STFT)
22
  mel = mel.unsqueeze(0)
 
2
  import torch
3
  from typing import Optional, List, Tuple, NamedTuple, Union
4
  from models import PipelineWrapper
5
+ from audioldm.utils import get_duration
6
 
7
 
8
  class PromptEmbeddings(NamedTuple):
 
17
  import audioldm
18
  import audioldm.audio
19
 
20
+ duration = min(get_duration(audio_path), 30)
21
 
22
  mel, _, _ = audioldm.audio.wav_to_fbank(audio_path, target_length=int(duration * 102.4), fn_STFT=fn_STFT)
23
  mel = mel.unsqueeze(0)