adefossez committed
Commit 9aefa3d
Parents (2): 7bcf4ee f187760

Merge branch 'main' into our_hf2

Files changed (1):
  app.py (+62 -16)
app.py CHANGED
@@ -10,9 +10,11 @@
 import argparse
 from concurrent.futures import ProcessPoolExecutor
 import os
+from pathlib import Path
 import subprocess as sp
 from tempfile import NamedTemporaryFile
 import time
+import typing as tp
 import warnings
 
 import torch
@@ -50,6 +52,29 @@ def interrupt():
     INTERRUPTING = True
 
 
+class FileCleaner:
+    def __init__(self, file_lifetime: float = 3600):
+        self.file_lifetime = file_lifetime
+        self.files = []
+
+    def add(self, path: tp.Union[str, Path]):
+        self._cleanup()
+        self.files.append((time.time(), Path(path)))
+
+    def _cleanup(self):
+        now = time.time()
+        for time_added, path in list(self.files):
+            if now - time_added > self.file_lifetime:
+                if path.exists():
+                    path.unlink()
+                self.files.pop(0)
+            else:
+                break
+
+
+file_cleaner = FileCleaner()
+
+
 def make_waveform(*args, **kwargs):
     # Further remove some warnings.
     be = time.time()
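
The new `FileCleaner` keeps an ordered list of `(timestamp, path)` pairs and lazily deletes expired files each time `add` is called; because entries are appended in time order, the cleanup loop can stop at the first entry that is still fresh. A minimal sketch of the behavior, assuming the `FileCleaner` class from the hunk above (the short lifetime and file names are illustrative only):

```python
import time
from tempfile import NamedTemporaryFile

cleaner = FileCleaner(file_lifetime=1.0)  # 1s lifetime, just to make expiry visible

tmp = NamedTemporaryFile(suffix=".wav", delete=False)  # survives close; the cleaner owns it
tmp.close()
cleaner.add(tmp.name)
print(len(cleaner.files))  # 1

time.sleep(1.5)
cleaner.add("other.wav")   # hypothetical path; this call also expires and unlinks tmp
print(len(cleaner.files))  # 1 -- only "other.wav" remains tracked
```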
@@ -103,8 +128,12 @@ def _do_predictions(texts, melodies, duration, progress=False, **gen_kwargs):
                 file.name, output, MODEL.sample_rate, strategy="loudness",
                 loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
             out_files.append(pool.submit(make_waveform, file.name))
+            file_cleaner.add(file.name)
     res = [out_file.result() for out_file in out_files]
+    for file in res:
+        file_cleaner.add(file)
     print("batch finished", len(texts), time.time() - be)
+    print("Tempfiles currently stored: ", len(file_cleaner.files))
     return res
 
 
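The `_do_predictions` changes register every on-disk artifact with the cleaner the moment it exists: first the loudness-normalized `.wav` temp file, then each waveform video returned by the pool. A condensed sketch of that lifecycle, assuming the global `file_cleaner` above and using `render_video` as a hypothetical stand-in for the pooled `make_waveform` call:

```python
from tempfile import NamedTemporaryFile

def render_video(wav_path: str) -> str:
    # Hypothetical stand-in for make_waveform: would write a video next to the wav.
    return wav_path.replace(".wav", ".mp4")

# delete=False keeps the file after the handle closes; FileCleaner owns its lifetime.
with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
    pass  # audio_write(...) fills this in the real app
file_cleaner.add(file.name)  # register the audio temp file

video = render_video(file.name)
file_cleaner.add(video)      # register the derived video too
```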
 
@@ -140,18 +169,21 @@ def predict_full(model, text, melody, duration, topk, topp, temperature, cfg_coef
         top_k=topk, top_p=topp, temperature=temperature, cfg_coef=cfg_coef)
     return outs[0]
 
+
 def toggle_audio_src(choice):
     if choice == "mic":
         return gr.update(source="microphone", value=None, label="Microphone")
     else:
         return gr.update(source="upload", value=None, label="File")
-
+
+
 def ui_full(launch_kwargs):
     with gr.Blocks() as interface:
         gr.Markdown(
             """
             # MusicGen
-            This is your private demo for [MusicGen](https://github.com/facebookresearch/audiocraft), a simple and controllable model for music generation
+            This is your private demo for [MusicGen](https://github.com/facebookresearch/audiocraft),
+            a simple and controllable model for music generation
             presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284)
             """
         )
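
For readers unfamiliar with the `toggle_audio_src` idiom: a Gradio event handler can return `gr.update(...)` instead of a plain value, and the listed output component then has those properties patched in place. A self-contained sketch of the same wiring, assuming Gradio 3.x, where `gr.Audio` still accepts `source=`:

```python
import gradio as gr

def toggle_audio_src(choice):
    # Returning gr.update(...) patches the melody component's properties
    # (input source, label) rather than setting its value.
    if choice == "mic":
        return gr.update(source="microphone", value=None, label="Microphone")
    return gr.update(source="upload", value=None, label="File")

with gr.Blocks() as demo:
    radio = gr.Radio(["file", "mic"], value="file", label="Melody source")
    melody = gr.Audio(source="upload", type="numpy", label="File")
    radio.change(toggle_audio_src, radio, [melody], queue=False)

demo.launch()
```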
@@ -160,14 +192,17 @@ def ui_full(launch_kwargs):
                 with gr.Row():
                     text = gr.Text(label="Input Text", interactive=True)
                     with gr.Column():
-                        radio = gr.Radio(["file", "mic"], value="file", label="Condition on a melody (optional) File or Mic")
-                        melody = gr.Audio(source="upload", type="numpy", label="File", interactive=True, elem_id="melody-input")
+                        radio = gr.Radio(["file", "mic"], value="file",
+                                         label="Condition on a melody (optional) File or Mic")
+                        melody = gr.Audio(source="upload", type="numpy", label="File",
+                                          interactive=True, elem_id="melody-input")
                 with gr.Row():
                     submit = gr.Button("Submit")
                     # Adapted from https://github.com/rkfg/audiocraft/blob/long/app.py, MIT license.
                     _ = gr.Button("Interrupt").click(fn=interrupt, queue=False)
                 with gr.Row():
-                    model = gr.Radio(["melody", "medium", "small", "large"], label="Model", value="melody", interactive=True)
+                    model = gr.Radio(["melody", "medium", "small", "large"],
+                                     label="Model", value="melody", interactive=True)
                 with gr.Row():
                     duration = gr.Slider(minimum=1, maximum=120, value=10, label="Duration", interactive=True)
                 with gr.Row():
@@ -177,7 +212,9 @@ def ui_full(launch_kwargs):
                     cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
             with gr.Column():
                 output = gr.Video(label="Generated Music")
-        submit.click(predict_full, inputs=[model, text, melody, duration, topk, topp, temperature, cfg_coef], outputs=[output])
+        submit.click(predict_full,
+                     inputs=[model, text, melody, duration, topk, topp, temperature, cfg_coef],
+                     outputs=[output])
         radio.change(toggle_audio_src, radio, [melody], queue=False, show_progress=False)
         gr.Examples(
             fn=predict_full,
@@ -221,17 +258,20 @@ def ui_full(launch_kwargs):
             This can take a long time, and the model might lose consistency. The model might also
             decide at arbitrary positions that the song ends.
 
-            **WARNING:** Choosing long durations will take a long time to generate (2min might take ~10min). An overlap of 12 seconds
-            is kept with the previously generated chunk, and 18 "new" seconds are generated each time.
+            **WARNING:** Choosing long durations will take a long time to generate (2min might take ~10min).
+            An overlap of 12 seconds is kept with the previously generated chunk, and 18 "new" seconds
+            are generated each time.
 
             We present 4 model variations:
-            1. Melody -- a music generation model capable of generating music condition on text and melody inputs. **Note**, you can also use text only.
+            1. Melody -- a music generation model capable of generating music conditioned
+            on text and melody inputs. **Note**, you can also use text only.
             2. Small -- a 300M transformer decoder conditioned on text only.
             3. Medium -- a 1.5B transformer decoder conditioned on text only.
             4. Large -- a 3.3B transformer decoder conditioned on text only (might OOM for the longest sequences.)
 
             When using `melody`, you can optionally provide a reference audio from
-            which a broad melody will be extracted. The model will then try to follow both the description and melody provided.
+            which a broad melody will be extracted. The model will then try to follow both
+            the description and melody provided.
 
             You can also use your own GPU or a Google Colab by following the instructions on our repo.
             See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
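
The reflowed warning implies a 30-second generation window (12s of overlap plus 18s of new audio per step). A back-of-the-envelope helper for how many steps a requested duration costs, derived only from those numbers in the text above, not from the repo's actual scheduling code:

```python
import math

def generation_steps(duration: float, window: float = 30.0, overlap: float = 12.0) -> int:
    """Steps needed when each step emits `window` seconds and every step
    after the first contributes only `window - overlap` new seconds."""
    if duration <= window:
        return 1
    return 1 + math.ceil((duration - window) / (window - overlap))

print(generation_steps(10))   # 1
print(generation_steps(120))  # 1 + ceil(90 / 18) = 6
```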
@@ -248,11 +288,14 @@ def ui_batched(launch_kwargs):
             """
             # MusicGen
 
-            This is the demo for [MusicGen](https://github.com/facebookresearch/audiocraft), a simple and controllable model for music generation
+            This is the demo for [MusicGen](https://github.com/facebookresearch/audiocraft),
+            a simple and controllable model for music generation
             presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284).
             <br/>
-            <a href="https://huggingface.co/spaces/facebook/MusicGen?duplicate=true" style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
-            <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
+            <a href="https://huggingface.co/spaces/facebook/MusicGen?duplicate=true"
+                style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
+            <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;"
+                src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
             for longer sequences, more control and no queue.</p>
             """
         )
@@ -261,13 +304,16 @@ def ui_batched(launch_kwargs):
                 with gr.Row():
                     text = gr.Text(label="Describe your music", lines=2, interactive=True)
                     with gr.Column():
-                        radio = gr.Radio(["file", "mic"], value="file", label="Condition on a melody (optional) File or Mic")
-                        melody = gr.Audio(source="upload", type="numpy", label="File", interactive=True, elem_id="melody-input")
+                        radio = gr.Radio(["file", "mic"], value="file",
+                                         label="Condition on a melody (optional) File or Mic")
+                        melody = gr.Audio(source="upload", type="numpy", label="File",
+                                          interactive=True, elem_id="melody-input")
                 with gr.Row():
                     submit = gr.Button("Generate")
             with gr.Column():
                 output = gr.Video(label="Generated Music")
-        submit.click(predict_batched, inputs=[text, melody], outputs=[output], batch=True, max_batch_size=MAX_BATCH_SIZE)
+        submit.click(predict_batched, inputs=[text, melody],
+                     outputs=[output], batch=True, max_batch_size=MAX_BATCH_SIZE)
         radio.change(toggle_audio_src, radio, [melody], queue=False, show_progress=False)
         gr.Examples(
             fn=predict_batched,