soujanyaporia committed
Commit 31cd11e
1 Parent(s): 58d4d6c

Update app.py
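
In short, this commit replaces the multi-output gr.Blocks UI (seed control, 1–3 generations, chained event handlers) with a single-output gr.Interface around one generation function. Below is a minimal sketch of the new generation path using the names from the diff; `generate_audio` and its defaults are illustrative only, and `pipe` stands for the Tango2Pipeline instance that app.py builds:

```python
import wavio                    # WAV writing, as in app.py
from pydub import AudioSegment  # mp3 re-encode (requires ffmpeg on the host)

# Hypothetical wrapper mirroring the new gradio_generate; `pipe` is assumed
# to be the Tango2Pipeline instance constructed in app.py.
def generate_audio(pipe, prompt, output_format="wav", steps=100, guidance=3):
    out = pipe(prompt, steps, guidance)   # diffusion sampling, one sample
    wave = out.audios[0]                  # the single generated waveform
    filename = "temp.wav"
    wavio.write(filename, wave, rate=16000, sampwidth=2)  # 16 kHz, 16-bit PCM
    if output_format == "mp3":
        AudioSegment.from_wav("temp.wav").export("temp.mp3", format="mp3")
        filename = "temp.mp3"
    return filename                       # consumed by gr.Audio(type="filepath")
```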

Files changed (1): app.py (+55 -185)
app.py CHANGED
@@ -1,5 +1,4 @@
 import gradio as gr
-import random
 import json
 import torch
 import wavio
@@ -24,6 +23,7 @@ from tqdm import tqdm
 
 
 
+
 class Tango2Pipeline(DiffusionPipeline):
 
 
@@ -169,7 +169,6 @@ class Tango2Pipeline(DiffusionPipeline):
 
         return AudioPipelineOutput(audios=wave)
 
-max_64_bit_int = 2**63 - 1
 
 # Automatic device detection
 if torch.cuda.is_available():
@@ -250,73 +249,21 @@ pipe = Tango2Pipeline(vae=tango.vae,
                       scheduler=tango.scheduler
                       )
 
-
-def update_seed(is_randomize_seed, seed):
-    if is_randomize_seed:
-        return random.randint(0, max_64_bit_int)
-    return seed
-
-def check(
-    prompt,
-    output_format,
-    output_number,
-    steps,
-    guidance,
-    is_randomize_seed,
-    seed
-):
-    if prompt is None or prompt == "":
-        raise gr.Error("Please provide a prompt input.")
-    if not output_number in [1, 2, 3]:
-        raise gr.Error("Please ask for 1, 2 or 3 output files.")
-
-def update_output(output_format, output_number):
-    return [
-        gr.update(format = output_format),
-        gr.update(format = output_format, visible = (2 <= output_number)),
-        gr.update(format = output_format, visible = (output_number == 3))
-    ]
-
-def generate_output(output_wave, output_format, output_number, output_index):
-    if (output_number < output_index):
-        return gr.update(format = output_format, visible = False)
-
-    output_wave = output_wave.audios[output_index - 1]
-    output_filename = "tmp" + str(output_index) + ".wav"
-    wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)
-
-    if (output_format == "mp3"):
-        AudioSegment.from_wav("tmp" + str(output_index) + ".wav").export("tmp" + str(output_index) + ".mp3", format = "mp3")
-        output_filename = "tmp" + str(output_index) + ".mp3"
-
-    return gr.update(value = output_filename, format = output_format, visible = True)
-
-@spaces.GPU(duration=180)
-def gradio_generate(
-    prompt,
-    output_format,
-    output_number,
-    steps,
-    guidance,
-    is_randomize_seed,
-    seed
-):
-    if seed is None:
-        seed = random.randint(0, max_64_bit_int)
-
-    random.seed(seed)
-    torch.manual_seed(seed)
-
-    output_wave = pipe(prompt, steps, guidance, samples = output_number) ## Using the pipeline automatically uses flash attention for torch 2.0 and above
+
+@spaces.GPU(duration=60)
+def gradio_generate(prompt, output_format, steps, guidance):
+    output_wave = pipe(prompt, steps, guidance) ## Using the pipeline automatically uses flash attention for torch 2.0 and above
     #output_wave = tango.generate(prompt, steps, guidance)
     # output_filename = f"{prompt.replace(' ', '_')}_{steps}_{guidance}"[:250] + ".wav"
+    output_wave = output_wave.audios[0]
+    output_filename = "temp.wav"
+    wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)
+
+    if (output_format == "mp3"):
+        AudioSegment.from_wav("temp.wav").export("temp.mp3", format = "mp3")
+        output_filename = "temp.mp3"
 
-    return [
-        generate_output(output_wave, output_format, output_number, 1),
-        generate_output(output_wave, output_format, output_number, 2),
-        generate_output(output_wave, output_format, output_number, 3)
-    ]
+    return output_filename
 
 # description_text = """
 # <p><a href="https://huggingface.co/spaces/declare-lab/tango/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
@@ -338,130 +285,53 @@ def gradio_generate(
 # <p/>
 # """
 description_text = """
-<h1><center>Tango 2: Aligning Diffusion-based Text-to-Audio Generations through Direct Preference Optimization</center></h1>
 <p><a href="https://huggingface.co/spaces/declare-lab/tango2/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
 Generate audio using Tango2 by providing a text prompt. Tango2 was built from Tango and was trained on <a href="https://huggingface.co/datasets/declare-lab/audio-alpaca">Audio-alpaca</a>
 <br/><br/> This is the demo for Tango2 for text to audio generation: <a href="https://arxiv.org/abs/2404.09956">Read our paper.</a>
 <p/>
 """
+# Gradio input and output components
+input_text = gr.Textbox(lines=2, label="Prompt")
+output_format = gr.Radio(label = "Output format", info = "The file you can download", choices = ["mp3", "wav"], value = "wav")
+output_audio = gr.Audio(label="Generated Audio", type="filepath")
+denoising_steps = gr.Slider(minimum=100, maximum=200, value=100, step=1, label="Steps", interactive=True)
+guidance_scale = gr.Slider(minimum=1, maximum=10, value=3, step=0.1, label="Guidance Scale", interactive=True)
 
 # Gradio interface
-with gr.Blocks() as interface:
-    gr.HTML(description_text)
-    with gr.Row():
-        with gr.Column():
-            input_text = gr.Textbox(lines=2, label="Prompt")
-            output_format = gr.Radio(label = "Output format", info = "The file you can download", choices = ["mp3", "wav"], value = "wav")
-            output_number = gr.Slider(label = "Number of generations", info = "1, 2 or 3 output files", minimum = 1, maximum = 3, value = 1, step = 1, interactive = True)
-            denoising_steps = gr.Slider(minimum=10, maximum=200, value=100, step=1, label="Steps", interactive=True)
-            guidance_scale = gr.Slider(minimum=1, maximum=10, value=3, step=0.1, label="Guidance Scale", interactive=True)
-            randomize_seed = gr.Checkbox(label = "\U0001F3B2 Randomize seed", value = True, info = "If checked, result is always different")
-            seed = gr.Slider(minimum = 0, maximum = max_64_bit_int, step = 1, randomize = True, label = "Seed")
-
-            submit = gr.Button("Generate", variant = "primary")
-
-        with gr.Column():
-            output_audio_1 = gr.Audio(label = "Generated Audio #1/3", format = "wav", type="numpy")
-            output_audio_2 = gr.Audio(label = "Generated Audio #2/3", format = "wav", type="numpy")
-            output_audio_3 = gr.Audio(label = "Generated Audio #3/3", format = "wav", type="numpy")
-
-    submit.click(fn = update_seed, inputs = [
-        randomize_seed,
-        seed
-    ], outputs = [
-        seed
-    ], queue = False, show_progress = False).then(fn = check, inputs = [
-        input_text,
-        output_format,
-        output_number,
-        denoising_steps,
-        guidance_scale,
-        randomize_seed,
-        seed
-    ], outputs = [], queue = False, show_progress = False).success(fn = update_output, inputs = [
-        output_format,
-        output_number
-    ], outputs = [
-        output_audio_1,
-        output_audio_2,
-        output_audio_3
-    ], queue = False, show_progress = False).success(fn = gradio_generate, inputs = [
-        input_text,
-        output_format,
-        output_number,
-        denoising_steps,
-        guidance_scale,
-        randomize_seed,
-        seed
-    ], outputs = [
-        output_audio_1,
-        output_audio_2,
-        output_audio_3
-    ], scroll_to_output = True)
-
-    gr.Examples(
-        fn = gradio_generate,
-        inputs = [
-            input_text,
-            output_format,
-            output_number,
-            denoising_steps,
-            guidance_scale,
-            randomize_seed,
-            seed
-        ],
-        outputs = [
-            output_audio_1,
-            output_audio_2,
-            output_audio_3
-        ],
-        examples = [
-            ["Quiet speech and then an airplane flying away", "wav", 3, 200, 3, False, 123],
-            ["A bicycle peddling on dirt and gravel followed by a man speaking then laughing", "wav", 3, 200, 3, False, 123],
-            ["Ducks quack and water splashes with some animal screeching in the background", "wav", 3, 200, 3, False, 123],
-            ["Describe the sound of the ocean", "wav", 3, 200, 3, False, 123],
-            ["A woman and a baby are having a conversation", "wav", 3, 200, 3, False, 123],
-            ["A man speaks followed by a popping noise and laughter", "wav", 3, 200, 3, False, 123],
-            ["A cup is filled from a faucet", "wav", 3, 200, 3, False, 123],
-            ["An audience cheering and clapping", "wav", 3, 200, 3, False, 123],
-            ["Rolling thunder with lightning strikes", "wav", 3, 200, 3, False, 123],
-            ["A dog barking and a cat mewing and a racing car passes by", "wav", 3, 200, 3, False, 123],
-            ["Gentle water stream, birds chirping and sudden gun shot", "wav", 3, 200, 3, False, 123],
-            ["A man talking followed by a goat baaing then a metal gate sliding shut as ducks quack and wind blows into a microphone.", "wav", 3, 200, 3, False, 123],
-            ["A dog barking", "wav", 3, 200, 3, False, 123],
-            ["A cat meowing", "wav", 3, 200, 3, False, 123],
-            ["Wooden table tapping sound while water pouring", "wav", 3, 200, 3, False, 123],
-            ["Applause from a crowd with distant clicking and a man speaking over a loudspeaker", "wav", 3, 200, 3, False, 123],
-            ["two gunshots followed by birds flying away while chirping", "wav", 3, 200, 3, False, 123],
-            ["Whistling with birds chirping", "wav", 3, 200, 3, False, 123],
-            ["A person snoring", "wav", 3, 200, 3, False, 123],
-            ["Motor vehicles are driving with loud engines and a person whistles", "wav", 3, 200, 3, False, 123],
-            ["People cheering in a stadium while thunder and lightning strikes", "wav", 3, 200, 3, False, 123],
-            ["A helicopter is in flight", "wav", 3, 200, 3, False, 123],
-            ["A dog barking and a man talking and a racing car passes by", "wav", 3, 200, 3, False, 123],
-        ],
-        cache_examples = "lazy",
-    )
-
-    gr.Markdown(
-        """
-        ## How to prompt your sound
-        You can use round brackets to increase the importance of a part:
-        ```
-        Peaceful and (calming) ambient music with singing bowl and other instruments
-        ```
-        You can use several levels of round brackets to increase the importance of a part even more:
-        ```
-        (Peaceful) and ((calming)) ambient music with singing bowl and other instruments
-        ```
-        You can use a number instead of several round brackets:
-        ```
-        (Peaceful:1.5) and ((calming)) ambient music with singing bowl and other instruments
-        ```
-        You can do the same thing with square brackets to decrease the importance of a part:
-        ```
-        (Peaceful:1.5) and ((calming)) ambient music with [singing:2] bowl and other instruments
-        """
-    )
-
-interface.queue(10).launch()
+gr_interface = gr.Interface(
+    fn=gradio_generate,
+    inputs=[input_text, output_format, denoising_steps, guidance_scale],
+    outputs=[output_audio],
+    title="Tango 2: Aligning Diffusion-based Text-to-Audio Generations through Direct Preference Optimization",
+    description=description_text,
+    allow_flagging=False,
+    examples=[
+        ["Quiet speech and then an airplane flying away"],
+        ["A bicycle peddling on dirt and gravel followed by a man speaking then laughing"],
+        ["Ducks quack and water splashes with some animal screeching in the background"],
+        ["Describe the sound of the ocean"],
+        ["A woman and a baby are having a conversation"],
+        ["A man speaks followed by a popping noise and laughter"],
+        ["A cup is filled from a faucet"],
+        ["An audience cheering and clapping"],
+        ["Rolling thunder with lightning strikes"],
+        ["A dog barking and a cat mewing and a racing car passes by"],
+        ["Gentle water stream, birds chirping and sudden gun shot"],
+        ["A man talking followed by a goat baaing then a metal gate sliding shut as ducks quack and wind blows into a microphone."],
+        ["A dog barking"],
+        ["A cat meowing"],
+        ["Wooden table tapping sound while water pouring"],
+        ["Applause from a crowd with distant clicking and a man speaking over a loudspeaker"],
+        ["two gunshots followed by birds flying away while chirping"],
+        ["Whistling with birds chirping"],
+        ["A person snoring"],
+        ["Motor vehicles are driving with loud engines and a person whistles"],
+        ["People cheering in a stadium while thunder and lightning strikes"],
+        ["A helicopter is in flight"],
+        ["A dog barking and a man talking and a racing car passes by"],
+    ],
+    cache_examples="lazy", # Turn on to cache.
+)
+
+# Launch Gradio app
+gr_interface.queue(10).launch()
 
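
Once deployed, the simplified gr.Interface exposes a single prediction endpoint, so the Space can also be driven programmatically. A sketch with gradio_client follows; the Space id is inferred from the duplicate-Space URL above, and the default /predict route is an assumption:

```python
from gradio_client import Client

# Space id inferred from the URLs in description_text; the endpoint name is
# assumed to be the gr.Interface default ("/predict").
client = Client("declare-lab/tango2")
audio_path = client.predict(
    "An audience cheering and clapping",  # prompt
    "wav",                                # output format
    100,                                  # denoising steps
    3,                                    # guidance scale
    api_name="/predict",
)
print(audio_path)  # local path to the generated audio file
```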