soujanyaporia committed
Commit 31cd11e
1 Parent(s): 58d4d6c

Update app.py
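
In short, this commit replaces the multi-output gr.Blocks UI (seed control, 1–3 generations, chained event handlers) with a single-output gr.Interface around one generation function. Below is a minimal sketch of the new generation path using the names from the diff; `generate_audio` and its defaults are illustrative only, and `pipe` stands for the Tango2Pipeline instance that app.py builds:

```python
import wavio                    # WAV writing, as in app.py
from pydub import AudioSegment  # mp3 re-encode (requires ffmpeg on the host)

# Hypothetical wrapper mirroring the new gradio_generate; `pipe` is assumed
# to be the Tango2Pipeline instance constructed in app.py.
def generate_audio(pipe, prompt, output_format="wav", steps=100, guidance=3):
    out = pipe(prompt, steps, guidance)   # diffusion sampling, one sample
    wave = out.audios[0]                  # the single generated waveform
    filename = "temp.wav"
    wavio.write(filename, wave, rate=16000, sampwidth=2)  # 16 kHz, 16-bit PCM
    if output_format == "mp3":
        AudioSegment.from_wav("temp.wav").export("temp.mp3", format="mp3")
        filename = "temp.mp3"
    return filename                       # consumed by gr.Audio(type="filepath")
```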

Files changed (1): app.py (+55 -185)
app.py CHANGED
@@ -1,5 +1,4 @@
 import gradio as gr
-import random
 import json
 import torch
 import wavio
@@ -24,6 +23,7 @@ from tqdm import tqdm
 
 
 
+
 class Tango2Pipeline(DiffusionPipeline):
 
 
@@ -169,7 +169,6 @@ class Tango2Pipeline(DiffusionPipeline):
 
         return AudioPipelineOutput(audios=wave)
 
-max_64_bit_int = 2**63 - 1
 
 # Automatic device detection
 if torch.cuda.is_available():
@@ -250,73 +249,21 @@ pipe = Tango2Pipeline(vae=tango.vae,
                       scheduler=tango.scheduler
                       )
 
-
-def update_seed(is_randomize_seed, seed):
-    if is_randomize_seed:
-        return random.randint(0, max_64_bit_int)
-    return seed
-
-def check(
-    prompt,
-    output_format,
-    output_number,
-    steps,
-    guidance,
-    is_randomize_seed,
-    seed
-):
-    if prompt is None or prompt == "":
-        raise gr.Error("Please provide a prompt input.")
-    if not output_number in [1, 2, 3]:
-        raise gr.Error("Please ask for 1, 2 or 3 output files.")
-
-def update_output(output_format, output_number):
-    return [
-        gr.update(format = output_format),
-        gr.update(format = output_format, visible = (2 <= output_number)),
-        gr.update(format = output_format, visible = (output_number == 3))
-    ]
-
-def generate_output(output_wave, output_format, output_number, output_index):
-    if (output_number < output_index):
-        return gr.update(format = output_format, visible = False)
-
-    output_wave = output_wave.audios[output_index - 1]
-    output_filename = "tmp" + str(output_index) + ".wav"
-    wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)
-
-    if (output_format == "mp3"):
-        AudioSegment.from_wav("tmp" + str(output_index) + ".wav").export("tmp" + str(output_index) + ".mp3", format = "mp3")
-        output_filename = "tmp" + str(output_index) + ".mp3"
-
-    return gr.update(value = output_filename, format = output_format, visible = True)
-
-@spaces.GPU(duration=180)
-def gradio_generate(
-    prompt,
-    output_format,
-    output_number,
-    steps,
-    guidance,
-    is_randomize_seed,
-    seed
-):
-    if seed is None:
-        seed = random.randint(0, max_64_bit_int)
-
-    random.seed(seed)
-    torch.manual_seed(seed)
-
-    output_wave = pipe(prompt, steps, guidance, samples = output_number) ## Using the pipeline automatically uses flash attention for torch 2.0 and above
+
+@spaces.GPU(duration=60)
+def gradio_generate(prompt, output_format, steps, guidance):
+    output_wave = pipe(prompt, steps, guidance) ## Using the pipeline automatically uses flash attention for torch 2.0 and above
     #output_wave = tango.generate(prompt, steps, guidance)
     # output_filename = f"{prompt.replace(' ', '_')}_{steps}_{guidance}"[:250] + ".wav"
+    output_wave = output_wave.audios[0]
+    output_filename = "temp.wav"
+    wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)
+
+    if (output_format == "mp3"):
+        AudioSegment.from_wav("temp.wav").export("temp.mp3", format = "mp3")
+        output_filename = "temp.mp3"
 
-    return [
-        generate_output(output_wave, output_format, output_number, 1),
-        generate_output(output_wave, output_format, output_number, 2),
-        generate_output(output_wave, output_format, output_number, 3)
-    ]
+    return output_filename
 
 # description_text = """
 # <p><a href="https://huggingface.co/spaces/declare-lab/tango/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
@@ -338,130 +285,53 @@ def gradio_generate(
 # <p/>
 # """
 description_text = """
-<h1><center>Tango 2: Aligning Diffusion-based Text-to-Audio Generations through Direct Preference Optimization</center></h1>
 <p><a href="https://huggingface.co/spaces/declare-lab/tango2/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
 Generate audio using Tango2 by providing a text prompt. Tango2 was built from Tango and was trained on <a href="https://huggingface.co/datasets/declare-lab/audio-alpaca">Audio-alpaca</a>
 <br/><br/> This is the demo for Tango2 for text to audio generation: <a href="https://arxiv.org/abs/2404.09956">Read our paper.</a>
 <p/>
 """
+# Gradio input and output components
+input_text = gr.Textbox(lines=2, label="Prompt")
+output_format = gr.Radio(label = "Output format", info = "The file you can download", choices = ["mp3", "wav"], value = "wav")
+output_audio = gr.Audio(label="Generated Audio", type="filepath")
+denoising_steps = gr.Slider(minimum=100, maximum=200, value=100, step=1, label="Steps", interactive=True)
+guidance_scale = gr.Slider(minimum=1, maximum=10, value=3, step=0.1, label="Guidance Scale", interactive=True)
 
 # Gradio interface
-with gr.Blocks() as interface:
-    gr.HTML(description_text)
-    with gr.Row():
-        with gr.Column():
-            input_text = gr.Textbox(lines=2, label="Prompt")
-            output_format = gr.Radio(label = "Output format", info = "The file you can download", choices = ["mp3", "wav"], value = "wav")
-            output_number = gr.Slider(label = "Number of generations", info = "1, 2 or 3 output files", minimum = 1, maximum = 3, value = 1, step = 1, interactive = True)
-            denoising_steps = gr.Slider(minimum=10, maximum=200, value=100, step=1, label="Steps", interactive=True)
-            guidance_scale = gr.Slider(minimum=1, maximum=10, value=3, step=0.1, label="Guidance Scale", interactive=True)
-            randomize_seed = gr.Checkbox(label = "\U0001F3B2 Randomize seed", value = True, info = "If checked, result is always different")
-            seed = gr.Slider(minimum = 0, maximum = max_64_bit_int, step = 1, randomize = True, label = "Seed")
-
-            submit = gr.Button("Generate", variant = "primary")
-
-        with gr.Column():
-            output_audio_1 = gr.Audio(label = "Generated Audio #1/3", format = "wav", type="numpy")
-            output_audio_2 = gr.Audio(label = "Generated Audio #2/3", format = "wav", type="numpy")
-            output_audio_3 = gr.Audio(label = "Generated Audio #3/3", format = "wav", type="numpy")
-
-    submit.click(fn = update_seed, inputs = [
-        randomize_seed,
-        seed
-    ], outputs = [
-        seed
-    ], queue = False, show_progress = False).then(fn = check, inputs = [
-        input_text,
-        output_format,
-        output_number,
-        denoising_steps,
-        guidance_scale,
-        randomize_seed,
-        seed
-    ], outputs = [], queue = False, show_progress = False).success(fn = update_output, inputs = [
-        output_format,
-        output_number
-    ], outputs = [
-        output_audio_1,
-        output_audio_2,
-        output_audio_3
-    ], queue = False, show_progress = False).success(fn = gradio_generate, inputs = [
-        input_text,
-        output_format,
-        output_number,
-        denoising_steps,
-        guidance_scale,
-        randomize_seed,
-        seed
-    ], outputs = [
-        output_audio_1,
-        output_audio_2,
-        output_audio_3
-    ], scroll_to_output = True)
-
-    gr.Examples(
-        fn = gradio_generate,
-        inputs = [
-            input_text,
-            output_format,
-            output_number,
-            denoising_steps,
-            guidance_scale,
-            randomize_seed,
-            seed
-        ],
-        outputs = [
-            output_audio_1,
-            output_audio_2,
-            output_audio_3
-        ],
-        examples = [
-            ["Quiet speech and then an airplane flying away", "wav", 3, 200, 3, False, 123],
-            ["A bicycle peddling on dirt and gravel followed by a man speaking then laughing", "wav", 3, 200, 3, False, 123],
-            ["Ducks quack and water splashes with some animal screeching in the background", "wav", 3, 200, 3, False, 123],
-            ["Describe the sound of the ocean", "wav", 3, 200, 3, False, 123],
-            ["A woman and a baby are having a conversation", "wav", 3, 200, 3, False, 123],
-            ["A man speaks followed by a popping noise and laughter", "wav", 3, 200, 3, False, 123],
-            ["A cup is filled from a faucet", "wav", 3, 200, 3, False, 123],
-            ["An audience cheering and clapping", "wav", 3, 200, 3, False, 123],
-            ["Rolling thunder with lightning strikes", "wav", 3, 200, 3, False, 123],
-            ["A dog barking and a cat mewing and a racing car passes by", "wav", 3, 200, 3, False, 123],
-            ["Gentle water stream, birds chirping and sudden gun shot", "wav", 3, 200, 3, False, 123],
-            ["A man talking followed by a goat baaing then a metal gate sliding shut as ducks quack and wind blows into a microphone.", "wav", 3, 200, 3, False, 123],
-            ["A dog barking", "wav", 3, 200, 3, False, 123],
-            ["A cat meowing", "wav", 3, 200, 3, False, 123],
-            ["Wooden table tapping sound while water pouring", "wav", 3, 200, 3, False, 123],
-            ["Applause from a crowd with distant clicking and a man speaking over a loudspeaker", "wav", 3, 200, 3, False, 123],
-            ["two gunshots followed by birds flying away while chirping", "wav", 3, 200, 3, False, 123],
-            ["Whistling with birds chirping", "wav", 3, 200, 3, False, 123],
-            ["A person snoring", "wav", 3, 200, 3, False, 123],
-            ["Motor vehicles are driving with loud engines and a person whistles", "wav", 3, 200, 3, False, 123],
-            ["People cheering in a stadium while thunder and lightning strikes", "wav", 3, 200, 3, False, 123],
-            ["A helicopter is in flight", "wav", 3, 200, 3, False, 123],
-            ["A dog barking and a man talking and a racing car passes by", "wav", 3, 200, 3, False, 123],
-        ],
-        cache_examples = "lazy",
-    )
-
-    gr.Markdown(
-        """
-        ## How to prompt your sound
-        You can use round brackets to increase the importance of a part:
-        ```
-        Peaceful and (calming) ambient music with singing bowl and other instruments
-        ```
-        You can use several levels of round brackets to increase the importance of a part even more:
-        ```
-        (Peaceful) and ((calming)) ambient music with singing bowl and other instruments
-        ```
-        You can use a number instead of several round brackets:
-        ```
-        (Peaceful:1.5) and ((calming)) ambient music with singing bowl and other instruments
-        ```
-        You can do the same thing with square brackets to decrease the importance of a part:
-        ```
-        (Peaceful:1.5) and ((calming)) ambient music with [singing:2] bowl and other instruments
-        """
-    )
-
-interface.queue(10).launch()
+gr_interface = gr.Interface(
+    fn=gradio_generate,
+    inputs=[input_text, output_format, denoising_steps, guidance_scale],
+    outputs=[output_audio],
+    title="Tango 2: Aligning Diffusion-based Text-to-Audio Generations through Direct Preference Optimization",
+    description=description_text,
+    allow_flagging=False,
+    examples=[
+        ["Quiet speech and then an airplane flying away"],
+        ["A bicycle peddling on dirt and gravel followed by a man speaking then laughing"],
+        ["Ducks quack and water splashes with some animal screeching in the background"],
+        ["Describe the sound of the ocean"],
+        ["A woman and a baby are having a conversation"],
+        ["A man speaks followed by a popping noise and laughter"],
+        ["A cup is filled from a faucet"],
+        ["An audience cheering and clapping"],
+        ["Rolling thunder with lightning strikes"],
+        ["A dog barking and a cat mewing and a racing car passes by"],
+        ["Gentle water stream, birds chirping and sudden gun shot"],
+        ["A man talking followed by a goat baaing then a metal gate sliding shut as ducks quack and wind blows into a microphone."],
+        ["A dog barking"],
+        ["A cat meowing"],
+        ["Wooden table tapping sound while water pouring"],
+        ["Applause from a crowd with distant clicking and a man speaking over a loudspeaker"],
+        ["two gunshots followed by birds flying away while chirping"],
+        ["Whistling with birds chirping"],
+        ["A person snoring"],
+        ["Motor vehicles are driving with loud engines and a person whistles"],
+        ["People cheering in a stadium while thunder and lightning strikes"],
+        ["A helicopter is in flight"],
+        ["A dog barking and a man talking and a racing car passes by"],
+    ],
+    cache_examples="lazy", # Turn on to cache.
+)
+
+# Launch Gradio app
+gr_interface.queue(10).launch()
 
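
Once deployed, the simplified gr.Interface exposes a single prediction endpoint, so the Space can also be driven programmatically. A sketch with gradio_client follows; the Space id is inferred from the duplicate-Space URL above, and the default /predict route is an assumption:

```python
from gradio_client import Client

# Space id inferred from the URLs in description_text; the endpoint name is
# assumed to be the gr.Interface default ("/predict").
client = Client("declare-lab/tango2")
audio_path = client.predict(
    "An audience cheering and clapping",  # prompt
    "wav",                                # output format
    100,                                  # denoising steps
    3,                                    # guidance scale
    api_name="/predict",
)
print(audio_path)  # local path to the generated audio file
```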