deepanway committed on
Commit
a664672
1 Parent(s): 2301775

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -4
app.py CHANGED
@@ -79,9 +79,14 @@ def gradio_generate(prompt, steps, guidance):
79
 
80
  return output_filename
81
 
82
- description_text = '''
83
- TANGO is a latent diffusion model (LDM) for text-to-audio (TTA) generation. TANGO can generate realistic audios including human sounds, animal sounds, natural and artificial sounds and sound effects from textual prompts. We use the frozen instruction-tuned LLM Flan-T5 as the text encoder and train a UNet based diffusion model for audio generation. We perform comparably to current state-of-the-art models for TTA across both objective and subjective metrics, despite training the LDM on a 63 times smaller dataset. We release our model, training, inference code, and pre-trained checkpoints for the research community.
84
- '''
 
 
 
 
 
85
 
86
  # Gradio input and output components
87
  input_text = gr.inputs.Textbox(lines=2, label="Prompt")
@@ -95,7 +100,7 @@ gr_interface = gr.Interface(
95
  inputs=[input_text, denoising_steps, guidance_scale],
96
  outputs=[output_audio],
97
  title="TANGO: Text to Audio using Instruction-Guided Diffusion",
98
- description="Generate audio using TANGO by providing a text prompt.",
99
  allow_flagging=False,
100
  examples=[
101
  ["An audience cheering and clapping"],
@@ -104,7 +109,9 @@ gr_interface = gr.Interface(
104
  ["A car engine revving"],
105
  ["A dog barking"],
106
  ["A cat meowing"],
 
107
  ["Emergency sirens wailing"],
 
108
  ["Whistling with birds chirping"],
109
  ["A person snoring"],
110
  ["Motor vehicles are driving with loud engines and a person whistles"],
 
79
 
80
  return output_filename
81
 
82
+ description_text = "Generate audio using TANGO by providing a text prompt. \
83
+ \n\nLimitations: TANGO is trained on the small AudioCaps dataset so it may not generate good audio \
84
+ samples related to concepts that it has not seen in training (e.g. singing). For the same reason, TANGO \
85
+ is not always able to finely control its generations over textual control prompts. For example, \
86
+ the generations from TANGO for prompts Chopping tomatoes on a wooden table and Chopping potatoes \
87
+ on a metal table are very similar. \
88
+ \n\nWe are currently training another version of TANGO on larger datasets to enhance its generalization, \
89
+ compositional and controllable generation ability."
90
 
91
  # Gradio input and output components
92
  input_text = gr.inputs.Textbox(lines=2, label="Prompt")
 
100
  inputs=[input_text, denoising_steps, guidance_scale],
101
  outputs=[output_audio],
102
  title="TANGO: Text to Audio using Instruction-Guided Diffusion",
103
+ description=description_text,
104
  allow_flagging=False,
105
  examples=[
106
  ["An audience cheering and clapping"],
 
109
  ["A car engine revving"],
110
  ["A dog barking"],
111
  ["A cat meowing"],
112
+ ["Wooden table tapping sound while water pouring"],
113
  ["Emergency sirens wailing"],
114
+ ["two gunshots followed by birds flying away while chirping"],
115
  ["Whistling with birds chirping"],
116
  ["A person snoring"],
117
  ["Motor vehicles are driving with loud engines and a person whistles"],