Files changed (1) hide show
  1. app.py +9 -3
app.py CHANGED
@@ -7,6 +7,7 @@ from huggingface_hub import snapshot_download
7
  from models import AudioDiffusion, DDPMScheduler
8
  from audioldm.audio.stft import TacotronSTFT
9
  from audioldm.variational_autoencoder import AutoencoderKL
 
10
  from gradio import Markdown
11
  import spaces
12
 
@@ -83,12 +84,16 @@ tango.stft.to(device_type)
83
  tango.model.to(device_type)
84
 
85
  @spaces.GPU(duration=60)
86
- def gradio_generate(prompt, steps, guidance):
87
  output_wave = tango.generate(prompt, steps, guidance)
88
  # output_filename = f"{prompt.replace(' ', '_')}_{steps}_{guidance}"[:250] + ".wav"
89
  output_filename = "temp.wav"
90
  wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)
91
-
 
 
 
 
92
  return output_filename
93
 
94
  # description_text = """
@@ -118,6 +123,7 @@ Generate audio using Tango2 by providing a text prompt. Tango2 was built from Ta
118
  """
119
  # Gradio input and output components
120
  input_text = gr.Textbox(lines=2, label="Prompt")
 
121
  output_audio = gr.Audio(label="Generated Audio", type="filepath")
122
  denoising_steps = gr.Slider(minimum=100, maximum=200, value=100, step=1, label="Steps", interactive=True)
123
  guidance_scale = gr.Slider(minimum=1, maximum=10, value=3, step=0.1, label="Guidance Scale", interactive=True)
@@ -125,7 +131,7 @@ guidance_scale = gr.Slider(minimum=1, maximum=10, value=3, step=0.1, label="Guid
125
  # Gradio interface
126
  gr_interface = gr.Interface(
127
  fn=gradio_generate,
128
- inputs=[input_text, denoising_steps, guidance_scale],
129
  outputs=[output_audio],
130
  title="Tango 2: Aligning Diffusion-based Text-to-Audio Generations through Direct Preference Optimization",
131
  description=description_text,
 
7
  from models import AudioDiffusion, DDPMScheduler
8
  from audioldm.audio.stft import TacotronSTFT
9
  from audioldm.variational_autoencoder import AutoencoderKL
10
+ from pydub import AudioSegment
11
  from gradio import Markdown
12
  import spaces
13
 
 
84
  tango.model.to(device_type)
85
 
86
  @spaces.GPU(duration=60)
87
+ def gradio_generate(prompt, output_format, steps, guidance):
88
  output_wave = tango.generate(prompt, steps, guidance)
89
  # output_filename = f"{prompt.replace(' ', '_')}_{steps}_{guidance}"[:250] + ".wav"
90
  output_filename = "temp.wav"
91
  wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)
92
+
93
+ if (output_format == "mp3"):
94
+ AudioSegment.from_wav("temp.wav").export("temp.mp3", format = "mp3")
95
+ output_filename = "temp.mp3"
96
+
97
  return output_filename
98
 
99
  # description_text = """
 
123
  """
124
# Gradio input and output components
input_text = gr.Textbox(lines=2, label="Prompt")
output_format = gr.Radio(
    label="Output format",
    info="The file you can download",  # fixed typo: "dowload" -> "download"
    choices=["mp3", "wav"],
    value="wav",
)
output_audio = gr.Audio(label="Generated Audio", type="filepath")
denoising_steps = gr.Slider(minimum=100, maximum=200, value=100, step=1, label="Steps", interactive=True)
guidance_scale = gr.Slider(minimum=1, maximum=10, value=3, step=0.1, label="Guidance Scale", interactive=True)
 
131
  # Gradio interface
132
  gr_interface = gr.Interface(
133
  fn=gradio_generate,
134
+ inputs=[input_text, output_format, denoising_steps, guidance_scale],
135
  outputs=[output_audio],
136
  title="Tango 2: Aligning Diffusion-based Text-to-Audio Generations through Direct Preference Optimization",
137
  description=description_text,