Spaces:
Runtime error
Runtime error
1, 2 or 3 output files
#3
by
Fabrice-TIERCELIN
- opened
app.py
CHANGED
@@ -49,12 +49,12 @@ class Tango:
|
|
49 |
self.scheduler = DDPMScheduler.from_pretrained(main_config["scheduler_name"], subfolder="scheduler")
|
50 |
|
51 |
def chunks(self, lst, n):
|
52 |
-
|
53 |
for i in range(0, len(lst), n):
|
54 |
yield lst[i:i + n]
|
55 |
|
56 |
def generate(self, prompt, steps=100, guidance=3, samples=3, disable_progress=True):
|
57 |
-
|
58 |
with torch.no_grad():
|
59 |
latents = self.model.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
|
60 |
mel = self.vae.decode_first_stage(latents)
|
@@ -62,7 +62,7 @@ class Tango:
|
|
62 |
return wave
|
63 |
|
64 |
def generate_for_batch(self, prompts, steps=200, guidance=3, samples=1, batch_size=8, disable_progress=True):
|
65 |
-
|
66 |
outputs = []
|
67 |
for k in tqdm(range(0, len(prompts), batch_size)):
|
68 |
batch = prompts[k: k+batch_size]
|
@@ -84,24 +84,42 @@ tango.stft.to(device_type)
|
|
84 |
tango.model.to(device_type)
|
85 |
|
86 |
@spaces.GPU(duration=120)
|
87 |
-
def gradio_generate(
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
# output_filename = f"{prompt.replace(' ', '_')}_{steps}_{guidance}"[:250] + ".wav"
|
90 |
|
91 |
output_filename_1 = "tmp1.wav"
|
92 |
-
wavio.write(output_filename_1, output_wave[0], rate=16000, sampwidth=2)
|
93 |
-
output_filename_2 = "tmp2.wav"
|
94 |
-
wavio.write(output_filename_2, output_wave[1], rate=16000, sampwidth=2)
|
95 |
-
output_filename_3 = "tmp3.wav"
|
96 |
-
wavio.write(output_filename_3, output_wave[2], rate=16000, sampwidth=2)
|
97 |
|
98 |
if (output_format == "mp3"):
|
99 |
AudioSegment.from_wav("tmp1.wav").export("tmp1.mp3", format = "mp3")
|
100 |
output_filename_1 = "tmp1.mp3"
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
|
106 |
return [output_filename_1, output_filename_2, output_filename_3]
|
107 |
|
@@ -133,16 +151,17 @@ Generate audio using Tango2 by providing a text prompt. Tango2 was built from Ta
|
|
133 |
# Gradio input and output components
|
134 |
input_text = gr.Textbox(lines=2, label="Prompt")
|
135 |
output_format = gr.Radio(label = "Output format", info = "The file you can download", choices = ["mp3", "wav"], value = "wav")
|
|
|
136 |
output_audio_1 = gr.Audio(label="Generated Audio #1/3", type="filepath")
|
137 |
output_audio_2 = gr.Audio(label="Generated Audio #2/3", type="filepath")
|
138 |
output_audio_3 = gr.Audio(label="Generated Audio #3/3", type="filepath")
|
139 |
-
denoising_steps = gr.Slider(minimum=
|
140 |
guidance_scale = gr.Slider(minimum=1, maximum=10, value=3, step=0.1, label="Guidance Scale", interactive=True)
|
141 |
|
142 |
# Gradio interface
|
143 |
gr_interface = gr.Interface(
|
144 |
fn=gradio_generate,
|
145 |
-
inputs=[input_text, output_format, denoising_steps, guidance_scale],
|
146 |
outputs=[output_audio_1, output_audio_2, output_audio_3],
|
147 |
title="Tango 2: Aligning Diffusion-based Text-to-Audio Generations through Direct Preference Optimization",
|
148 |
description=description_text,
|
|
|
49 |
self.scheduler = DDPMScheduler.from_pretrained(main_config["scheduler_name"], subfolder="scheduler")
|
50 |
|
51 |
def chunks(self, lst, n):
|
52 |
+
# Yield successive n-sized chunks from a list
|
53 |
for i in range(0, len(lst), n):
|
54 |
yield lst[i:i + n]
|
55 |
|
56 |
def generate(self, prompt, steps=100, guidance=3, samples=3, disable_progress=True):
|
57 |
+
# Genrate audio for a single prompt string
|
58 |
with torch.no_grad():
|
59 |
latents = self.model.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
|
60 |
mel = self.vae.decode_first_stage(latents)
|
|
|
62 |
return wave
|
63 |
|
64 |
def generate_for_batch(self, prompts, steps=200, guidance=3, samples=1, batch_size=8, disable_progress=True):
|
65 |
+
# Genrate audio for a list of prompt strings
|
66 |
outputs = []
|
67 |
for k in tqdm(range(0, len(prompts), batch_size)):
|
68 |
batch = prompts[k: k+batch_size]
|
|
|
84 |
tango.model.to(device_type)
|
85 |
|
86 |
@spaces.GPU(duration=120)
|
87 |
+
def gradio_generate(
|
88 |
+
prompt,
|
89 |
+
output_format,
|
90 |
+
output_number,
|
91 |
+
steps,
|
92 |
+
guidance
|
93 |
+
):
|
94 |
+
output_wave = tango.generate(prompt, steps, guidance, output_number)
|
95 |
# output_filename = f"{prompt.replace(' ', '_')}_{steps}_{guidance}"[:250] + ".wav"
|
96 |
|
97 |
output_filename_1 = "tmp1.wav"
|
98 |
+
wavio.write(output_filename_1, output_wave[0], rate = 16000, sampwidth = 2)
|
|
|
|
|
|
|
|
|
99 |
|
100 |
if (output_format == "mp3"):
|
101 |
AudioSegment.from_wav("tmp1.wav").export("tmp1.mp3", format = "mp3")
|
102 |
output_filename_1 = "tmp1.mp3"
|
103 |
+
|
104 |
+
if (2 <= output_number):
|
105 |
+
output_filename_2 = "tmp2.wav"
|
106 |
+
wavio.write(output_filename_2, output_wave[1], rate = 16000, sampwidth = 2)
|
107 |
+
|
108 |
+
if (output_format == "mp3"):
|
109 |
+
AudioSegment.from_wav("tmp2.wav").export("tmp2.mp3", format = "mp3")
|
110 |
+
output_filename_2 = "tmp2.mp3"
|
111 |
+
else:
|
112 |
+
output_filename_2 = None
|
113 |
+
|
114 |
+
if (output_number == 3):
|
115 |
+
output_filename_3 = "tmp3.wav"
|
116 |
+
wavio.write(output_filename_3, output_wave[2], rate = 16000, sampwidth = 2)
|
117 |
+
|
118 |
+
if (output_format == "mp3"):
|
119 |
+
AudioSegment.from_wav("tmp3.wav").export("tmp3.mp3", format = "mp3")
|
120 |
+
output_filename_3 = "tmp3.mp3"
|
121 |
+
else:
|
122 |
+
output_filename_3 = None
|
123 |
|
124 |
return [output_filename_1, output_filename_2, output_filename_3]
|
125 |
|
|
|
151 |
# Gradio input and output components
|
152 |
input_text = gr.Textbox(lines=2, label="Prompt")
|
153 |
output_format = gr.Radio(label = "Output format", info = "The file you can download", choices = ["mp3", "wav"], value = "wav")
|
154 |
+
output_number = gr.Slider(label = "Number of generations", info = "1, 2 or 3 output files", minimum = 1, maximum = 3, value = 3, step = 1, interactive = True)
|
155 |
output_audio_1 = gr.Audio(label="Generated Audio #1/3", type="filepath")
|
156 |
output_audio_2 = gr.Audio(label="Generated Audio #2/3", type="filepath")
|
157 |
output_audio_3 = gr.Audio(label="Generated Audio #3/3", type="filepath")
|
158 |
+
denoising_steps = gr.Slider(minimum=10, maximum=200, value=100, step=1, label="Steps", interactive=True)
|
159 |
guidance_scale = gr.Slider(minimum=1, maximum=10, value=3, step=0.1, label="Guidance Scale", interactive=True)
|
160 |
|
161 |
# Gradio interface
|
162 |
gr_interface = gr.Interface(
|
163 |
fn=gradio_generate,
|
164 |
+
inputs=[input_text, output_format, output_number, denoising_steps, guidance_scale],
|
165 |
outputs=[output_audio_1, output_audio_2, output_audio_3],
|
166 |
title="Tango 2: Aligning Diffusion-based Text-to-Audio Generations through Direct Preference Optimization",
|
167 |
description=description_text,
|