haoheliu committed
Commit b9719f4
1 Parent(s): a08f8e0

Update app.py

Files changed (1):
  1. app.py +240 -1
app.py CHANGED
@@ -19,6 +19,244 @@ pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch_dtype).to(de
 # set the generator for reproducibility
 generator = torch.Generator(device)
 
+@document()
+def make_waveform(
+    audio: str | tuple[int, np.ndarray],
+    *,
+    bg_color: str = "#f3f4f6",
+    bg_image: str | None = None,
+    fg_alpha: float = 0.75,
+    bars_color: str | tuple[str, str] = ("#fbbf24", "#ea580c"),
+    bar_count: int = 50,
+    bar_width: float = 0.6,
+    animate: bool = False,
+) -> str:
+    """
+    Generates a waveform video from an audio file. Useful for creating an easy to share audio visualization. The output should be passed into a `gr.Video` component.
+    Parameters:
+        audio: Audio file path or tuple of (sample_rate, audio_data)
+        bg_color: Background color of waveform (ignored if bg_image is provided)
+        bg_image: Background image of waveform
+        fg_alpha: Opacity of foreground waveform
+        bars_color: Color of waveform bars. Can be a single color or a tuple of (start_color, end_color) of gradient
+        bar_count: Number of bars in waveform
+        bar_width: Width of bars in waveform. 1 represents full width, 0.5 represents half width, etc.
+        animate: If true, the audio waveform overlay will be animated, if false, it will be static.
+    Returns:
+        A filepath to the output video in mp4 format.
+    """
+    import matplotlib.pyplot as plt
+    from matplotlib.animation import FuncAnimation
+
+    if isinstance(audio, str):
+        audio_file = audio
+        audio = processing_utils.audio_from_file(audio)
+    else:
+        tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+        processing_utils.audio_to_file(audio[0], audio[1], tmp_wav.name, format="wav")
+        audio_file = tmp_wav.name
+
+    if not os.path.isfile(audio_file):
+        raise ValueError("Audio file not found.")
+
+    ffmpeg = shutil.which("ffmpeg")
+    if not ffmpeg:
+        raise RuntimeError("ffmpeg not found.")
+
+    duration = round(len(audio[1]) / audio[0], 4)
+
+    # Helper methods to create waveform
+    def hex_to_rgb(hex_str):
+        return [int(hex_str[i : i + 2], 16) for i in range(1, 6, 2)]
+
+    def get_color_gradient(c1, c2, n):
+        if n < 1:
+            raise ValueError("Must have at least one stop in gradient")
+        c1_rgb = np.array(hex_to_rgb(c1)) / 255
+        c2_rgb = np.array(hex_to_rgb(c2)) / 255
+        mix_pcts = [x / (n - 1) for x in range(n)]
+        rgb_colors = [((1 - mix) * c1_rgb + (mix * c2_rgb)) for mix in mix_pcts]
+        return [
+            "#" + "".join(f"{int(round(val * 255)):02x}" for val in item)
+            for item in rgb_colors
+        ]
+
+    # Reshape audio to have a fixed number of bars
+    samples = audio[1]
+    if len(samples.shape) > 1:
+        samples = np.mean(samples, 1)
+    bins_to_pad = bar_count - (len(samples) % bar_count)
+    samples = np.pad(samples, [(0, bins_to_pad)])
+    samples = np.reshape(samples, (bar_count, -1))
+    samples = np.abs(samples)
+    samples = np.max(samples, 1)
+
+    with utils.MatplotlibBackendMananger():
+        plt.clf()
+        # Plot waveform
+        color = (
+            bars_color
+            if isinstance(bars_color, str)
+            else get_color_gradient(bars_color[0], bars_color[1], bar_count)
+        )
+
+        if animate:
+            fig = plt.figure(figsize=(5, 1), dpi=200, frameon=False)
+            fig.subplots_adjust(left=0, bottom=0, right=1, top=1)
+        plt.axis("off")
+        plt.margins(x=0)
+
+        bar_alpha = fg_alpha if animate else 1.0
+        barcollection = plt.bar(
+            np.arange(0, bar_count),
+            samples * 2,
+            bottom=(-1 * samples),
+            width=bar_width,
+            color=color,
+            alpha=bar_alpha,
+        )
+
+        tmp_img = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
+
+        savefig_kwargs: dict[str, Any] = {"bbox_inches": "tight"}
+        if bg_image is not None:
+            savefig_kwargs["transparent"] = True
+            if animate:
+                savefig_kwargs["facecolor"] = "none"
+        else:
+            savefig_kwargs["facecolor"] = bg_color
+        plt.savefig(tmp_img.name, **savefig_kwargs)
+
+        if not animate:
+            waveform_img = PIL.Image.open(tmp_img.name)
+            waveform_img = waveform_img.resize((1000, 400))
+
+            # Composite waveform with background image
+            if bg_image is not None:
+                waveform_array = np.array(waveform_img)
+                waveform_array[:, :, 3] = waveform_array[:, :, 3] * fg_alpha
+                waveform_img = PIL.Image.fromarray(waveform_array)
+
+                bg_img = PIL.Image.open(bg_image)
+                waveform_width, waveform_height = waveform_img.size
+                bg_width, bg_height = bg_img.size
+                if waveform_width != bg_width:
+                    bg_img = bg_img.resize(
+                        (
+                            waveform_width,
+                            2 * int(bg_height * waveform_width / bg_width / 2),
+                        )
+                    )
+                    bg_width, bg_height = bg_img.size
+                composite_height = max(bg_height, waveform_height)
+                composite = PIL.Image.new(
+                    "RGBA", (waveform_width, composite_height), "#FFFFFF"
+                )
+                composite.paste(bg_img, (0, composite_height - bg_height))
+                composite.paste(
+                    waveform_img, (0, composite_height - waveform_height), waveform_img
+                )
+                composite.save(tmp_img.name)
+                img_width, img_height = composite.size
+            else:
+                img_width, img_height = waveform_img.size
+                waveform_img.save(tmp_img.name)
+        else:
+
+            def _animate(_):
+                for idx, b in enumerate(barcollection):
+                    rand_height = np.random.uniform(0.8, 1.2)
+                    b.set_height(samples[idx] * rand_height * 2)
+                    b.set_y((-rand_height * samples)[idx])
+
+            frames = int(duration * 10)
+            anim = FuncAnimation(
+                fig,  # type: ignore
+                _animate,  # type: ignore
+                repeat=False,
+                blit=False,
+                frames=frames,
+                interval=100,
+            )
+            anim.save(
+                tmp_img.name,
+                writer="pillow",
+                fps=10,
+                codec="png",
+                savefig_kwargs=savefig_kwargs,
+            )
+
+    # Convert waveform to video with ffmpeg
+    output_mp4 = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
+
+    if animate and bg_image is not None:
+        ffmpeg_cmd = [
+            ffmpeg,
+            "-loop",
+            "1",
+            "-i",
+            bg_image,
+            "-i",
+            tmp_img.name,
+            "-i",
+            audio_file,
+            "-filter_complex",
+            "[0:v]scale=w=trunc(iw/2)*2:h=trunc(ih/2)*2[bg];[1:v]format=rgba,colorchannelmixer=aa=1.0[ov];[bg][ov]overlay=(main_w-overlay_w*0.9)/2:main_h-overlay_h*0.9/2[output]",
+            "-t",
+            str(duration),
+            "-map",
+            "[output]",
+            "-map",
+            "2:a",
+            "-c:v",
+            "libx264",
+            "-c:a",
+            "aac",
+            "-shortest",
+            "-y",
+            output_mp4.name,
+        ]
+    elif animate and bg_image is None:
+        ffmpeg_cmd = [
+            ffmpeg,
+            "-i",
+            tmp_img.name,
+            "-i",
+            audio_file,
+            "-filter_complex",
+            "[0:v][1:a]concat=n=1:v=1:a=1[v];[v]scale=1000:400,format=yuv420p[v_scaled]",
+            "-map",
+            "[v_scaled]",
+            "-map",
+            "1:a",
+            "-c:v",
+            "libx264",
+            "-c:a",
+            "aac",
+            "-shortest",
+            "-y",
+            output_mp4.name,
+        ]
+    else:
+        ffmpeg_cmd = [
+            ffmpeg,
+            "-loop",
+            "1",
+            "-i",
+            tmp_img.name,
+            "-i",
+            audio_file,
+            "-vf",
+            f"color=c=#FFFFFF77:s={img_width}x{img_height}[bar];[0][bar]overlay=-w+(w/{duration})*t:H-h:shortest=1",  # type: ignore
+            "-t",
+            str(duration),
+            "-y",
+            output_mp4.name,
+        ]
+
+    subprocess.check_call(ffmpeg_cmd)
+    return output_mp4.name
+
 
 def text2audio(text, negative_prompt, duration, guidance_scale, random_seed, n_candidates):
     if text is None:
@@ -34,7 +272,8 @@ def text2audio(text, negative_prompt, duration, guidance_scale, random_seed, n_candidates):
         generator=generator.manual_seed(int(random_seed)),
     )["audios"]
 
-    return gr.make_waveform((16000, waveforms[0]), bg_image="bg.png")
+    return make_waveform((16000, waveforms[0]), bg_image="bg.png")
+    # return gr.Audio(sources=["microphone"], type="filepath")
 
 
 iface = gr.Blocks()
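
Note: the added function appears to be a copy of the `make_waveform` helper that Gradio later removed from its public API, inlined so the Space no longer calls `gr.make_waveform`. The copy references names this hunk does not import; a sketch of the import block app.py would need at its top (module locations are version-dependent assumptions, not part of this commit):

```python
# Imports assumed by the inlined make_waveform helper (not shown in this
# diff). processing_utils and utils are Gradio-internal modules, and the
# `document` decorator moved from gradio.documentation to
# gradio_client.documentation in Gradio 4; adjust to the installed version.
import os
import shutil
import subprocess
import tempfile
from typing import Any

import numpy as np
import PIL.Image
from gradio import processing_utils, utils
from gradio_client.documentation import document
```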
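The helper can also be exercised on its own. A minimal sketch, assuming the imports above and ffmpeg on the PATH; the tone is synthetic test data, not from this commit:

```python
import numpy as np

# Synthetic test input: one second of a 440 Hz tone at 16 kHz, passed as
# the (sample_rate, samples) tuple form that make_waveform accepts.
sr = 16000
t = np.linspace(0, 1, sr, endpoint=False)
tone = 0.5 * np.sin(2 * np.pi * 440 * t).astype(np.float32)

# Returns a path to a temporary .mp4 visualization (requires ffmpeg).
mp4_path = make_waveform((sr, tone), bg_color="#f3f4f6", animate=False)
print(mp4_path)
```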
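The docstring states the output should be passed into a `gr.Video` component. The Blocks layout is outside this hunk, so the wiring below is a hypothetical sketch (component names and the fixed argument values are illustrative, not taken from this commit) of how text2audio's mp4 path would be displayed:

```python
import gradio as gr

with gr.Blocks() as iface:
    prompt = gr.Textbox(label="Prompt")
    btn = gr.Button("Generate")
    video_out = gr.Video(label="Waveform video")  # receives the mp4 path

    # text2audio(text, negative_prompt, duration, guidance_scale,
    # random_seed, n_candidates) returns the filepath from make_waveform.
    btn.click(
        fn=lambda text: text2audio(text, "", 10.0, 3.5, 45, 3),
        inputs=prompt,
        outputs=video_out,
    )

iface.launch()
```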