Spaces:
Running
Running
# Mostly from: https://github.com/adefossez/seewav | |
# Original author: adefossez | |
import math | |
import tempfile | |
from pathlib import Path | |
import subprocess | |
import cairo | |
import numpy as np | |
import gradio as gr | |
from pydub import AudioSegment | |
def read_audio(audio, seek=None, duration=None):
    """
    Read the `audio` file, starting at `seek` (or 0) seconds for `duration` (or all) seconds.

    Returns a `(wav, samplerate)` pair: `wav` is `float32[channels, samples]` and
    `samplerate` is the frame rate reported by pydub. Sample values are converted
    to float32 but not rescaled.
    """
    audio_segment = AudioSegment.from_file(audio)
    channels = audio_segment.channels
    samplerate = audio_segment.frame_rate
    # pydub slices are expressed in milliseconds.
    if seek is not None:
        audio_segment = audio_segment[int(seek * 1000):]
    if duration is not None:
        audio_segment = audio_segment[:int(duration * 1000)]
    samples = audio_segment.get_array_of_samples()
    wav = np.array(samples, dtype=np.float32)
    # The flat buffer is interleaved frame by frame (L0, R0, L1, R1, ...), i.e.
    # laid out as [samples, channels]. Reshaping straight to (channels, -1)
    # would cut the stream in half instead of separating the channels, so
    # reshape per frame and transpose.
    return wav.reshape(-1, channels).T, samplerate
def sigmoid(x):
    """Logistic function `1 / (1 + exp(-x))`, applied element-wise to scalars or arrays."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def envelope(wav, window, stride):
    """
    Compute the envelope of the waveform `wav` (float[samples]).

    The signal is zero-padded by `window // 2` on both sides, then the positive
    part of each `window`-sample frame (frames start every `stride` samples) is
    averaged. The pooled values are finally squashed with a sigmoid.
    """
    padded = np.pad(wav, window // 2)
    starts = range(0, len(padded) - window, stride)
    pooled = np.array(
        [np.maximum(padded[start : start + window], 0).mean() for start in starts]
    )
    # Some form of audio compressor based on the sigmoid.
    return 1.9 * (sigmoid(2.5 * pooled) - 0.5)
def draw_env(envs, out, fg_colors, bg_color, size):
    """
    Internal function: render one frame with cairo and write it to `out` as a png.

    `envs` is a list of envelopes (one per channel), each a float[bars] giving the
    height of every bar. The waves are stacked vertically, wave `i` being drawn with
    `fg_colors[i]` on top of a `bg_color` background. `size` is `(width, height)`
    in pixels.
    """
    surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, *size)
    ctx = cairo.Context(surface)
    # Scale so that all drawing below happens in the unit square.
    ctx.scale(*size)

    # Paint the background.
    ctx.set_source_rgb(*bg_color)
    ctx.rectangle(0, 0, 1, 1)
    ctx.fill()

    n_waves = len(envs)  # number of waves to draw (stacked vertically)
    n_steps = len(envs[0])  # number of time steps, i.e. bars per wave

    pad_ratio = 0.1  # spacing ratio between 2 bars
    bar_width = 1.0 / (n_steps * (1 + 2 * pad_ratio))
    pad = pad_ratio * bar_width
    pitch = 2 * pad + bar_width  # horizontal distance between two bars
    ctx.set_line_width(bar_width)

    # Vertical center line of each wave.
    midrules = [(1 + 2 * wave) / (2 * n_waves) for wave in range(n_waves)]

    for step in range(n_steps):
        x = pad + step * pitch
        for wave in range(n_waves):
            color = fg_colors[wave]
            midrule = midrules[wave]
            # (semi-)height of the bar, shrunk as we stack `n_waves` waves vertically
            half = 0.5 * envs[wave][step]
            half /= n_waves

            # Upper half of the bar, fully opaque.
            ctx.set_source_rgb(*color)
            ctx.move_to(x, midrule - half)
            ctx.line_to(x, midrule)
            ctx.stroke()

            # Lower half: slightly shorter and translucent.
            ctx.set_source_rgba(*color, 0.8)
            ctx.move_to(x, midrule)
            ctx.line_to(x, midrule + 0.9 * half)
            ctx.stroke()

    surface.write_to_png(out)
def interpole(x1, y1, x2, y2, x):
    """Linearly interpolate (or extrapolate) at `x` along the line through (x1, y1) and (x2, y2)."""
    rise = y2 - y1
    run = x2 - x1
    return y1 + rise * (x - x1) / run
def visualize(
    progress,
    audio,
    tmp,
    out,
    seek=None,
    duration=None,
    rate=60,
    bars=50,
    speed=4,
    time=0.4,
    oversample=3,
    fg_color=(0.2, 0.2, 0.2),
    fg_color2=(0.5, 0.3, 0.6),
    bg_color=(1, 1, 1),
    size=(400, 400),
    stereo=False,
):
    """
    Generate the visualisation for the `audio` file, using a `tmp` folder and saving the final
    video in `out`.

    `progress` is a callable wrapping an iterable to report progress (tqdm-like).
    `audio` is the path of the audio file to visualise.
    `tmp` is a `Path` to a folder where the intermediate png frames are written.
    `out` is the `Path` of the video to produce; it is also the return value.
    `seek` and `duration` give the extract location if any.
    `rate` is the framerate of the output video.
    `bars` is the number of bars in the animation.
    `speed` is the base speed of transition. Depending on volume, actual speed will vary
        between 0.5 and 2 times it.
    `time` amount of audio shown at once on a frame.
    `oversample` higher values will lead to more frequent changes.
    `fg_color` is the rgb color to use for the foreground.
    `fg_color2` is the rgb color to use for the second wav if stereo is set.
    `bg_color` is the rgb color to use for the background.
    `size` is the `(width, height)` in pixels to generate.
    `stereo` is whether to create 2 waves.

    Raises `gr.Error` when the audio cannot be read, is too short, or is not stereo
    while `stereo` is set, and `subprocess.CalledProcessError` when ffmpeg fails.
    """
    # UI widgets may hand over floats; numpy, cairo and ffmpeg all need integers here.
    bars = int(bars)
    size = (int(size[0]), int(size[1]))

    try:
        wav, sr = read_audio(audio, seek=seek, duration=duration)
    except (IOError, ValueError) as err:
        raise gr.Error(str(err)) from err

    # wavs is a list of wav over channels
    if stereo:
        # Explicit check rather than `assert`, which is stripped under `python -O`.
        if wav.shape[0] != 2:
            raise gr.Error("stereo requires stereo audio file")
        wavs = [wav[0], wav[1]]
    else:
        wavs = [wav.mean(0)]

    for i, channel in enumerate(wavs):
        std = channel.std()
        # A silent channel has zero spread: dividing would fill the envelope with NaN.
        if std > 0:
            wavs[i] = channel / std

    # Clamp to at least one sample: with many bars or a low sample rate the
    # integer truncation would otherwise yield a zero stride and crash `range`.
    window = max(1, int(sr * time / bars))
    stride = max(1, int(window / oversample))

    # envs is a list of env over channels, padded so that the two bar groups
    # blended for any frame always exist.
    envs = [
        np.pad(envelope(channel, window, stride), (bars // 2, 2 * bars))
        for channel in wavs
    ]

    clip_seconds = len(wavs[0]) / sr
    frames = int(rate * clip_seconds)
    if frames == 0:
        raise gr.Error("The audio clip is too short to visualize.")
    smooth = np.hanning(bars)

    gr.Info("Generating the frames...")
    for idx in progress(range(frames)):
        # Position in units of "groups of `bars` envelope entries".
        pos = (((idx / rate)) * sr) / stride / bars
        off = int(pos)
        loc = pos - off
        denvs = []
        for env in envs:
            env1 = env[off * bars : (off + 1) * bars]
            env2 = env[(off + 1) * bars : (off + 2) * bars]
            # we want loud parts to be updated faster
            maxvol = math.log10(1e-4 + env2.max()) * 10
            speedup = np.clip(interpole(-6, 0.5, 0, 2, maxvol), 0.5, 2)
            w = sigmoid(speed * speedup * (loc - 0.5))
            denv = (1 - w) * env1 + w * env2
            denv *= smooth
            denvs.append(denv)
        draw_env(denvs, tmp / f"{idx:06d}.png", (fg_color, fg_color2), bg_color, size)

    gr.Info("Encoding the animation video...")
    # Mux the same extract that was analysed: seek into the audio input and
    # cap the output at the clip length so sound and frames stay in sync.
    audio_cmd = []
    if seek is not None:
        audio_cmd += ["-ss", str(seek)]
    # ffmpeg runs with cwd=tmp, so a relative audio path must be made absolute.
    audio_cmd += ["-i", str(Path(audio).resolve())]
    audio_cmd += ["-t", str(clip_seconds)]
    subprocess.run(
        [
            "ffmpeg", "-y", "-loglevel", "panic", "-r",
            str(rate), "-f", "image2", "-s", f"{size[0]}x{size[1]}", "-i", "%06d.png",
        ]
        + audio_cmd
        + [
            "-c:a", "aac", "-vcodec", "libx264", "-crf", "10", "-pix_fmt", "yuv420p",
            out.resolve(),
        ],
        check=True,
        cwd=tmp,
    )
    return out
def parse_color(colorstr):
    """
    Given a comma separated rgb color, returns a 3-tuple of float `(r, g, b)`.

    Raises `gr.Error` when `colorstr` does not hold exactly three comma separated
    floats (an alpha component is not accepted).
    """
    try:
        r, g, b = [float(i) for i in colorstr.split(",")]
    except ValueError as err:
        # Covers both a non-numeric component and a wrong number of components.
        raise gr.Error(
            "Format for color is 3 floats separated by commas 0.xx,0.xx,0.xx, rgb order"
        ) from err
    return r, g, b
def hex_to_rgb(hex_color):
    """
    Convert a CSS-style color string into an `(r, g, b)` tuple of floats.

    Each component is scaled by 1/255, so 8-bit input maps to [0, 1].
    Accepted forms:
    - 6-digit hex with or without a leading `#` (`"#00237E"`); anything after
      the sixth digit (e.g. an alpha byte) is ignored.
    - 3- or 4-digit shorthand hex (`"#0af"`), each digit being doubled.
    - `rgb(r, g, b)` / `rgba(r, g, b, a)` with 0-255 components; alpha is ignored.

    Raises `ValueError` when the string cannot be parsed.
    """
    text = hex_color.strip()

    # Functional notation, which some color pickers emit instead of hex.
    if text.lower().startswith(("rgb(", "rgba(")) and text.endswith(")"):
        fields = text[text.index("(") + 1 : -1].split(",")
        if len(fields) < 3:
            raise ValueError(f"Invalid color: {hex_color!r}")
        r, g, b = (float(field) / 255.0 for field in fields[:3])
        return (r, g, b)

    digits = text.lstrip('#')
    if len(digits) in (3, 4):
        # Expand shorthand: "0af" -> "00aaff".
        digits = "".join(ch * 2 for ch in digits)
    r = int(digits[0:2], 16) / 255.0
    g = int(digits[2:4], 16) / 255.0
    b = int(digits[4:6], 16) / 255.0
    return (r, g, b)
def do_viz(
    inp_aud,
    inp_bgcolor,
    inp_color1,
    inp_nbars,
    inp_vidw,
    inp_vidh,
    progress=gr.Progress(),
):
    """
    Gradio callback: render the waveform video for the submitted audio clip.

    `inp_aud` is the path of the audio file (empty when nothing was submitted).
    `inp_bgcolor` and `inp_color1` are the background and waveform colors as strings.
    `inp_nbars` is the number of bars, `inp_vidw` x `inp_vidh` the video size in pixels.
    `progress` is the Gradio progress tracker; its `tqdm` wrapper drives the progress bar.

    Returns the `Path` of the generated mp4. Raises `gr.Error` on missing audio or
    an unparsable color.
    """
    if not inp_aud:
        raise gr.Error("Please provide an audio clip first.")

    try:
        fg_color = hex_to_rgb(inp_color1)
        bg_color = hex_to_rgb(inp_bgcolor)
    except ValueError as err:
        raise gr.Error(f"Invalid color: {err}") from err

    # Reserve the output path, then close the handle before ffmpeg writes to
    # it: an open handle can block other writers on some platforms.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as out_file:
        out_path = Path(out_file.name)

    try:
        # The png frames live only for the duration of the rendering.
        with tempfile.TemporaryDirectory() as tmp:
            return visualize(
                progress.tqdm,
                inp_aud,
                Path(tmp),
                out_path,
                bars=int(inp_nbars),
                fg_color=fg_color,
                bg_color=bg_color,
                size=(int(inp_vidw), int(inp_vidh)),
            )
    except Exception:
        # Do not leave a stray (empty or partial) video behind on failure.
        out_path.unlink(missing_ok=True)
        raise
import gradio as gr | |
# Markdown shown at the top of the page.
ABOUT = """
# seewav GUI
> Have an audio clip but need a video (e.g. for X/Twitter)?
**Convert audio into a video!**
An online graphical user interface for [seewav](https://github.com/adefossez/seewav).
"""

# NOTE(review): the nesting of the layout containers below is reconstructed
# from statement order -- confirm against the deployed layout.
with gr.Blocks() as demo:
    gr.Markdown(ABOUT)
    with gr.Row():
        # Left column: inputs.
        with gr.Column():
            inp_aud = gr.Audio(type="filepath")
            with gr.Group():
                inp_color1 = gr.ColorPicker(
                    value="#00237E",
                    label="Color",
                    info="Color of the top waveform",
                    interactive=True,
                )
                inp_bgcolor = gr.ColorPicker(
                    value="#000000",
                    label="Background Color",
                    info="Color of the background",
                    interactive=True,
                )
            with gr.Accordion("Advanced Configuration", open=False):
                inp_nbars = gr.Slider(
                    minimum=5,
                    maximum=1500,
                    value=50,
                    label="Num. Bars",
                    interactive=True,
                )
                inp_vidw = gr.Slider(
                    minimum=100,
                    maximum=3000,
                    value=400,
                    label="Video Width",
                    interactive=True,
                )
                inp_vidh = gr.Slider(
                    minimum=100,
                    maximum=3000,
                    value=400,
                    label="Video Height",
                    interactive=True,
                )
            inp_go = gr.Button("Visualize", variant="primary")
        # Right column: the rendered video.
        with gr.Column():
            out_vid = gr.Video(interactive=False)
    # The input order must match the positional parameters of `do_viz`.
    inp_go.click(
        do_viz,
        inputs=[
            inp_aud,
            inp_bgcolor,
            inp_color1,
            inp_nbars,
            inp_vidw,
            inp_vidh,
        ],
        outputs=[out_vid],
    )

queued_demo = demo.queue(api_open=False, default_concurrency_limit=20)
queued_demo.launch(show_api=False)