seewav-gui

Sleeping

File size: 9,743 Bytes

# Thank you to the authors of seewav for dedicating it to the public domain.
# This program is also dedicated to the public domain.
# You may use it, at your choice, under the Unlicense, CC0, or WTFPL license.
# Enjoy!

# Mostly from: https://github.com/adefossez/seewav
# Original author: adefossez


import math
import tempfile
from pathlib import Path
import subprocess
import cairo
import numpy as np
import gradio as gr
from pydub import AudioSegment


def read_audio(audio, seek=None, duration=None):
    """
    Read the `audio` file, starting at `seek` (or 0) seconds for `duration` (or all) seconds.

    Returns a `(wav, samplerate)` tuple, where `wav` is `float32[channels, samples]`
    holding the sample values as decoded (no amplitude rescaling is applied here)
    and `samplerate` is the frame rate reported by the decoder.
    """
    audio_segment = AudioSegment.from_file(audio)
    channels = audio_segment.channels
    samplerate = audio_segment.frame_rate

    # pydub segments are sliced in milliseconds.
    if seek is not None:
        audio_segment = audio_segment[int(seek * 1000):]

    if duration is not None:
        audio_segment = audio_segment[:int(duration * 1000)]

    samples = audio_segment.get_array_of_samples()
    wav = np.array(samples, dtype=np.float32)
    # The samples of a multi-channel segment are interleaved frame by frame
    # (L0, R0, L1, R1, ...). View them as [frames, channels] first, then
    # transpose; reshaping straight to [channels, -1] would instead cut the
    # interleaved stream in consecutive chunks and mix the channels up.
    return wav.reshape(-1, channels).T, samplerate


def sigmoid(x):
    """Logistic function `1 / (1 + exp(-x))`, evaluated with numpy (element-wise on arrays)."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator


def envelope(wav, window, stride):
    """
    Compute a compressed envelope of the waveform `wav` (float[samples]).

    The signal is zero-padded by `window // 2` samples on both sides, then for
    every `stride`-th offset the positive part of a `window`-sample frame is
    averaged (average pooling). The pooled values are finally squashed through
    a sigmoid so that loud passages saturate instead of growing unbounded.
    """
    padded = np.pad(wav, window // 2)
    starts = range(0, len(padded) - window, stride)
    pooled = np.array(
        [np.maximum(padded[start : start + window], 0).mean() for start in starts]
    )
    # Some form of audio compressor based on the sigmoid.
    return 1.9 * (sigmoid(2.5 * pooled) - 0.5)


def draw_env(envs, out, fg_colors, bg_color, size):
    """
    Internal function: render one video frame with cairo and write it to `out` as png.

    `envs` is a list of envelopes, one per wave (two waves for stereo); each envelope
    is a float[bars] giving the height of every bar. Waves are stacked vertically and
    wave `i` is drawn with `fg_colors[i]` over a `bg_color` background.
    `size` is the `(width, height)` of the image in pixels.
    """
    surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, *size)
    ctx = cairo.Context(surface)
    # Scale so that all drawing below happens in the unit square.
    ctx.scale(*size)

    # Fill the whole frame with the background color.
    ctx.set_source_rgb(*bg_color)
    ctx.rectangle(0, 0, 1, 1)
    ctx.fill()

    n_waves = len(envs)  # Number of waves to draw (waves are stacked vertically)
    n_steps = len(envs[0])  # Number of time steps, i.e. bars per wave
    pad_ratio = 0.1  # spacing ratio between 2 bars
    bar_width = 1.0 / (n_steps * (1 + 2 * pad_ratio))
    gap = pad_ratio * bar_width
    pitch = 2 * gap + bar_width  # horizontal distance between consecutive bars

    ctx.set_line_width(bar_width)
    for step in range(n_steps):
        x = gap + step * pitch
        for wave in range(n_waves):
            color = fg_colors[wave]
            # (semi-)height of the bar, shrunk as we stack `n_waves` waves vertically
            half = 0.5 * envs[wave][step] / n_waves
            midrule = (1 + 2 * wave) / (2 * n_waves)  # midrule of this wave

            # Upper half of the bar, fully opaque.
            ctx.set_source_rgb(*color)
            ctx.move_to(x, midrule - half)
            ctx.line_to(x, midrule)
            ctx.stroke()

            # Lower half, slightly shorter and translucent.
            ctx.set_source_rgba(*color, 0.8)
            ctx.move_to(x, midrule)
            ctx.line_to(x, midrule + 0.9 * half)
            ctx.stroke()

    surface.write_to_png(out)


def interpole(x1, y1, x2, y2, x):
    """Evaluate at `x` the straight line passing through `(x1, y1)` and `(x2, y2)`."""
    rise = y2 - y1
    run = x2 - x1
    return y1 + rise * (x - x1) / run


def visualize(
    progress,
    audio,
    tmp,
    out,
    seek=None,
    duration=None,
    rate=60,
    bars=50,
    speed=4,
    time=0.4,
    oversample=3,
    fg_color=(0.2, 0.2, 0.2),
    fg_color2=(0.5, 0.3, 0.6),
    bg_color=(1, 1, 1),
    size=(400, 400),
    stereo=False,
):
    """
    Generate the visualisation for the `audio` file, using a `tmp` folder and saving the final
    video in `out`.

    `progress` is a callable wrapping the frame iterable (tqdm-like) to report progress.
    `audio` is the path of the audio file.
    `tmp` is a `Path` to a folder where the intermediate png frames are written.
    `out` is the `Path` of the video file to create.
    `seek` and `duration` give the extract location (in seconds) if any.
    `rate` is the framerate of the output video.

    `bars` is the number of bars in the animation.
    `speed` is the base speed of transition. Depending on volume, actual speed will vary
        between 0.5 and 2 times it.
    `time` amount of audio shown at once on a frame.
    `oversample` higher values will lead to more frequent changes.
    `fg_color` is the rgb color to use for the foreground.
    `fg_color2` is the rgb color to use for the second wav if stereo is set.
    `bg_color` is the rgb color to use for the background.
    `size` is the `(width, height)` in pixels to generate.
    `stereo` is whether to create 2 waves.

    Returns `out`. Raises `gr.Error` when the audio cannot be read, is too short,
    or is not stereo while `stereo` is requested; `subprocess.CalledProcessError`
    when ffmpeg fails.
    """
    # UI widgets may hand over floats; slicing, padding and the ffmpeg `-s`
    # option all need genuine integers.
    bars = int(bars)
    size = (int(size[0]), int(size[1]))

    try:
        wav, sr = read_audio(audio, seek=seek, duration=duration)
    except (IOError, ValueError) as err:
        raise gr.Error(str(err)) from err

    # wavs is a list of wav over channels
    if stereo:
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if wav.shape[0] != 2:
            raise gr.Error("stereo requires stereo audio file")
        wavs = [wav[0], wav[1]]
    else:
        wavs = [wav.mean(0)]

    shown_duration = len(wavs[0]) / sr
    frames = int(rate * shown_duration)
    if frames < 1:
        # Without any frame ffmpeg would only fail with an opaque error.
        raise gr.Error("The audio is too short to render a single video frame.")

    for i, channel in enumerate(wavs):
        std = channel.std()
        # Silent audio has a zero deviation; dividing by it would turn the
        # whole signal into NaNs, so such a channel is left untouched.
        if std > 0:
            wavs[i] = channel / std

    # Both values are clamped to 1: with many bars or a low sample rate the
    # integer truncation would otherwise yield 0 and `range()` rejects a zero step.
    window = max(1, int(sr * time / bars))
    stride = max(1, int(window / oversample))
    # envs is a list of env over channels
    envs = []
    for channel in wavs:
        env = envelope(channel, window, stride)
        env = np.pad(env, (bars // 2, 2 * bars))
        envs.append(env)

    smooth = np.hanning(bars)

    gr.Info("Generating the frames...")
    for idx in progress(range(frames)):
        # Position in the envelope, counted in groups of `bars` values.
        pos = (((idx / rate)) * sr) / stride / bars
        off = int(pos)
        loc = pos - off
        denvs = []
        for env in envs:
            env1 = env[off * bars : (off + 1) * bars]
            env2 = env[(off + 1) * bars : (off + 2) * bars]

            # we want loud parts to be updated faster
            maxvol = math.log10(1e-4 + env2.max()) * 10
            speedup = np.clip(interpole(-6, 0.5, 0, 2, maxvol), 0.5, 2)
            # Cross-fade weight between the current and the next group of bars.
            w = sigmoid(speed * speedup * (loc - 0.5))
            denv = (1 - w) * env1 + w * env2
            denv *= smooth
            denvs.append(denv)
        draw_env(denvs, tmp / f"{idx:06d}.png", (fg_color, fg_color2), bg_color, size)

    gr.Info("Encoding the animation video...")
    # The muxed audio must match the analysed extract: seek into the input and,
    # when an explicit duration was requested, cap the output length.
    audio_input = []
    if seek is not None:
        audio_input += ["-ss", str(seek)]
    # ffmpeg runs inside `tmp`, so a relative audio path must be made absolute.
    audio_input += ["-i", str(Path(audio).resolve())]
    trim_output = []
    if duration is not None:
        trim_output += ["-t", str(duration)]

    subprocess.run([
        "ffmpeg", "-y", "-loglevel", "panic", "-r",
        str(rate), "-f", "image2", "-s", f"{size[0]}x{size[1]}", "-i", "%06d.png",
        *audio_input,
        "-c:a", "aac", "-vcodec", "libx264", "-crf", "10", "-pix_fmt", "yuv420p",
        *trim_output,
        str(out.resolve()),
    ], check=True, cwd=tmp)
    return out



def parse_color(colorstr):
    """
    Parse a comma separated "r,g,b" string and return the 3-tuple of floats `(r, g, b)`.

    Raises `gr.Error` when the string does not hold exactly three values that
    can be converted to floats.
    """
    try:
        red, green, blue = map(float, colorstr.split(","))
    except ValueError:
        raise gr.Error(
            "Format for color is 3 floats separated by commas 0.xx,0.xx,0.xx, rgb order"
        )
    return red, green, blue


def hex_to_rgb(hex_color):
    """
    Convert a CSS-style color string into an `(r, g, b)` tuple of floats in [0, 1].

    Accepted forms:
      * "#rrggbb" or "rrggbb" (digits after the sixth, e.g. an alpha pair, are ignored),
      * the shorthand "#rgb" / "#rgba", where every digit is doubled,
      * "rgb(r, g, b)" / "rgba(r, g, b, a)" with channels in the 0-255 range
        (color pickers may emit this form instead of hex); alpha is ignored.

    Raises `ValueError` when the string cannot be parsed.
    """
    color = hex_color.strip()

    # Functional notation: no valid hex string can start with "rgb".
    if color.lower().startswith("rgb"):
        inner = color[color.index("(") + 1 : color.rindex(")")]
        red, green, blue = (float(part) for part in inner.split(",")[:3])
        # Clamp to the nominal range before normalising.
        return tuple(
            min(max(channel, 0.0), 255.0) / 255.0 for channel in (red, green, blue)
        )

    digits = color.lstrip("#")
    if len(digits) in (3, 4):
        # Expand the shorthand: "f0a" -> "ff00aa".
        digits = "".join(digit * 2 for digit in digits)
    r = int(digits[0:2], 16) / 255.0
    g = int(digits[2:4], 16) / 255.0
    b = int(digits[4:6], 16) / 255.0
    return (r, g, b)

def do_viz(
    inp_aud,
    inp_bgcolor,
    inp_color1,
    inp_nbars,
    inp_vidw,
    inp_vidh,
    progress=gr.Progress(),
):
    """
    Gradio event handler: render the waveform video for the submitted form.

    `inp_aud` is the path of the audio file.
    `inp_bgcolor` and `inp_color1` are the background and waveform colors, as
        strings understood by `hex_to_rgb`.
    `inp_nbars` is the number of bars in the animation.
    `inp_vidw` and `inp_vidh` are the video width and height in pixels.
    `progress` is the Gradio progress tracker; its `tqdm` wrapper is handed
        to `visualize`.

    Returns the path of the generated mp4 file. Raises `gr.Error` when no audio
    was provided.
    """
    if not inp_aud:
        raise gr.Error("Please provide an audio file first.")

    # Only reserve a unique output path here and close the handle right away,
    # so that ffmpeg is not writing to a file this process still holds open.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as out:
        out_path = Path(out.name)

    try:
        # The frames folder is removed as soon as the video has been encoded.
        with tempfile.TemporaryDirectory() as tmp:
            return visualize(
                progress.tqdm,
                inp_aud,
                Path(tmp),
                out_path,
                # Form values may arrive as floats; these must be integers.
                bars=int(inp_nbars),
                fg_color=hex_to_rgb(inp_color1),
                bg_color=hex_to_rgb(inp_bgcolor),
                size=(int(inp_vidw), int(inp_vidh)),
            )
    except Exception:
        # The output file is not auto-deleted; do not leak it on failure.
        out_path.unlink(missing_ok=True)
        raise


import gradio as gr

# Markdown shown at the top of the page.
ABOUT = """
# seewav GUI

> Have an audio clip but need a video (e.g. for X/Twitter)?

**Convert audio into a nice video!**

An online graphical user interface for [seewav](https://github.com/adefossez/seewav).

Enjoy!
"""
# Page layout: a single row with the input form in the first column and the
# resulting video in the second one.
with gr.Blocks() as demo:
    gr.Markdown(ABOUT)
    with gr.Row():
        with gr.Column():
            # The audio is handed to the handler as a file path.
            inp_aud = gr.Audio(type='filepath')
            with gr.Group():
                inp_color1 = gr.ColorPicker(
                    label="Color",
                    info="Color of the top waveform",
                    value="#00237E",
                    interactive=True,
                )
                inp_bgcolor = gr.ColorPicker(
                    label="Background Color",
                    info="Color of the background",
                    value="#000000",
                    interactive=True,
                )
            # Rarely changed settings, collapsed by default.
            with gr.Accordion("Advanced Configuration", open=False):
                inp_nbars = gr.Slider(
                    label="Num. Bars",
                    value=50,
                    interactive=True,
                    minimum=5,
                    maximum=1500,
                )
                inp_vidw = gr.Slider(
                    label="Video Width",
                    value=400,
                    interactive=True,
                    minimum=100,
                    maximum=3000,
                )
                inp_vidh = gr.Slider(
                    label="Video Height",
                    value=400,
                    interactive=True,
                    minimum=100,
                    maximum=3000,
                )
            inp_go = gr.Button("Visualize", variant="primary")
        with gr.Column():
            out_vid = gr.Video(interactive=False)
        # The inputs are listed in the same order as the positional parameters
        # of `do_viz`; its return value is shown in the video component.
        inp_go.click(
            do_viz,
            inputs=[
                inp_aud,
                inp_bgcolor,
                inp_color1,
                inp_nbars,
                inp_vidw,
                inp_vidh,
            ],
            outputs=[out_vid],
        )
# Enable the request queue and start the app. This runs at import time, as
# there is no `__main__` guard around it.
demo.queue(api_open=True, default_concurrency_limit=20).launch(show_api=True)