|
import gradio as gr |
|
import numpy as np |
|
import torch |
|
import yaml |
|
import json |
|
import pyloudnorm as pyln |
|
from hydra.utils import instantiate |
|
from random import normalvariate |
|
from soxr import resample |
|
from functools import partial |
|
|
|
from modules.utils import chain_functions, vec2statedict, get_chunks |
|
from modules.fx import clip_delay_eq_Q |
|
|
|
|
|
title_md = "# Vocal Effects Generator" |
|
description_md = """ |
|
This is a demo of the paper [DiffVox: A Differentiable Model for Capturing and Analysing Professional Effects Distributions](https://arxiv.org/abs/2504.14735), accepted at DAFx 2025. |
|
In this demo, you can upload a raw vocal audio file (in mono) and apply random effects to make it sound better! |
|
|
|
The effects consist of a series of EQ, compressor, delay, and reverb. |
|
The generator is a PCA model derived from 365 vocal effects presets fitted with the same effects chain. |
|
This interface allows you to control the first 10 principal components (PCs) of the generator, randomise them, and render the audio. |
|
For the rest of the PCs, you can choose to randomise them or set them to zero. |
|
|
|
To give you some idea, we empirically found that the first PC controls the amount of reverb and the second PC controls the amount of brightness. |
|
Note that adding these PCs together does not necessarily mean that their effects are additive in the final audio. |
|
We found that the effects of the least important PCs are sometimes more perceptible. |
|
Try to play around with the sliders and buttons and see what you can come up with! |
|
|
|
Currently only a portion of PCs are tweakable, but in the future we will add more controls and visualisation tools. |
|
For example: |
|
- Exposing all the PCs |
|
- Directly controlling the parameters of the effects |
|
- Visualising the PCA space |
|
- Visualising the frequency responses/dynamic curves of the effects |
|
- Exporting the effects settings as JSON files |
|
""" |
|
|
|
# Bounds of every PC slider; randomised slider values are clamped to this range.
SLIDER_MIN, SLIDER_MAX = -3, 3

# Number of leading principal components exposed as sliders in the UI.
NUMBER_OF_PCS = 10

# Scale applied to the standard-normal samples drawn for the PCs that have no
# slider, when the user chooses to randomise them.
TEMPERATURE = 0.7

# Preset files: effects-chain config, Gaussian (mean/cov) parameters used to
# build the PCA basis, and the parameter key/shape metadata.
CONFIG_PATH = "presets/rt_config.yaml"
PCA_PARAM_FILE = "presets/internal/gaussian.npz"
INFO_PATH = "presets/internal/info.json"
|
|
|
|
|
# Build the effects chain from the "model" section of the YAML config.
# The encoding is spelled out so that parsing does not depend on the
# platform's locale default (which is not UTF-8 everywhere).
with open(CONFIG_PATH, encoding="utf-8") as fp:
    fx_config = yaml.safe_load(fp)["model"]

fx = instantiate(fx_config)
# The chain is only used for rendering in this script, never for training.
fx.eval()
|
|
|
# Gaussian statistics (mean vector and covariance matrix) stored in the preset
# archive; the PCA basis below is derived from them.
pca_params = np.load(PCA_PARAM_FILE)
mean = torch.from_numpy(pca_params["mean"]).float()
cov = pca_params["cov"]

# np.linalg.eigh returns eigenvalues in ascending order, so reverse both the
# eigenvalues and the matching eigenvector columns to put the largest-variance
# directions first, then keep the leading 75 components.
eigvals, eigvecs = np.linalg.eigh(cov)
eigvals = eigvals[::-1][:75]
eigvecs = eigvecs[:, ::-1][:, :75]

# Each column of U is an eigenvector scaled by the square root of its
# eigenvalue, so a PC weight of 1 moves one standard deviation along it.
U = torch.from_numpy(eigvecs * np.sqrt(eigvals)).float()
|
|
|
|
|
# Parameter metadata (state-dict keys and their original tensor shapes).
# JSON is UTF-8 by specification, so do not rely on the locale default encoding.
with open(INFO_PATH, encoding="utf-8") as f:
    info = json.load(f)

param_keys = info["params_keys"]
# An empty shape list is replaced by [1] so that every parameter has an
# explicit, non-empty shape.
original_shapes = [
    shape if len(shape) else [1] for shape in info["params_original_shapes"]
]

# get_chunks' last return value is not needed here; the preceding ones are
# bound, in order, to the keyword arguments named below.
# NOTE(review): this assumes get_chunks returns exactly
# (selected_chunks, position, U_matrix_shape, <unused>) -- zip() would silently
# drop trailing names if it returned fewer values; confirm against modules.utils.
*vec2dict_args, _ = get_chunks(param_keys, original_shapes)
vec2dict_args = [param_keys, original_shapes] + vec2dict_args

# Pre-bind the metadata so callers only pass the flat parameter vector.
vec2dict = partial(
    vec2statedict,
    **dict(
        zip(
            (
                "keys",
                "original_shapes",
                "selected_chunks",
                "position",
                "U_matrix_shape",
            ),
            vec2dict_args,
        )
    ),
)
|
|
|
|
|
# Sample rate (Hz) shared by the loudness meter, the resampling target and the
# audio returned to the UI.
# NOTE(review): presumably also the rate the effects chain was configured
# for -- confirm against the preset config.
SAMPLE_RATE = 44100

meter = pyln.Meter(SAMPLE_RATE)


@torch.no_grad()
def inference(audio, randomise_rest, *pcs):
    """Render ``audio`` through the effects chain with PCA-generated parameters.

    The values in ``pcs`` are used as the weights of the leading principal
    components; the remaining weights are either sampled from a scaled normal
    distribution or set to zero. The weight vector ``z`` is mapped to a flat
    parameter vector with ``U @ z + mean`` and loaded into the module-level
    ``fx`` chain, so every call overwrites the parameters left by the previous
    one.

    Args:
        audio: ``(sample_rate, samples)`` pair. ``samples`` is a NumPy array,
            either 1-D or 2-D with channels on the last axis. Multi-channel
            input is averaged down to mono before rendering.
        randomise_rest: If truthy, the components without a slider are drawn
            from ``N(0, 1) * TEMPERATURE``; otherwise they are zero.
        *pcs: Weights of the first ``len(pcs)`` principal components.

    Returns:
        ``(SAMPLE_RATE, pcm)`` where ``pcm`` is the rendered audio as an
        ``int16`` NumPy array.

    Raises:
        gr.Error: If ``audio`` is ``None`` (nothing was uploaded).
    """
    if audio is None:
        # Gradio passes None when the audio component is empty; surface a
        # readable message instead of an unpacking TypeError.
        raise gr.Error("Please upload an audio file before pressing Run.")

    sr, y = audio
    if sr != SAMPLE_RATE:
        y = resample(y, sr, SAMPLE_RATE)
    if y.dtype.kind != "f":
        # Integer PCM -> float. The exact scale is not critical because the
        # signal is loudness-normalised right below.
        y = y / 32768.0

    if y.ndim == 1:
        y = y[:, None]
    loudness = meter.integrated_loudness(y)
    if np.isfinite(loudness):
        y = pyln.normalize.loudness(y, loudness, -18.0)
    # else: a non-finite loudness (e.g. digital silence) would turn the
    # normalisation gain into inf/NaN, so the signal is left untouched.

    # (samples, channels) -> (1, channels, samples), then down-mix to mono.
    y = torch.from_numpy(y).float().T.unsqueeze(0)
    if y.shape[1] != 1:
        y = y.mean(dim=1, keepdim=True)

    num_components = eigvals.shape[0]
    num_rest = num_components - len(pcs)
    if randomise_rest:
        rest = torch.randn(num_rest) * TEMPERATURE
    else:
        rest = torch.zeros(num_rest)
    z = torch.cat([torch.tensor([float(x) for x in pcs]), rest])
    x = U @ z + mean

    # strict=False: the generated state dict need not cover every entry of
    # the chain's own state dict.
    fx.load_state_dict(vec2dict(x), strict=False)
    fx.apply(partial(clip_delay_eq_Q, Q=0.707))

    rendered = fx(y).squeeze(0).T.numpy()
    peak = np.max(np.abs(rendered))
    if peak > 1:
        rendered = rendered / peak

    # A sample at exactly +1.0 scales to 32768, one past the int16 maximum
    # (and the peak normalisation above produces exactly that). Clip so it
    # saturates at 32767 instead of overflowing in the cast.
    pcm = np.clip(rendered * 32768, -32768, 32767).astype(np.int16)
    return (SAMPLE_RATE, pcm)
|
|
|
|
|
def get_important_pcs(n=10, **kwargs):
    """Create the sliders that control the first ``n`` principal components.

    Args:
        n: Number of sliders to create; they are labelled "PC 1" .. "PC n".
        **kwargs: Extra keyword arguments forwarded unchanged to every
            ``gr.Slider`` (for example ``value=0``).

    Returns:
        The list of created sliders, in PC order.
    """
    sliders = []
    for index in range(1, n + 1):
        slider = gr.Slider(
            minimum=SLIDER_MIN,
            maximum=SLIDER_MAX,
            label=f"PC {index}",
            **kwargs,
        )
        sliders.append(slider)
    return sliders
|
|
|
|
|
def model2json():
    """Serialise the current settings of the module-level ``fx`` chain.

    Returns:
        A JSON string with two top-level entries: "Direct" (the first seven
        modules of ``fx`` plus the panner) and "Sends" (the delay, the FDN
        with its tone-correction EQ, and the cross-send level in dB).
    """
    stage_names = ["PK1", "PK2", "LS", "HS", "LP", "HP", "DRC"]

    # zip() stops at the shorter sequence, so only the first seven modules of
    # fx are paired with a name here.
    direct = {name: module.toJSON() for name, module in zip(stage_names, fx)}

    # The module at index 7 holds the panner and the send effects.
    send_stage = fx[7]
    direct["Panner"] = send_stage.pan.toJSON()

    delay = send_stage.effects[0]
    delay_settings = delay.toJSON() | {"LP": delay.eq.toJSON()}

    fdn = send_stage.effects[1]
    fdn_settings = fdn.toJSON() | {
        # The FDN's EQ bands reuse the first four stage names.
        "Tone correction PEQ": {
            name: band.toJSON() for name, band in zip(stage_names[:4], fdn.eq)
        }
    }

    # Linear gain -> decibels.
    cross_send_db = send_stage.params.sends_0.log10().mul(20).item()

    sends = {
        "DLY": delay_settings,
        "FDN": fdn_settings,
        "Cross Send (dB)": cross_send_db,
    }
    return json.dumps({"Direct": direct, "Sends": sends})
|
|
|
|
|
# Page layout and event wiring for the demo.
# NOTE(review): the nesting of the rows/columns below is the most plausible
# reading of the layout (three buttons share one row; checkbox and sliders sit
# under it in the left column) -- confirm against the rendered page.
with gr.Blocks() as demo:
    # Header: title, then the description next to the system diagram.
    gr.Markdown(title_md, elem_id="title")
    with gr.Row():
        gr.Markdown(description_md, elem_id="description")
        gr.Image("diffvox_diagram.png", elem_id="diagram")

    with gr.Row():
        # Left column: input audio and all controls.
        with gr.Column():
            audio_input = gr.Audio(type="numpy", sources="upload", label="Input Audio")
            with gr.Row():
                random_button = gr.Button(
                    f"Randomise the first {NUMBER_OF_PCS} PCs",
                    elem_id="randomise-button",
                )
                reset_button = gr.Button("Reset", elem_id="reset-button")
                render_button = gr.Button(
                    "Run",
                    elem_id="render-button",
                    variant="primary",
                )
            random_rest_checkbox = gr.Checkbox(
                label=f"Randomise PCs > {NUMBER_OF_PCS} (default to zeros)",
                value=False,
                elem_id="randomise-checkbox",
            )
            sliders = get_important_pcs(NUMBER_OF_PCS, value=0)
        # Right column: rendered audio and the effect settings that produced it.
        with gr.Column():
            audio_output = gr.Audio(
                type="numpy",
                label="Output Audio",
                interactive=False,
            )
            json_output = gr.JSON(label="Effect Settings", max_height=800, open=True)

    # Render first, then dump the settings: tuple elements are evaluated left
    # to right, so model2json() sees the parameters inference() just loaded.
    render_button.click(
        lambda *args: (inference(*args), model2json()),
        inputs=[audio_input, random_rest_checkbox, *sliders],
        outputs=[audio_output, json_output],
    )

    # One standard-normal draw per slider, clamped to the slider range.
    random_button.click(
        lambda *values: [
            chain_functions(
                partial(max, SLIDER_MIN),
                partial(min, SLIDER_MAX),
            )(normalvariate(0, 1))
            for _ in values
        ],
        inputs=sliders,
        outputs=sliders,
    )

    # Put every slider back to zero.
    reset_button.click(
        lambda *values: [0] * len(values),
        inputs=sliders,
        outputs=sliders,
    )

demo.launch()
|
|