|
|
|
import os
import subprocess
import sys

# Runtime dependency bootstrap: madmom needs cython present at build time,
# so install cython first.
# NOTE(review): installing packages at import time is fragile — prefer
# declaring these in requirements/setup. Kept for parity with the original
# deployment flow.
#
# Use the current interpreter's pip via `sys.executable -m pip` in list form:
# unlike `os.system("pip install ...")` this cannot hit the wrong interpreter's
# pip and involves no shell string.
for _pkg in ("cython", "madmom"):
    # check=False mirrors os.system's ignore-the-exit-code behavior.
    subprocess.run([sys.executable, "-m", "pip", "install", _pkg], check=False)
|
|
|
from pathlib import Path |
|
from typing import Tuple |
|
import yaml |
|
import tempfile |
|
import uuid |
|
import shutil |
|
from dataclasses import dataclass, asdict |
|
|
|
import numpy as np |
|
import audiotools as at |
|
import argbind |
|
import torch |
|
|
|
import gradio as gr |
|
from vampnet.interface import Interface |
|
from vampnet import mask as pmask |
|
|
|
from pyharp import ModelCard, build_endpoint |
|
# Run on GPU when available; all model weights and inference use this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained "nesquik" VampNet stack: a coarse token generator,
# a coarse-to-fine refiner, and the codec used to (de)tokenize audio.
# NOTE(review): checkpoint paths are relative to the working directory —
# confirm models/nesquik/* exist where the app is launched.
interface = Interface(

    device=device,

    coarse_ckpt="models/nesquik/coarse.pth",

    coarse2fine_ckpt="models/nesquik/c2f.pth",

    codec_ckpt="models/nesquik/codec.pth",

)
|
|
|
|
|
# Registry of model configurations selectable by name. "default" mirrors the
# checkpoints the live `interface` was built from; every generated
# conf/generated/<name>/interface.yml contributes another named entry.
MODEL_CHOICES = {
    "default": {
        "Interface.coarse_ckpt": str(interface.coarse_path),
        "Interface.coarse2fine_ckpt": str(interface.c2f_path),
        "Interface.codec_ckpt": str(interface.codec_path),
    }
}

for _conf_path in Path("conf/generated").glob("*/interface.yml"):
    with open(_conf_path) as _fh:
        # Key each entry by the directory name holding the interface.yml.
        MODEL_CHOICES[_conf_path.parent.name] = yaml.safe_load(_fh)
|
|
|
|
|
|
|
# Root directory for all generated audio; process_fn wipes and recreates it
# on every invocation, so nothing here is persistent.
OUT_DIR = Path("gradio-outputs")

OUT_DIR.mkdir(exist_ok=True, parents=True)
|
|
|
|
|
def load_audio(file):
    """Stage an uploaded file as the model's input signal.

    Extracts a salient excerpt (capped at the coarse model's chunk length)
    from the uploaded audio, preprocesses it, writes it to a unique temp
    directory under OUT_DIR, and returns the written file's path for the
    `input_audio` component.

    Returns None when the upload is cleared — Gradio fires `change` with
    `file=None` in that case, and the original `file.name` access crashed
    with an AttributeError.
    """
    if file is None:
        return None

    filepath = file.name
    # Pick the most salient excerpt, no longer than the model's chunk size.
    sig = at.AudioSignal.salient_excerpt(
        filepath,
        duration=interface.coarse.chunk_size_s
    )
    sig = interface.preprocess(sig)

    # Stage the processed input in its own uuid-named directory so
    # concurrent sessions never collide.
    out_dir = OUT_DIR / "tmp" / str(uuid.uuid4())
    out_dir.mkdir(parents=True, exist_ok=True)
    sig.write(out_dir / "input.wav")
    # NOTE(review): relies on AudioSignal.write recording the written path
    # on the signal (path_to_file) — confirm against audiotools.
    return sig.path_to_file
|
|
|
|
|
def load_example_audio():
    """Return the path of the bundled example clip for the input component."""
    example_path = "./assets/example.wav"
    return example_path
|
|
|
|
|
def _vamp(sig, data):
    """Run one VampNet generation pass over `sig`.

    `data` is the Gradio event payload: a dict keyed by UI components
    (periodic_p, n_mask_codebooks, sampletemp, num_steps, seed).

    Encodes the signal into codec tokens, builds a periodic + codebook
    mask, generates coarse tokens, refines them coarse-to-fine, and
    decodes back to an AudioSignal on the CPU.
    """
    sig = interface.preprocess(sig)

    loudness = sig.loudness()
    print(f"input loudness is {loudness}")

    # Tokenize audio -> discrete codes.
    z = interface.encode(sig)

    # Start fully masked, then AND with a periodic mask so every
    # `periodic_p`-th token (randomly rolled) survives as conditioning.
    mask = pmask.full_mask(z)
    mask = pmask.mask_and(
        mask, pmask.periodic_mask(
            z,
            data[periodic_p],
            random_roll=True
        )
    )

    # BUGFIX: removed `mask = pmask.dropout(mask, data[dropout])` — no
    # `dropout` component exists anywhere in this file, so that lookup
    # raised a NameError as soon as _vamp ran. Dropped the step rather
    # than inventing a probability.
    # Mask all codebook levels at and above the chosen level.
    mask = pmask.codebook_mask(mask, int(data[n_mask_codebooks]))

    print(f"sampletemp {data[sampletemp]}")
    print(f"num_steps {data[num_steps]}")
    print(f"periodic_p {data[periodic_p]}")
    print(f"seed {data[seed]}")

    # Seed 0 means "random" in the UI; pass None so the model self-seeds.
    _seed = data[seed] if data[seed] > 0 else None
    print("processing coarse...")
    zv, mask_z = interface.coarse_vamp(
        z,
        mask=mask,
        sampling_steps=data[num_steps],
        mask_temperature=1.5*10,
        sampling_temperature=data[sampletemp],
        return_mask=True,
        top_p=0.85,
        gen_fn=interface.coarse.generate,
        seed=_seed,
        sample_cutoff=1.0,
    )

    print("processing coarse to fine...")
    zv = interface.coarse_to_fine(
        zv,
        mask_temperature=1.5*10,
        sampling_temperature=data[sampletemp],
        mask=mask,
        # Refinement uses half the coarse step budget.
        sampling_steps=data[num_steps] // 2,
        sample_cutoff=1.0,
        seed=_seed,
    )

    # Decode tokens back to audio and move the result off the GPU.
    sig = interface.to_signal(zv).cpu()
    print("done")
    return sig
|
|
|
|
|
def process_fn(data):
    """Click handler: run `num_passes` vamp passes and return the output path.

    `data` is the Gradio event payload keyed by the components in `_inputs`.
    """
    # BUGFIX (order): read the input signal BEFORE wiping OUT_DIR.
    # load_audio() stages uploads under OUT_DIR/tmp, so the original code
    # deleted the very file it was about to open.
    sig = at.AudioSignal(data[input_audio])

    # Reset the output area; ignore_errors guards the first run / races
    # where OUT_DIR does not exist (plain rmtree raised FileNotFoundError).
    shutil.rmtree(OUT_DIR, ignore_errors=True)
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    out_dir = OUT_DIR / str(uuid.uuid4())
    out_dir.mkdir(parents=True, exist_ok=True)

    # BUGFIX: the original loop body was `pass`, so _vamp was never called
    # and the app returned the input audio untouched. Each pass feeds the
    # previous pass's output back in.
    for _ in range(int(data[num_passes])):
        sig = _vamp(sig, data)

    sig.write(out_dir / "output.wav")

    return sig.path_to_file
|
|
|
|
|
# --- UI layout and event wiring -------------------------------------------
with gr.Blocks() as demo:

    with gr.Row():
        with gr.Column():
            gr.Markdown("# nesquik")
            gr.Markdown(" the ultimate bitcrusher! will do it's best to convert your instrumental music into an 8-bit chiptune.")

    with gr.Row():
        # Left column: audio input (manual upload or bundled example).
        with gr.Column():
            manual_audio_upload = gr.File(
                label=f"upload some audio (will be randomly trimmed to max of {interface.coarse.chunk_size_s:.2f}s)",
                file_types=["audio"]
            )
            load_example_audio_button = gr.Button("or load example audio")

            input_audio = gr.Audio(
                label="input audio",
                interactive=False,
                type="filepath",
            )

            load_example_audio_button.click(
                fn=load_example_audio,
                inputs=[],
                outputs=[input_audio]
            )

            # `change` also fires when the upload is cleared (file=None).
            manual_audio_upload.change(
                fn=load_audio,
                inputs=[manual_audio_upload],
                outputs=[input_audio]
            )

        # Right column: generation controls and output.
        with gr.Column():
            with gr.Accordion("manual controls", open=True):
                periodic_p = gr.Slider(
                    label="periodic prompt",
                    minimum=1,
                    maximum=3,
                    step=1,
                    value=2,
                )

                n_mask_codebooks = gr.Slider(
                    label="first upper codebook level to mask",
                    minimum=0,
                    maximum=9,
                    value=2,
                    step=1,
                )

                sampletemp = gr.Slider(
                    label="sample temperature",
                    minimum=0.1,
                    maximum=10.0,
                    value=1.0,
                    step=0.001
                )

                num_steps = gr.Slider(
                    label="number of steps (should normally be between 12 and 36)",
                    minimum=1,
                    maximum=128,
                    step=6,
                    value=24
                )

                num_passes = gr.Slider(
                    label="number of passes (more passes = more time, but better results)",
                    minimum=2,
                    maximum=8,
                    step=1,
                    value=4
                )

                seed = gr.Number(
                    label="seed (0 for random)",
                    value=0,
                    precision=0,
                )

            vamp_button = gr.Button("nes, quick!!!!!")
            output_audio = gr.Audio(
                label="output audio",
                interactive=False,
                type="filepath"
            )

    # Components whose live values process_fn reads out of the event dict.
    # BUGFIX: num_passes was missing here, so process_fn's
    # `data[num_passes]` lookup raised a KeyError on every click.
    _inputs = {
        input_audio,
        num_steps,
        sampletemp,
        periodic_p,
        seed,
        n_mask_codebooks,
        num_passes,
    }

    vamp_button.click(
        fn=process_fn,
        inputs=_inputs,
        outputs=[output_audio],
    )
|
|
|
|
|
# Register a (hidden) pyHARP endpoint so the same process_fn can be driven
# from a HARP-compatible host as well as the visible Gradio UI.
build_endpoint(

    inputs=_inputs,

    output=output_audio,

    process_fn=process_fn,

    card=ModelCard(

        name="vampnet",

        description="turn your music into NES music!! quick!! NOTE: vampnet's has a maximum context length of 10 seconds. Please split all audio clips into 10 second chunks, or processing will result in an error. ",

        author="Hugo Flores García",

        tags=["music", "generative"]

    ),

    visible=False

)

# Enable request queueing (long-running GPU jobs) and start the server.
demo.queue().launch()