# nesquik / app.py
# hugo flores garcia
# huggingface space exclusive
import os
# print("installing pyharp")
# os.system('pip install "pyharp@git+https://github.com/audacitorch/pyharp.git"')
# print("installing madmom")
os.system('pip install cython')
os.system('pip install madmom')
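# note: madmom's build needs Cython available before it installs, hence the separate cython install above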
from pathlib import Path
from typing import Tuple
import yaml
import tempfile
import uuid
import shutil
from dataclasses import dataclass, asdict
import numpy as np
import audiotools as at
import argbind
import torch
import gradio as gr
from vampnet.interface import Interface
from vampnet import mask as pmask
from pyharp import ModelCard, build_endpoint
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
interface = Interface(
    device=device,
    coarse_ckpt="models/nesquik/coarse.pth",
    coarse2fine_ckpt="models/nesquik/c2f.pth",
    codec_ckpt="models/nesquik/codec.pth",
)
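# the interface bundles the coarse vampnet model, the coarse-to-fine model,
# and the codec used to tokenize / detokenize audio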
# populate the model choices with any interface.yml files in the generated confs
MODEL_CHOICES = {
    "default": {
        "Interface.coarse_ckpt": str(interface.coarse_path),
        "Interface.coarse2fine_ckpt": str(interface.c2f_path),
        "Interface.codec_ckpt": str(interface.codec_path),
    }
}
generated_confs = Path("conf/generated")
for conf_file in generated_confs.glob("*/interface.yml"):
    with open(conf_file) as f:
        _conf = yaml.safe_load(f)
    MODEL_CHOICES[conf_file.parent.name] = _conf
OUT_DIR = Path("gradio-outputs")
OUT_DIR.mkdir(exist_ok=True, parents=True)
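# load_audio: pull a salient excerpt (up to one coarse chunk) from the uploaded file,
# preprocess it, and write a temporary copy under OUT_DIR for the input widget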
def load_audio(file):
    print(file)
    filepath = file.name
    sig = at.AudioSignal.salient_excerpt(
        filepath,
        duration=interface.coarse.chunk_size_s
    )
    sig = interface.preprocess(sig)

    out_dir = OUT_DIR / "tmp" / str(uuid.uuid4())
    out_dir.mkdir(parents=True, exist_ok=True)
    sig.write(out_dir / "input.wav")
    return sig.path_to_file
def load_example_audio():
    return "./assets/example.wav"
def _vamp(sig, data):
    sig = interface.preprocess(sig)
    loudness = sig.loudness()
    print(f"input loudness is {loudness}")
    z = interface.encode(sig)

    # build the mask
    mask = pmask.full_mask(z)
    mask = pmask.mask_and(
        mask, pmask.periodic_mask(
            z,
            data[periodic_p],
            random_roll=True
        )
    )
    # these should be the last two mask ops
    # (no dropout slider is exposed in this UI, so dropout is left at 0.0)
    mask = pmask.dropout(mask, 0.0)
    mask = pmask.codebook_mask(mask, int(data[n_mask_codebooks]))
print(f"sampletemp {data[sampletemp]}")
print(f"num_steps {data[num_steps]}")
print(f"periodic_p {data[periodic_p]}")
print(f"seed {data[seed]}")
_seed = data[seed] if data[seed] > 0 else None
print(f"processing coarse...")
zv, mask_z = interface.coarse_vamp(
z,
mask=mask,
sampling_steps=data[num_steps],
mask_temperature=1.5*10,
sampling_temperature=data[sampletemp],
return_mask=True,
top_p=0.85,
gen_fn=interface.coarse.generate,
seed=_seed,
sample_cutoff=1.0,
)
print(f"processing coarse to fine...")
zv = interface.coarse_to_fine(
zv,
mask_temperature=1.5*10,
sampling_temperature=data[sampletemp],
mask=mask,
sampling_steps=data[num_steps] // 2,
sample_cutoff=1.0,
seed=_seed,
)
sig = interface.to_signal(zv).cpu()
print("done")
return sig
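# process_fn: main gradio callback; clears old outputs, then runs the requested
# number of vamp passes, feeding each pass's output back in as the next input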
def process_fn(data):
    # read the input signal before clearing OUT_DIR (load_audio writes its temp copy there)
    sig = at.AudioSignal(data[input_audio])

    # remove any old files in the output directory (from previous runs)
    shutil.rmtree(OUT_DIR)
    OUT_DIR.mkdir()
    out_dir = OUT_DIR / str(uuid.uuid4())
    out_dir.mkdir()

    # each pass re-runs the vamp pipeline on the previous pass's output
    for _pass in range(data[num_passes]):
        sig = _vamp(sig, data)

    sig.write(out_dir / "output.wav")
    return sig.path_to_file
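# build the gradio UI: audio input on the left column, generation controls and output on the right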
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            gr.Markdown("# nesquik")
gr.Markdown(" the ultimate bitcrusher! will do it's best to convert your instrumental music into an 8-bit chiptune.")
    with gr.Row():
        with gr.Column():
            manual_audio_upload = gr.File(
                label=f"upload some audio (will be randomly trimmed to max of {interface.coarse.chunk_size_s:.2f}s)",
                file_types=["audio"]
            )
            load_example_audio_button = gr.Button("or load example audio")

            input_audio = gr.Audio(
                label="input audio",
                interactive=False,
                type="filepath",
            )

            # connect widgets
            load_example_audio_button.click(
                fn=load_example_audio,
                inputs=[],
                outputs=[input_audio]
            )
            manual_audio_upload.change(
                fn=load_audio,
                inputs=[manual_audio_upload],
                outputs=[input_audio]
            )
        # mask settings
        with gr.Column():
            with gr.Accordion("manual controls", open=True):
                periodic_p = gr.Slider(
                    label="periodic prompt",
                    minimum=1,
                    maximum=3,
                    step=1,
                    value=2,
                )
                n_mask_codebooks = gr.Slider(
                    label="first upper codebook level to mask",
                    minimum=0,
                    maximum=9,
                    value=2,
                    step=1,
                )
                sampletemp = gr.Slider(
                    label="sample temperature",
                    minimum=0.1,
                    maximum=10.0,
                    value=1.0,
                    step=0.001
                )
                num_steps = gr.Slider(
                    label="number of steps (should normally be between 12 and 36)",
                    minimum=1,
                    maximum=128,
                    step=6,
                    value=24
                )
                num_passes = gr.Slider(
                    label="number of passes (more passes = more time, but better results)",
                    minimum=2,
                    maximum=8,
                    step=1,
                    value=4
                )
                seed = gr.Number(
                    label="seed (0 for random)",
                    value=0,
                    precision=0,
                )

            vamp_button = gr.Button("nes, quick!!!!!")

            output_audio = gr.Audio(
                label="output audio",
                interactive=False,
                type="filepath"
            )
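    # passing a set of components as `inputs` makes gradio hand process_fn a single
    # dict mapping each component to its current value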
    _inputs = {
        input_audio,
        num_steps,
        sampletemp,
        periodic_p,
        seed,
        n_mask_codebooks,
        num_passes,
    }
    # connect widgets
    vamp_button.click(
        fn=process_fn,
        inputs=_inputs,
        outputs=[output_audio],
    )
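    # expose the same process_fn through pyharp so the model can also be called as a
    # HARP endpoint (e.g. from a DAW), in addition to the gradio UI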
    build_endpoint(
        inputs=_inputs,
        output=output_audio,
        process_fn=process_fn,
        card=ModelCard(
            name="vampnet",
            description="turn your music into NES music!! quick!! NOTE: vampnet has a maximum context length of 10 seconds. Please split all audio clips into 10 second chunks, or processing will result in an error.",
            author="Hugo Flores García",
            tags=["music", "generative"]
        ),
        visible=False
    )
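# queue() enables gradio's request queue (helpful for long-running generation); launch() starts the app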
demo.queue().launch()