# huggingface space exclusive
import os

# print("installing pyharp")
# os.system('pip install "pyharp@git+https://github.com/audacitorch/pyharp.git"')
# print("installing madmom")
# cython must be installed before madmom, since madmom builds against it
os.system('pip install cython')
os.system('pip install madmom')

from pathlib import Path
import uuid
import shutil

import yaml
import audiotools as at
import torch
import gradio as gr

from vampnet.interface import Interface
from vampnet import mask as pmask

from pyharp import ModelCard, build_endpoint

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

interface = Interface(
    device=device,
    coarse_ckpt="models/nesquik/coarse.pth",
    coarse2fine_ckpt="models/nesquik/c2f.pth",
    codec_ckpt="models/nesquik/codec.pth",
)

# populate the model choices with any interface.yml files in the generated confs
MODEL_CHOICES = {
    "default": {
        "Interface.coarse_ckpt": str(interface.coarse_path),
        "Interface.coarse2fine_ckpt": str(interface.c2f_path),
        "Interface.codec_ckpt": str(interface.codec_path),
    }
}
generated_confs = Path("conf/generated")
for conf_file in generated_confs.glob("*/interface.yml"):
    with open(conf_file) as f:
        _conf = yaml.safe_load(f)
        MODEL_CHOICES[conf_file.parent.name] = _conf

OUT_DIR = Path("gradio-outputs")
OUT_DIR.mkdir(exist_ok=True, parents=True)


def load_audio(file):
    print(file)
    filepath = file.name
    # grab a salient excerpt no longer than the model's chunk size
    sig = at.AudioSignal.salient_excerpt(
        filepath, duration=interface.coarse.chunk_size_s
    )
    sig = interface.preprocess(sig)

    out_dir = OUT_DIR / "tmp" / str(uuid.uuid4())
    out_dir.mkdir(parents=True, exist_ok=True)
    sig.write(out_dir / "input.wav")
    return sig.path_to_file


def load_example_audio():
    return "./assets/example.wav"


def _vamp(sig, data):
    sig = interface.preprocess(sig)
    loudness = sig.loudness()
    print(f"input loudness is {loudness}")
    # encode the signal into the codec's discrete token codes
    z = interface.encode(sig)

    # build the mask: everything masked except a periodic prompt
    mask = pmask.full_mask(z)
    mask = pmask.mask_and(
        mask,
        pmask.periodic_mask(z, data[periodic_p], random_roll=True),
    )

    # these should be the last two mask ops
    # NOTE: this demo exposes no dropout control, so dropout is fixed at 0.0 (a no-op)
    mask = pmask.dropout(mask, 0.0)
    mask = pmask.codebook_mask(mask, int(data[n_mask_codebooks]))

    print(f"sampletemp {data[sampletemp]}")
    print(f"num_steps {data[num_steps]}")
    print(f"periodic_p {data[periodic_p]}")
    print(f"seed {data[seed]}")
    _seed = data[seed] if data[seed] > 0 else None

    print("processing coarse...")
    zv, mask_z = interface.coarse_vamp(
        z,
        mask=mask,
        sampling_steps=data[num_steps],
        mask_temperature=1.5 * 10,
        sampling_temperature=data[sampletemp],
        return_mask=True,
        top_p=0.85,
        gen_fn=interface.coarse.generate,
        seed=_seed,
        sample_cutoff=1.0,
    )

    print("processing coarse to fine...")
    zv = interface.coarse_to_fine(
        zv,
        mask_temperature=1.5 * 10,
        sampling_temperature=data[sampletemp],
        mask=mask,
        sampling_steps=data[num_steps] // 2,
        sample_cutoff=1.0,
        seed=_seed,
    )

    sig = interface.to_signal(zv).cpu()
    print("done")
    return sig


def process_fn(data):
    # remove any old files in the output directory (from previous runs)
    shutil.rmtree(OUT_DIR)
    OUT_DIR.mkdir()

    out_dir = OUT_DIR / str(uuid.uuid4())
    out_dir.mkdir()

    sig = at.AudioSignal(data[input_audio])

    # vamp repeatedly, feeding each pass's output back in as the next input
    for _pass in range(data[num_passes]):
        sig = _vamp(sig, data)

    sig.write(out_dir / "output.wav")
    return sig.path_to_file


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            gr.Markdown("# nesquik")
            gr.Markdown(
                "the ultimate bitcrusher! "
                "will do its best to convert your instrumental music into an 8-bit chiptune."
            )

    with gr.Row():
        with gr.Column():
            manual_audio_upload = gr.File(
                label=f"upload some audio (will be randomly trimmed to a max of {interface.coarse.chunk_size_s:.2f}s)",
                file_types=["audio"],
            )
            load_example_audio_button = gr.Button("or load example audio")

            input_audio = gr.Audio(
                label="input audio",
                interactive=False,
                type="filepath",
            )

            # connect widgets
            load_example_audio_button.click(
                fn=load_example_audio,
                inputs=[],
                outputs=[input_audio],
            )

            manual_audio_upload.change(
                fn=load_audio,
                inputs=[manual_audio_upload],
                outputs=[input_audio],
            )

        # mask settings
        with gr.Column():
            with gr.Accordion("manual controls", open=True):
                periodic_p = gr.Slider(
                    label="periodic prompt",
                    minimum=1,
                    maximum=3,
                    step=1,
                    value=2,
                )

                n_mask_codebooks = gr.Slider(
                    label="first upper codebook level to mask",
                    minimum=0,
                    maximum=9,
                    value=2,
                    step=1,
                )

                sampletemp = gr.Slider(
                    label="sample temperature",
                    minimum=0.1,
                    maximum=10.0,
                    value=1.0,
                    step=0.001,
                )

                num_steps = gr.Slider(
                    label="number of steps (should normally be between 12 and 36)",
                    minimum=1,
                    maximum=128,
                    step=6,
                    value=24,
                )

                num_passes = gr.Slider(
                    label="number of passes (more passes = more time, but better results)",
                    minimum=2,
                    maximum=8,
                    step=1,
                    value=4,
                )

                seed = gr.Number(
                    label="seed (0 for random)",
                    value=0,
                    precision=0,
                )

            vamp_button = gr.Button("nes, quick!!!!!")

            output_audio = gr.Audio(
                label="output audio",
                interactive=False,
                type="filepath",
            )

    _inputs = {
        input_audio,
        num_steps,
        sampletemp,
        periodic_p,
        seed,
        n_mask_codebooks,
        num_passes,  # process_fn reads data[num_passes]
    }

    # connect widgets
    vamp_button.click(
        fn=process_fn,
        inputs=_inputs,
        outputs=[output_audio],
    )

    build_endpoint(
        inputs=_inputs,
        output=output_audio,
        process_fn=process_fn,
        card=ModelCard(
            name="vampnet",
            description=(
                "turn your music into NES music!! quick!! "
                "NOTE: vampnet has a maximum context length of 10 seconds. "
                "Please split all audio clips into 10 second chunks, "
                "or processing will result in an error."
            ),
            author="Hugo Flores García",
            tags=["music", "generative"],
        ),
        visible=False,
    )

demo.queue().launch()