Spaces:
Runtime error
Runtime error
| import torch | |
| import einops | |
| import gradio as gr | |
| import datetime | |
| import numpy as np | |
| import spaces | |
| import soundfile | |
| import os | |
| import sys | |
| import zipfile | |
| from pathlib import Path | |
| from huggingface_hub import hf_hub_download | |
| sys.path.append("sf-creator-fork") | |
| from main import sfz, decentsampler | |
| # Download models from Hugging Face Hub | |
| decoder_path = hf_hub_download("erl-j/soundfont-generator-assets", "decoder.pt") | |
| model_path = hf_hub_download( | |
| "erl-j/soundfont-generator-assets", "synth_lfm_modern_full.ckpt" | |
| ) | |
| # Load models once at startup | |
| device = "cuda" | |
| decoder = torch.load(decoder_path, map_location=device).eval() | |
| model = torch.load(model_path, map_location=device).eval() | |
| def generate_and_export_soundfont(text, steps=20, instrument_name=None): | |
| sample_start = datetime.datetime.now() | |
| # Generate audio as before | |
| z = model.sample(1, text=[text], steps=steps) | |
| z_reshaped = einops.rearrange(z, "b t c d -> (b c) d t") | |
| with torch.no_grad(): | |
| audio = decoder.decode(z_reshaped) | |
| audio_output = einops.rearrange(audio, "b c t -> c (b t)").cpu().numpy() | |
| audio_output = audio_output / np.max(np.abs(audio_output)) | |
| # Export individual wav files | |
| export_audio = audio.cpu().numpy().astype(np.float32) | |
| output_dir = "output" | |
| os.makedirs(output_dir, exist_ok=True) | |
| # Create instrument name if not provided | |
| if not instrument_name: | |
| instrument_name = text.replace(" ", "_")[:20] | |
| # Save individual WAV files | |
| pitches = [ | |
| "C1", | |
| "F#1", | |
| "C2", | |
| "F#2", | |
| "C3", | |
| "F#3", | |
| "C4", | |
| "F#4", | |
| "C5", | |
| "F#5", | |
| "C6", | |
| "F#6", | |
| "C7", | |
| "F#7", | |
| "C8", | |
| ] | |
| wav_files = [] | |
| for i in range(audio.shape[0]): | |
| wav_path = f"{output_dir}/{pitches[i]}.wav" | |
| soundfile.write(wav_path, export_audio[i].T, 44100) | |
| wav_files.append(wav_path) | |
| # Generate SFZ file | |
| sfz( | |
| directory=output_dir, | |
| lowkey="21", | |
| highkey="108", | |
| instrument=instrument_name, | |
| loopmode="no_loop", | |
| polyphony=None, | |
| ) | |
| # Create zip file containing SFZ and WAV files for the complete soundfont | |
| zip_path = f"{output_dir}/{instrument_name}_package.zip" | |
| with zipfile.ZipFile(zip_path, "w") as zipf: | |
| # Add SFZ file | |
| sfz_file = f"{output_dir}/{instrument_name}.sfz" | |
| zipf.write(sfz_file, os.path.basename(sfz_file)) | |
| # Add all WAV files | |
| for wav_file in wav_files: | |
| if os.path.exists(wav_file): | |
| zipf.write(wav_file, os.path.basename(wav_file)) | |
| total_time = (datetime.datetime.now() - sample_start).total_seconds() | |
| return ( | |
| (44100, audio_output.T), | |
| f"Generation took {total_time:.2f}s\nFiles saved in {output_dir}", | |
| zip_path, | |
| wav_files, | |
| ) | |
| custom_js = open("custom.js").read() | |
| custom_css = open("custom.css").read() | |
| demo = gr.Blocks( | |
| title="Erl-j's Soundfont Generator", | |
| theme=gr.themes.Default( | |
| primary_hue="green", | |
| font=[gr.themes.GoogleFont("Inconsolata"), "Arial", "sans-serif"], | |
| ), | |
| js=custom_js, | |
| css=custom_css, | |
| ) | |
| with demo: | |
| gr.Markdown(open("intro.md").read()) | |
| with gr.Row(): | |
| steps = gr.Slider( | |
| minimum=1, maximum=50, value=20, step=1, label="Generation steps" | |
| ) | |
| with gr.Row(): | |
| text_input = gr.Textbox( | |
| label="Prompt", | |
| placeholder="Enter text description (e.g. 'hard bass', 'sparkly bells')", | |
| lines=2, | |
| ) | |
| with gr.Row(): | |
| generate_btn = gr.Button("Generate Soundfont", variant="primary") | |
| with gr.Row(): | |
| audio_output = gr.Audio(label="Generated Audio Preview", visible=False) | |
| status_output = gr.Textbox(label="Status", lines=2, visible=False) | |
| with gr.Row(): | |
| wav_files = gr.File( | |
| label="Individual WAV Files", | |
| file_count="multiple", | |
| visible=False, | |
| elem_id="individual-wav-files", | |
| ) | |
| gr.Markdown("## Download Soundfont Package here:") | |
| with gr.Row(): | |
| sf = gr.File( | |
| label="Download SFZ Soundfont Package", | |
| type="filepath", | |
| visible=True, | |
| elem_id="sfz", | |
| ) | |
| html = """ | |
| <div id="keyboard-container"></div> | |
| """ | |
| gr.HTML(html) | |
| gr.Markdown(""" | |
| # About | |
| The model is a modified version of [stable audio open](https://huggingface.co/stabilityai/stable-audio-open-1.0). | |
| Unlike the original model, this version uses latent flow matching rather than latent diffusion. | |
| Secondly, the pitches are stacked in a channel dimension rather than concatenated in the time dimension. | |
| This allows for faster generation. | |
| Soundfont export code is based on the [sf-creator](https://github.com/paulwellnerbou/sf-creator) project. | |
| Similar work by Nercessian and Imort: [InstrumentGen](https://instrumentgen.netlify.app/). | |
| Thank you @carlthome for coming up with the name. | |
| To cite this work, please use the following BibTeX entry: | |
| ```bibtex | |
| @misc{erl-j-soundfont-generator, | |
| title={Erl-j's Soundfont Generator}, | |
| author={Nicolas Jonason}, | |
| year={2024}, | |
| publisher={Huggingface}, | |
| } | |
| ``` | |
| """) | |
| generate_btn.click( | |
| fn=generate_and_export_soundfont, | |
| inputs=[text_input, steps], | |
| outputs=[audio_output, status_output, sf, wav_files], | |
| ).success(js="() => console.log('Success')") | |
| text_input.submit( | |
| fn=generate_and_export_soundfont, | |
| inputs=[text_input, steps], | |
| outputs=[audio_output, status_output, sf, wav_files], | |
| ) | |
| if __name__ == "__main__": | |
| print("Starting demo...") | |
| demo.launch() | |