"""Gradio demo: image-conditioned soundtrack generation.

A ViT encoder turns the input image into a conditioning embedding, an
AudioDiffusionPipeline denoises a Mel spectrogram + audio from it, and a
Pedalboard effects chain post-processes the decoded audio.
"""
import torch
import gradio as gr
from transformers import ViTImageProcessor, ViTModel
from audiodiffusion import AudioDiffusionPipeline, ImageEncoder
# NOTE(review): AudioFile is imported but never used in this file — confirm
# before removing (kept here to avoid breaking anything outside this view).
from pedalboard.io import AudioFile
from pedalboard import (
    Compressor,
    Gain,
    HighShelfFilter,
    LowShelfFilter,
    NoiseGate,
    Pedalboard,
    Reverb,
)

device = "cuda" if torch.cuda.is_available() else "cpu"

# Separate RNGs for the pipeline's main generator and its per-step generator.
generator1 = torch.Generator(device)
generator2 = torch.Generator(device)

pipe = AudioDiffusionPipeline.from_pretrained('Woleek/clMusDiff').to(device)
processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
extractor = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
image_encoder = ImageEncoder(processor, extractor)

# Audio post-processing chain applied to the raw diffusion output.
# NOTE(review): Compressor(threshold_db=60, ratio=1.0) is effectively a no-op
# (1:1 ratio, threshold above 0 dBFS) — confirm whether a negative threshold
# and a ratio > 1 were intended.
board = Pedalboard([
    NoiseGate(threshold_db=-60, ratio=10.0),
    Compressor(threshold_db=60, ratio=1.0),
    LowShelfFilter(cutoff_frequency_hz=220, gain_db=-10),
    HighShelfFilter(cutoff_frequency_hz=1200, gain_db=-10),
    Gain(gain_db=40),
    Reverb(room_size=0.5),
])


def _encode_image(image):
    """Encode a PIL image and insert a length-1 axis at position 1.

    Uses the documented PyTorch keyword ``dim`` (the original ``axis=`` is a
    NumPy-style name that is not part of ``torch.unsqueeze``'s signature).
    """
    return torch.unsqueeze(image_encoder.encode(image), dim=1).to(device)


def _generate_spectrogram(condition, steps, eta):
    """Run the diffusion pipeline for one sample.

    Returns:
        (spectrogram_image, (sample_rate, audio)) for the single generated item.
    """
    images, (sample_rate, audios) = pipe(
        batch_size=1,
        steps=steps,
        generator=generator1,
        step_generator=generator2,
        encoding=condition,
        eta=eta,
        return_dict=False,
    )
    # batch_size=1, so take the single spectrogram/audio out of the batch.
    return images[0], (sample_rate, audios[0])


def _denoise_audio(audio, sr):
    """Apply the Pedalboard effects chain to the raw generated audio."""
    return board(audio, sr)


def run_generation(image, steps, eta):
    """End-to-end generation: image -> conditioning -> spectrogram/audio -> cleaned audio.

    Args:
        image: conditioning PIL image from the UI.
        steps: number of denoising steps (slider value).
        eta: DDIM eta in [0, 1]; 0 = deterministic, 1 = fully stochastic.

    Returns:
        (spectrogram_image, (sample_rate, processed_audio)) for the Gradio outputs.
    """
    condition = _encode_image(image)
    spectrogram, (sr, audio) = _generate_spectrogram(condition, steps, eta)
    audio = _denoise_audio(audio, sr)
    return spectrogram, (sr, audio)


with gr.Blocks(title="Image-based soundtrack generation") as demo:
    gr.Markdown(''' # Image-based soundtrack generation ''')
    with gr.Row():
        with gr.Column():
            image = gr.Image(type="pil", label="Conditioning image")
            steps = gr.Slider(
                minimum=10, maximum=1000, step=10, value=50,
                label="Denoising steps",
            )
            eta = gr.Slider(
                minimum=0.0, maximum=1.0, step=0.1, value=0.6, label="η",
            )
            gr.Markdown(''' Eta (η) is a variable 
that controls the level of interpolation between deterministic (η=0.0) and stochastic (η=1.0) denoising schedule. ''')
            btn = gr.Button("Generate")
            clear = gr.ClearButton(image)
        with gr.Column():
            spectrogram = gr.Image(label="Generated Mel spectrogram")
            audio = gr.Audio(label="Resulting audio")
    btn.click(run_generation, inputs=[image, steps, eta], outputs=[spectrogram, audio])


# Guarded so importing this module (e.g. for testing) doesn't start a server.
if __name__ == "__main__":
    demo.launch()