import gradio as gr
import torch
from diffusers import StableDiffusionPipeline
from spectro import wav_bytes_from_spectrogram_image

# Load the Riffusion checkpoint in half precision and move it to the GPU.
model_id = "riffusion/riffusion-model-v1"
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
pipe = pipe.to("cuda")


def predict(prompt):
    # Generate a spectrogram image from the text prompt.
    spec = pipe(prompt).images[0]
    # Convert the spectrogram image back into audio; the helper returns
    # (wav_bytes, duration_s), so take the first element.
    wav = wav_bytes_from_spectrogram_image(spec)
    with open("output.wav", "wb") as f:
        f.write(wav[0].getbuffer())
    return "output.wav"


gr.Interface(
    predict,
    inputs="text",
    outputs=[gr.Audio(type="filepath")],
    title="Riffusion Text-to-Music",
    description="Describe a musical prompt; Riffusion generates a spectrogram image and converts it to audio.",
).queue(max_size=32, concurrency_count=20).launch(debug=True)