import gradio as gr
import torch
from PIL import Image
import numpy as np
from spectro import wav_bytes_from_spectrogram_image
from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline
from share_btn import community_icon_html, loading_icon_html, share_js
device = "cuda"
MODEL_ID = "riffusion/riffusion-model-v1"
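
# Both pipelines below load the same Riffusion checkpoint: the first generates
# spectrograms from text alone, the second restyles an existing spectrogram
# image (img2img). torch.float16 assumes a CUDA GPU (this Space runs on a T4).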
pipe = StableDiffusionPipeline.from_pretrained(MODEL_ID, torch_dtype=torch.float16)
pipe = pipe.to(device)
pipe2 = StableDiffusionImg2ImgPipeline.from_pretrained(MODEL_ID, torch_dtype=torch.float16)
pipe2 = pipe2.to(device)
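
# Load another Space as a callable (Gradio 3.x API). Given an uploaded audio
# file, it returns a spectrogram image path, opened with Image.open() below.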
spectro_from_wav = gr.Interface.load("spaces/fffiloni/audio-to-spectrogram")
def predict(prompt, negative_prompt, audio_input, duration):
    # With no input audio, generate from scratch; otherwise restyle the upload
    if audio_input is None:
        return classic(prompt, negative_prompt, duration)
    else:
        return style_transfer(prompt, negative_prompt, audio_input)
def classic(prompt, negative_prompt, duration):
    # 512 px of width covers the 5-second base clip; each extra second adds 128 px
    if duration == 5:
        width_duration = 512
    else:
        width_duration = 512 + ((int(duration) - 5) * 128)
    spec = pipe(prompt, negative_prompt=negative_prompt, height=512, width=width_duration).images[0]
    print(spec)
    wav = wav_bytes_from_spectrogram_image(spec)
    with open("output.wav", "wb") as f:
        f.write(wav[0].getbuffer())
    return spec, 'output.wav', gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
def style_transfer(prompt, negative_prompt, audio_input):
    # Convert the uploaded audio to a spectrogram image via the remote Space
    spec = spectro_from_wav(audio_input)
    print(spec)
    # Open the image and re-encode it as a model-ready spectrogram image
    im = Image.open(spec)
    im = image_from_spectrogram(im, 1)
    # strength=0.5 keeps half of the source structure while restyling toward the prompt
    new_spectro = pipe2(prompt=prompt, image=im, strength=0.5, guidance_scale=7).images
    wav = wav_bytes_from_spectrogram_image(new_spectro[0])
    with open("output.wav", "wb") as f:
        f.write(wav[0].getbuffer())
    return new_spectro[0], 'output.wav', gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
def image_from_spectrogram(
    spectrogram: np.ndarray, max_volume: float = 50, power_for_image: float = 0.25
) -> Image.Image:
    """
    Compute a spectrogram image from a spectrogram magnitude array.
    """
    # Apply the power curve
    data = np.power(spectrogram, power_for_image)
    # Rescale to 0-255
    data = data * 255 / max_volume
    # Invert
    data = 255 - data
    # Convert to a PIL image
    image = Image.fromarray(data.astype(np.uint8))
    # Flip Y
    image = image.transpose(Image.FLIP_TOP_BOTTOM)
    # Convert to RGB
    image = image.convert("RGB")
    return image
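
# Round-trip sketch (hypothetical values; wav_bytes_from_spectrogram_image is
# assumed to return a (BytesIO, ...) tuple, as implied by its use above):
#   magnitudes = np.random.rand(512, 512) * 50   # fake spectrogram magnitudes
#   img = image_from_spectrogram(magnitudes)     # encode magnitudes as an image
#   wav = wav_bytes_from_spectrogram_image(img)  # decode the image back to audio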
title = """
<div style="text-align: center; max-width: 500px; margin: 0 auto;">
<div
style="
display: inline-flex;
align-items: center;
gap: 0.8rem;
font-size: 1.75rem;
margin-bottom: 10px;
line-height: 1em;
"
>
<h1 style="font-weight: 600; margin-bottom: 7px;">
Riffusion real-time music generation
</h1>
</div>
<p style="margin-bottom: 10px;font-size: 94%;font-weight: 100;line-height: 1.5em;">
Describe a musical prompt and generate music as a spectrogram image plus its corresponding sound.
</p>
</div>
"""
article = """
<p style="font-size: 0.8em;line-height: 1.2em;border: 1px solid #374151;border-radius: 8px;padding: 20px;">
About the model: Riffusion is a latent text-to-image diffusion model capable of generating spectrogram images given any text input. These spectrograms can be converted into audio clips.
<br />
<br />The Riffusion model was created by fine-tuning the Stable-Diffusion-v1-5 checkpoint.
<br />
<br />The model is intended for research purposes only. Possible research areas and tasks include
generation of artworks and audio for use in creative processes, applications in educational or creative tools, and research on generative models.
</p>
<div class="footer">
<p>
<a href="https://huggingface.co/riffusion/riffusion-model-v1" target="_blank">Riffusion model</a> by Seth Forsgren and Hayk Martiros -
Demo by 🤗 <a href="https://twitter.com/fffiloni" target="_blank">Sylvain Filoni</a>
</p>
</div>
<p style="text-align: center;font-size: 94%">
Do you need faster results? You can skip the queue by duplicating this Space:
<span style="display: flex;align-items: center;justify-content: center;height: 30px;">
<a href="https://huggingface.co/fffiloni/spectrogram-to-music?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a>
<a href="https://colab.research.google.com/drive/1FhH3HlN8Ps_Pr9OR6Qcfbfz7utDvICl0?usp=sharing" target="_blank"><img src="https://colab.research.google.com/assets/colab-badge.svg" /></a>
</span>
</p>
"""
css = '''
#col-container, #col-container-2 {max-width: 510px; margin-left: auto; margin-right: auto;}
a {text-decoration-line: underline; font-weight: 600;}
div#record_btn > .mt-6 {
margin-top: 0!important;
}
div#record_btn > .mt-6 button {
width: 100%;
height: 40px;
}
.footer {
margin-bottom: 45px;
margin-top: 10px;
text-align: center;
border-bottom: 1px solid #e5e5e5;
}
.footer>p {
font-size: .8rem;
display: inline-block;
padding: 0 10px;
transform: translateY(10px);
background: white;
}
.dark .footer {
border-color: #303030;
}
.dark .footer>p {
background: #0b0f19;
}
.animate-spin {
animation: spin 1s linear infinite;
}
@keyframes spin {
from {
transform: rotate(0deg);
}
to {
transform: rotate(360deg);
}
}
#share-btn-container {
display: flex; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color: #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem;
}
#share-btn {
all: initial; color: #ffffff;font-weight: 600; cursor:pointer; font-family: 'IBM Plex Sans', sans-serif; margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem !important;right:0;
}
#share-btn * {
all: unset;
}
#share-btn-container div:nth-child(-n+2){
width: auto !important;
min-height: 0px !important;
}
#share-btn-container .wrap {
display: none !important;
}
'''
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML(title)
        prompt_input = gr.Textbox(placeholder="a cat diva singing in a New York jazz club", label="Musical prompt", elem_id="prompt-in")
        audio_input = gr.Audio(source="upload", type="filepath", visible=False)
        with gr.Row():
            negative_prompt = gr.Textbox(label="Negative prompt")
            duration_input = gr.Slider(label="Duration in seconds", minimum=5, maximum=10, step=1, value=8, elem_id="duration-slider")
        send_btn = gr.Button(value="Get a new spectrogram!", elem_id="submit-btn")
    with gr.Column(elem_id="col-container-2"):
        spectrogram_output = gr.Image(label="spectrogram image result", elem_id="img-out")
        sound_output = gr.Audio(type='filepath', label="spectrogram sound", elem_id="music-out")
        with gr.Group(elem_id="share-btn-container"):
            community_icon = gr.HTML(community_icon_html, visible=False)
            loading_icon = gr.HTML(loading_icon_html, visible=False)
            share_button = gr.Button("Share to community", elem_id="share-btn", visible=False)
    gr.HTML(article)
    send_btn.click(predict, inputs=[prompt_input, negative_prompt, audio_input, duration_input], outputs=[spectrogram_output, sound_output, share_button, community_icon, loading_icon])
    share_button.click(None, [], [], _js=share_js)
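
# queue() serializes requests to the single GPU; max_size caps how many
# requests may wait in line before new ones are rejected.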
demo.queue(max_size=250).launch(debug=True)