Spaces:

susnato
/

pop2piano_dev

Runtime error

File size: 7,246 Bytes

e9a98af
 
 
 
 
c8ce7ce
 
e9a98af
 
c8ce7ce
e9a98af
 
 
 
 
 
 
 
 
 
 
 
 
cd7c7f4
e9a98af
 
 
 
 
 
 
 
 
 
 
c8ce7ce
 
 
 
 
 
 
 
 
 
 
e9a98af
c8ce7ce
e9a98af
 
 
 
 
 
 
 
 
 
c8ce7ce
e9a98af
c8ce7ce
340707d
c8ce7ce
 
 
 
 
 
 
 
e9a98af
c8ce7ce
 
e9a98af
c8ce7ce
e9a98af
 
e3c975e
cef9668
e3c975e
e9a98af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36b4085
 
 
cef9668
 
09e9bd4
 
e9a98af
cef9668
 
 
 
 
 
c8ce7ce
1fbd075
cef9668
 
 
 
c8ce7ce
 
 
 
 
 
 
 
 
 
340707d
c8ce7ce
 
 
 
 
340707d
e9a98af
cef9668
e9a98af
 
 
 
 
 
 
 
 
 
 
cef9668
 
 
 
 
 
 
 
 
6b6ff8d
c8ce7ce
e9a98af
 
 
 
 
c8ce7ce

import os
import torch
import librosa
import binascii
import warnings
import midi2audio      # to convert midi to wav
import numpy as np
import pytube as pt    # to download the youtube videos as audios
import gradio as gr
import soundfile as sf # to make the stereo mix
from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor


# Working directories: one for audio downloaded from YouTube, one for the
# generated .mid/.wav files. Both are created at import time if missing.
yt_video_dir = "./yt_dir"
outputs_dir = "./midi_wav_outputs"
for _workdir in (outputs_dir, yt_video_dir):
    os.makedirs(_workdir, exist_ok=True)

# Use the GPU when torch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The model and its processor are loaded from the same pretrained checkpoint;
# only the model is moved to `device`.
model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano")
model = model.to(device)
processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano")

# Composer names offered in the UI: the keys of the model's
# `composer_to_feature_token` generation-config mapping.
composers = model.generation_config.composer_to_feature_token.keys()


def get_audio_from_yt_video(yt_link):
    """Download the first audio-only stream of a YouTube video.

    Args:
        yt_link: URL of the YouTube video.

    Returns:
        A ``(filename, filename)`` pair holding the path of the downloaded
        ``.mp4`` file twice (the click handler wires this function to two
        output components). Both entries are ``None`` when the link is
        empty/blank or when the download fails; a warning is emitted in
        that case.
    """
    # Reject empty input up front instead of letting the downloader fail on it.
    if not yt_link or not str(yt_link).strip():
        warnings.warn(f"Video Not Found at {yt_link}")
        return None, None

    # Random hex name so that two downloads never share a file.
    filename = os.path.join(yt_video_dir, binascii.hexlify(os.urandom(8)).decode() + ".mp4")
    try:
        audio_streams = pt.YouTube(yt_link).streams.filter(only_audio=True)
        audio_streams[0].download(filename=filename)
    except Exception as err:
        # Best effort: any download failure degrades to "no audio" rather than
        # crashing the UI. `Exception` (not a bare except) keeps
        # KeyboardInterrupt/SystemExit propagating.
        warnings.warn(f"Video Not Found at {yt_link} ({err!r})")
        filename = None

    return filename, filename
    
def inference(file_uploaded, composer):
    """Generate a piano cover of an audio file.

    Args:
        file_uploaded: Path of the audio file to convert.
        composer: Composer (arranger) name forwarded to ``model.generate``.

    Returns:
        The ``(wav_path, wav_path, midi_path)`` tuple produced by
        ``prepare_output_file``.

    Raises:
        gr.Error: If no audio file was supplied.
    """
    if not file_uploaded:
        # Surface a readable message in the UI instead of an opaque loader error.
        raise gr.Error("Please upload an audio file or download one from a YouTube link first.")

    # sr=None keeps the file's native sampling rate. This can silently lower
    # the quality of the generated output; if that happens, consider loading
    # at 44100 Hz instead.
    waveform, sr = librosa.load(file_uploaded, sr=None)

    inputs = processor(audio=waveform, sampling_rate=sr, return_tensors="pt").to(device)
    model_output = model.generate(input_features=inputs["input_features"], composer=composer)

    # Decoding is done on the CPU: both the generated token ids and the
    # feature-extractor output are moved there first.
    decoded = processor.batch_decode(
        token_ids=model_output.to("cpu"),
        feature_extractor_output=inputs.to("cpu"),
    )
    tokenizer_output = decoded["pretty_midi_objects"]

    return prepare_output_file(tokenizer_output, sr)

def prepare_output_file(tokenizer_output, sr):
    """Write the first generated MIDI object to disk and render it to WAV.

    Args:
        tokenizer_output: Sequence whose first element exposes a
            ``write(path)`` method (the decoded MIDI object).
        sr: Sampling rate of the source audio. Accepted for interface
            compatibility; it is not used by this function.

    Returns:
        ``(wav_path, wav_path, midi_path)`` - the WAV path appears twice
        because it feeds both an audio player and a file download.
    """
    # Random hex suffix so that no two output files share a name.
    output_stem = os.path.join(outputs_dir, "output_" + binascii.hexlify(os.urandom(8)).decode())

    # Build both paths from the same stem rather than string-replacing ".mid",
    # which would also rewrite any ".mid" occurring elsewhere in the path.
    midi_output = output_stem + ".mid"
    wav_output = output_stem + ".wav"

    # write the .mid file
    tokenizer_output[0].write(midi_output)

    # convert .mid file to .wav using `midi2audio`
    midi2audio.FluidSynth().midi_to_audio(midi_output, wav_output)

    return wav_output, wav_output, midi_output

def get_stereo(pop_path, midi, pop_scale=0.5):
    """Combine the pop audio and the generated piano audio into a stereo WAV.

    The piano rendering goes to the first channel and the pop audio,
    multiplied by ``pop_scale``, to the second. The shorter track is
    zero-padded at the end so both channels have the same length.

    Args:
        pop_path: Path of the original pop audio file.
        midi: The rendered piano audio, either as a path string or as an
            object exposing the path through a ``name`` attribute.
        pop_scale: Gain applied to the pop channel.

    Returns:
        ``(stereo_mix_path, stereo_mix_path)`` - the path of the written
        stereo file, twice (audio player + file download).

    Raises:
        gr.Error: If either input is missing.
    """
    if not pop_path or midi is None:
        raise gr.Error("Both the pop audio and the generated piano audio are required for the stereo mix.")

    # Accept both a file-like object (with `.name`) and a plain path string.
    midi_path = getattr(midi, "name", midi)

    pop_y, sr = librosa.load(pop_path, sr=None)
    # Load the piano track at the pop track's sampling rate so the two
    # channels stay aligned in time even if the files were stored at
    # different rates.
    midi_y, _ = librosa.load(midi_path, sr=sr)

    # Zero-pad the shorter signal to the length of the longer one.
    target_len = max(len(pop_y), len(midi_y))
    midi_y = np.pad(midi_y, (0, target_len - len(midi_y)))
    pop_y = np.pad(pop_y, (0, target_len - len(pop_y)))

    # NOTE(review): only the pop channel is scaled; the piano channel is
    # always written at full level.
    stereo = np.stack((midi_y, pop_y * pop_scale))

    # Write to a fresh file in the outputs directory. Deriving the name from
    # `pop_path` could leave the path unchanged (overwriting the source audio)
    # or rewrite a directory component.
    stereo_mix_name = "output_stereo_mix_" + binascii.hexlify(os.urandom(8)).decode() + ".wav"
    stereo_mix_path = os.path.join(outputs_dir, stereo_mix_name)
    sf.write(file=stereo_mix_path, data=stereo.T, samplerate=sr, format="wav")

    return stereo_mix_path, stereo_mix_path


# Thanks a lot to "https://huggingface.co/Taithrah" for this theme.
# taken from https://huggingface.co/spaces/NoCrypt/miku
block = gr.Blocks(theme="Taithrah/Minimal")

with block:
    # ---- Page header -------------------------------------------------------
    gr.HTML(
        """
            <div style="text-align: center; max-width: 700px; margin: 0 auto;">
              <div
                style="
                  display: inline-flex;
                  align-items: center;
                  gap: 0.8rem;
                  font-size: 1.75rem;
                "
              >
                <h1 style="font-weight: 900; margin-bottom: 7px;">
                  Pop2piano
                </h1>
              </div>
              <p style="margin-bottom: 10px; font-size: 94%">
                A demo for Pop2Piano:Pop Audio-based Piano Cover Generation.<br>
                Please select the composer(Arranger) and upload the pop audio or enter the YouTube link and then click Generate.
              </p>
            </div>
        """
    )

    # ---- Input: uploaded file or audio pulled from a YouTube link ----------
    with gr.Group():
        with gr.Row(equal_height=True):
            with gr.Column():
                file_uploaded = gr.Audio(label="Upload an audio", type="filepath")
            with gr.Column():
                with gr.Row():
                    yt_link = gr.Textbox(label="Enter YouTube Link of the Video", autofocus=True, lines=3)
                    yt_btn = gr.Button("Download Audio from YouTube Link", size="lg")

                yt_audio_path = gr.Audio(label="Audio Extracted from the YouTube Video", interactive=False)
                # The downloaded file is shown in the preview player and also
                # placed into the upload slot, so "Generate" works on it.
                yt_btn.click(get_audio_from_yt_video, inputs=[yt_link], outputs=[yt_audio_path, file_uploaded])

    # ---- Generation: choose an arranger and run the model ------------------
    with gr.Group():
        with gr.Column():
            composer = gr.Dropdown(label="Arranger", choices=list(composers), value="composer1")
            generate_btn = gr.Button("Generate")

        # Layout options are passed to the constructor, as for the input row
        # above, instead of through the legacy `.style()` call.
        with gr.Row(equal_height=True):
            wav_output2 = gr.File(label="Download the Generated MIDI (.wav)")
            wav_output1 = gr.Audio(label="Listen to the Generated MIDI")
            midi_output = gr.File(label="Download the Generated MIDI (.mid)")
            generate_btn.click(inference,
                               inputs=[file_uploaded, composer],
                               outputs=[wav_output1, wav_output2, midi_output])

    # ---- Stereo mix of the pop audio and the generated piano ---------------
    with gr.Group():
        gr.HTML(
            """
            <div> <h3> <center> Get the Stereo Mix from the Pop Music and Generated MIDI </h3> </div>
            """
        )
        # No trailing comma here: `pop_scale` must be the Slider itself, not a 1-tuple.
        pop_scale = gr.Slider(0, 1, value=0.5, label="Choose the ratio between Pop and MIDI", info="1.0 = Only Pop, 0.0=Only MIDI", interactive=True)
        stereo_btn = gr.Button("Get Stereo Mix")
        with gr.Row():
            stereo_mix1 = gr.Audio(label="Listen to the Stereo Mix")
            stereo_mix2 = gr.File(label="Download the Stereo Mix")

        stereo_btn.click(get_stereo, inputs=[file_uploaded, wav_output2, pop_scale], outputs=[stereo_mix1, stereo_mix2])

    # ---- Examples and footer -----------------------------------------------
    with gr.Group():
        gr.Examples([
            ["./examples/custom_song.mp3", "composer1"],
        ],
            fn=inference,
            inputs=[file_uploaded, composer],
            outputs=[wav_output1, wav_output2, midi_output],
            cache_examples=True
        )
        gr.HTML(
            """
        <div class="footer">
                    <center>The design for this Space is taken from <a href="https://huggingface.co/spaces/NoCrypt/miku"> NoCrypt/miku </a>
        </div>
        """
        )

        gr.HTML(
            """
        <div class="footer">
                    <center><p><a href="http://sweetcocoa.github.io/pop2piano_samples" style="text-decoration: underline;" target="_blank">Project Page</a> 
                    <center><a href="https://huggingface.co/docs/transformers/main/model_doc/pop2piano" style="text-decoration: underline;" target="_blank">HuggingFace Model Docs</a>
                    <center><a href="https://github.com/sweetcocoa/pop2piano" style="text-decoration: underline;" target="_blank">Github</a>
                    </p>
        </div>
        """
        )

block.launch(debug=False)