File size: 5,530 Bytes
e9a98af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import os
import torch
import shutil
import librosa
import binascii
import warnings
import midi2audio
import pytube as pt    # to download the youtube videos as audios
import gradio as gr
from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor


# Working directories: downloaded YouTube audio and generated MIDI/WAV files.
yt_video_dir = "./yt_dir"
outputs_dir = "./midi_wav_outputs"
os.makedirs(outputs_dir, exist_ok=True)
os.makedirs(yt_video_dir, exist_ok=True)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano").to(device)
processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano")
# Composer names are the keys of the generation config's composer mapping.
# Materialised as a list because the names are later handed to
# `gr.Dropdown(choices=...)`, which expects a list rather than a live
# `dict_keys` view.
composers = list(model.generation_config.composer_to_feature_token.keys())

def get_audio_from_yt_video(yt_link):
    """Download the first audio-only stream of a YouTube video into ``yt_video_dir``.

    Args:
        yt_link: URL of the YouTube video.

    Returns:
        A ``(filename, filename)`` tuple holding the same path twice, one
        value per Gradio output component wired to this callback. Both
        entries are ``None`` when the download fails; a warning is emitted
        in that case instead of raising.
    """
    filename = None
    try:
        yt = pt.YouTube(yt_link)
        audio_streams = yt.streams.filter(only_audio=True)
        # Random hex name so that two downloads never share a file name.
        target = os.path.join(yt_video_dir, binascii.hexlify(os.urandom(8)).decode() + ".mp4")
        audio_streams[0].download(filename=target)
        filename = target
    except Exception as err:
        # Deliberately best-effort: any failure (bad URL, no audio stream,
        # network error) is reported as a warning. `Exception` rather than a
        # bare `except` so KeyboardInterrupt/SystemExit still propagate.
        warnings.warn(f"Video Not Found at {yt_link} ({type(err).__name__}: {err})")

    return filename, filename

def prepare_output_file(tokenizer_output):
    """Write the first generated MIDI object to disk and render it to WAV.

    Args:
        tokenizer_output: Sequence whose first element exposes
            ``write(path)``; ``inference`` passes the
            ``"pretty_midi_objects"`` entry of the processor's decode result.

    Returns:
        ``(wav_path, wav_path, midi_path)``: the WAV path appears twice
        because it feeds two separate Gradio output components.
    """
    # Add some random values so that no two file names are same
    output_stem = os.path.join(outputs_dir, "output_" + binascii.hexlify(os.urandom(8)).decode())
    # Build both paths from the shared stem instead of str.replace(), which
    # would rewrite every ".mid" occurrence in the path, not just the extension.
    midi_output = output_stem + ".mid"
    wav_output = output_stem + ".wav"

    # write the .mid file
    tokenizer_output[0].write(midi_output)

    # convert .mid file to .wav using `midi2audio`
    midi2audio.FluidSynth().midi_to_audio(midi_output, wav_output)

    return wav_output, wav_output, midi_output
    
def inference(file_uploaded, composer):
    """Generate a piano cover for an audio file in the style of ``composer``.

    Args:
        file_uploaded: Path of the audio file to load.
        composer: Composer (arranger) name forwarded to ``model.generate``.

    Returns:
        Whatever ``prepare_output_file`` returns for the decoded MIDI objects.
    """
    # sr=None keeps the file's native sampling rate. This may silently lower
    # the quality of the generated output; if that happens, consider loading
    # with sr=44100 instead.
    audio, sampling_rate = librosa.load(file_uploaded, sr=None)

    features = processor(audio=audio, sampling_rate=sampling_rate, return_tensors="pt").to(device)
    generated_ids = model.generate(input_features=features["input_features"], composer=composer)

    # Decoding is done on CPU copies of both the token ids and the features.
    decoded = processor.batch_decode(
        token_ids=generated_ids.to("cpu"),
        feature_extractor_output=features.to("cpu"),
    )
    return prepare_output_file(decoded["pretty_midi_objects"])


# Top-level Gradio Blocks container for the demo UI.
block = gr.Blocks()


with block:
    # Page header: title and short usage instructions, rendered as raw HTML.
    gr.HTML(
        """
            <div style="text-align: center; max-width: 700px; margin: 0 auto;">
              <div
                style="
                  display: inline-flex;
                  align-items: center;
                  gap: 0.8rem;
                  font-size: 1.75rem;
                "
              >
                <h1 style="font-weight: 900; margin-bottom: 7px;">
                  Pop2piano
                </h1>
              </div>
              <p style="margin-bottom: 10px; font-size: 94%">
                A demo for Pop2Piano:Pop Audio-based Piano Cover Generation.<br>
                Please select the composer(Arranger) and upload the pop audio or enter the YouTube link and then click Generate.
              </p>
            </div>
        """
    )
    with gr.Group():
        # Input section: either upload an audio file directly or fetch one from YouTube.
        with gr.Box():
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                # type="filepath" so that `inference` receives a path it can load.
                file_uploaded = gr.Audio(label="Upload an audio", type="filepath")

                with gr.Column():
                    with gr.Row():
                        yt_link = gr.Textbox(label="Enter YouTube link of the Video")
                        yt_btn = gr.Button("Get Audio from the YT link(Press this before pressing Generate)")


                    # Read-only player showing the audio extracted from the YouTube video.
                    yt_audio_path = gr.Audio(label="Audio Extracted from the YouTube Video", interactive=False)

                    # The downloaded file path is sent to both the preview player and the
                    # upload widget, so that "Generate" then works on the downloaded audio.
                    yt_btn.click(get_audio_from_yt_video, inputs=[yt_link], outputs=[yt_audio_path, file_uploaded])

        # Arranger selection and the button that triggers generation.
        with gr.Box():
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                composer = gr.Dropdown(label="Arranger", choices=composers, value="composer1")

            with gr.Row().style(mobile_collapse=False, equal_height=True):
                btn = gr.Button("Generate")

        # Output section: WAV download, WAV player and MIDI download.
        with gr.Box():
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                wav_output2 = gr.File(label="Download the Generated MIDI (.wav)")
                wav_output1 = gr.Audio(label="Listen to the Generated MIDI")
                midi_output = gr.File(label="Download the Generated MIDI (.mid)")
                # Output order matches the (wav, wav, midi) tuple returned by `inference`.
                btn.click(inference, inputs=[file_uploaded, composer], outputs=[wav_output1, wav_output2, midi_output])

        # Example (audio path, composer) pairs run through `inference`.
        # cache_examples=True is passed so Gradio caches their outputs.
        # NOTE(review): the files under ./examples are not visible from this file; confirm they exist.
        gr.Examples([
            ["./examples/custom_song.mp3", "composer1"],
            ["./examples/BornThisWay.mp3", "composer1"],
            ["./examples/Sk8erBoi.mp3", "composer2"],
        ],
            fn=inference,
            inputs=[file_uploaded, composer],
            outputs=[wav_output1, wav_output2, midi_output],
            cache_examples=True
        )
        # Footer with a link to the project page.
        gr.HTML(
            """
        <div class="footer">
                    <p><a href="http://sweetcocoa.github.io/pop2piano_samples" style="text-decoration: underline;" target="_blank">Project Page</a>
                    </p>
        </div>
        """
        )

# Start the app, then remove the generated MIDI/WAV files once `launch`
# returns. `finally` makes the cleanup run even if `launch` raises, the
# shared `outputs_dir` constant replaces the duplicated path literal, and
# `ignore_errors=True` keeps shutdown from failing if the directory is
# already gone.
try:
    block.launch(debug=False)
finally:
    shutil.rmtree(outputs_dir, ignore_errors=True)