import os

# Install the package defined in this repo's setup.py (editable mode) at
# startup; a common workaround on Hugging Face Spaces.
os.system("python3 -m pip install -e .")

import gradio as gr

import note_seq
from pytube import YouTube
from pydub import AudioSegment
from music21 import converter, environment

from inferencemodel import InferenceModel
from utils import upload_audio, create_image_from_note_sequence

# Allow re-entrant asyncio event loops; needed when code that starts its own
# loop runs inside an environment where a loop is already running.
import nest_asyncio
nest_asyncio.apply()

SAMPLE_RATE = 16000  # MT3 checkpoints expect 16 kHz audio
SF2_PATH = "SGM-v2.01-Sal-Guit-Bass-V1.3.sf2"  # soundfont for FluidSynth playback

# Set up music21 with musescore
us = environment.UserSettings()
us["musescoreDirectPNGPath"] = "/usr/bin/mscore3"
os.putenv("QT_QPA_PLATFORM", "offscreen")
os.putenv("XDG_RUNTIME_DIR", environment.Environment().getRootTempDir())

def load_model(model: str):
    checkpoint_path = f"/home/user/app/checkpoints/{model}/"
    # Start the inference model from the selected checkpoint
    inference_model = InferenceModel(checkpoint_path, model)
    return inference_model

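# A hypothetical call, assuming the "mt3" checkpoint folder exists under
# /home/user/app/checkpoints/:
#   mt3_model = load_model("mt3")
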
# Credits https://huggingface.co/spaces/rajesh1729/youtube-video-transcription-with-whisper
def get_audio(url):
    yt = YouTube(url)
    audio_stream = yt.streams.filter(only_audio=True).first()
    out_file = audio_stream.download(output_path=".")
    base, _ = os.path.splitext(out_file)
    new_file = base + ".wav"
    # pytube audio-only streams are typically MP4/WebM; renaming alone does
    # not convert them, so re-encode to WAV with pydub (requires ffmpeg).
    AudioSegment.from_file(out_file).export(new_file, format="wav")
    os.remove(out_file)
    return new_file

# Credits https://huggingface.co/spaces/jeffistyping/Youtube-Whisperer
def populate_metadata(link):
    yt = YouTube(link)
    audio = get_audio(link)
    # Return the audio path twice: once for the visible player and once for
    # the hidden textbox that feeds the transcription step.
    return yt.thumbnail_url, yt.title, audio, audio

def inference(yt_audio_path, model):
    with open(yt_audio_path, "rb") as fd:
        contents = fd.read()

    audio = upload_audio(contents, sample_rate=SAMPLE_RATE)

    inference_model = load_model(model)

    # Transcribe the audio into a NoteSequence and save it as MIDI
    est_ns = inference_model(audio)
    note_seq.sequence_proto_to_midi_file(est_ns, "./transcribed.mid")

    # Synthesize the transcription back to audio with FluidSynth
    synth = note_seq.midi_synth.fluidsynth
    array_of_floats = synth(est_ns, sample_rate=SAMPLE_RATE, sf2_path=SF2_PATH)
    int16_data = note_seq.audio_io.float_samples_to_int16(array_of_floats)
    piano_roll = create_image_from_note_sequence(est_ns)

    # Render the score as a PNG via music21 + MuseScore
    parsed = converter.parse("./transcribed.mid")
    score = parsed.write("musicxml.png")
    return "./transcribed.mid", (SAMPLE_RATE, int16_data), piano_roll, score
  
title = "Transcribe music from YouTube videos using Transformers."
description = """
Gradio demo for Music Transcription with Transformers. Read more in the links below.
To use this demo, paste in the YouTube link of the music you want to transcribe.
"""
article = "<p style='text-align: center'><a href='https://magenta.tensorflow.org/transcription-with-transformers' target='_blank'>Blog: Music Transcription with Transformers</a> | <a href='https://github.com/magenta/mt3' target='_blank'>Github Repo</a></p>"

# Create a Blocks object
demo = gr.Blocks()

# Use the Blocks object as a context to lay out the UI
with demo:
    gr.Markdown("<h1 style='text-align: center'>" 
                + title 
                + "</h1>")
    gr.Markdown(description)
    with gr.Box():
        with gr.Box():
            model_label = """
            What kind of model you want to use? 
            The ismir2021 model transcribes piano only, with note velocities. 
            The mt3 model transcribes multiple simultaneous instruments, but without velocities.
            """
            model = gr.Radio(
                ["mt3"], 
                label=model_label, 
                value="mt3"
            )

            with gr.Row():
                link = gr.Textbox(label="YouTube Link")
            with gr.Row():
                preview_btn = gr.Button("Preview")
                
        with gr.Box():
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                # `video_title` avoids shadowing the module-level `title`
                # string; gr.Label also takes no placeholder argument.
                video_title = gr.Label(label="Video Title")
                img = gr.Image(label="Thumbnail")
            with gr.Row():
                yt_audio = gr.Audio()
                yt_audio_path = gr.Textbox(visible=False)

            preview_btn.click(fn=populate_metadata,
                              inputs=[link],
                              outputs=[img, video_title, yt_audio, yt_audio_path])
            
            with gr.Row():
                btn = gr.Button("Transcribe music")
        
        with gr.Row():
            midi_file = gr.File()
            midi_audio = gr.Audio()
        with gr.Row():
            piano_roll = gr.Image()
            score = gr.Image()
        btn.click(inference,
                  inputs=[yt_audio_path, model],
                  outputs=[midi_file, midi_audio, piano_roll, score],
                  api_name="transcribe_wav_to_midi")

    gr.Markdown('''
      [![Twitter Follow](https://img.shields.io/twitter/follow/juancopi81?style=social)](https://twitter.com/juancopi81)
      ![visitors](https://visitor-badge.glitch.me/badge?page_id=Juancopi81.YoutubeMusicTranscribe)
    ''')
        
    gr.Markdown(article)
        

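# Note: for long-running transcriptions, enabling Gradio's request queue is
# an option (a sketch, not part of the original app): demo.queue().launch()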
demo.launch()