"""Gradio Space: transcribe music from YouTube videos with MT3 transformers."""

import os

# NOTE(review): installing the package at import time is a Hugging Face Spaces
# bootstrap hack; it runs on every restart. Prefer the Space's build step
# (requirements.txt) if possible.
os.system("python3 -m pip install -e .")

import gradio as gr
import nest_asyncio
import note_seq
from music21 import converter, environment
from pydub import AudioSegment  # kept: may be relied on by transitive setup
from pytube import YouTube

from inferencemodel import InferenceModel
from utils import upload_audio, create_image_from_note_sequence

nest_asyncio.apply()

SAMPLE_RATE = 16000
SF2_PATH = "SGM-v2.01-Sal-Guit-Bass-V1.3.sf2"

# Set up music21 with MuseScore so scores can be rendered headlessly to PNG.
us = environment.UserSettings()
us["musescoreDirectPNGPath"] = "/usr/bin/mscore3"
# BUG FIX: os.putenv does not update os.environ, so the current process (and
# libraries reading the environment through Python) may never see the values.
# Also coerce getRootTempDir() (a path object) to str explicitly.
os.environ["QT_QPA_PLATFORM"] = "offscreen"
os.environ["XDG_RUNTIME_DIR"] = str(environment.Environment().getRootTempDir())


def load_model(model: str = "mt3") -> InferenceModel:
    """Build an InferenceModel for the named checkpoint directory.

    BUG FIX: the original signature was ``def load_model(model=str):`` which
    made the *type* ``str`` the default argument instead of annotating the
    parameter; a no-arg call would have produced the path
    ``/home/user/app/checkpoints/<class 'str'>/``.

    Args:
        model: Checkpoint name, e.g. "mt3" or "ismir2021".

    Returns:
        An InferenceModel bound to that checkpoint.
    """
    checkpoint_path = f"/home/user/app/checkpoints/{model}/"
    return InferenceModel(checkpoint_path, model)


# Credits https://huggingface.co/spaces/rajesh1729/youtube-video-transcription-with-whisper
def get_audio(url: str) -> str:
    """Download a YouTube video's audio stream and return the local file path.

    NOTE(review): the file is only *renamed* to ``.wav`` — the actual container
    is whatever pytube downloaded (typically mp4/webm audio). Downstream
    readers happen to cope; re-encode with pydub if a real WAV is required.
    """
    yt = YouTube(url)
    stream = yt.streams.filter(only_audio=True).first()
    downloaded = stream.download(output_path=".")
    base, _ext = os.path.splitext(downloaded)
    wav_path = base + ".wav"
    os.rename(downloaded, wav_path)
    return wav_path


# Credits https://huggingface.co/spaces/jeffistyping/Youtube-Whisperer
def populate_metadata(link: str):
    """Return (thumbnail_url, title, audio_path, audio_path) for a YouTube link.

    The audio path is returned twice: once for the visible audio player and
    once for the hidden textbox that feeds the transcription step.
    """
    yt = YouTube(link)
    audio = get_audio(link)
    return yt.thumbnail_url, yt.title, audio, audio


def inference(yt_audio_path: str, model: str):
    """Transcribe an audio file to MIDI and render previews.

    Args:
        yt_audio_path: Path of the downloaded audio file.
        model: Checkpoint name passed through to load_model.

    Returns:
        Tuple of (midi file path, (sample_rate, int16 synthesized audio),
        piano-roll image, rendered score image path).
    """
    with open(yt_audio_path, "rb") as fd:
        contents = fd.read()

    audio = upload_audio(contents, sample_rate=SAMPLE_RATE)

    inference_model = load_model(model)
    est_ns = inference_model(audio)

    note_seq.sequence_proto_to_midi_file(est_ns, "./transcribed.mid")

    # Synthesize the transcription so users can listen to the result.
    synth = note_seq.midi_synth.fluidsynth
    array_of_floats = synth(est_ns, sample_rate=SAMPLE_RATE, sf2_path=SF2_PATH)
    int16_data = note_seq.audio_io.float_samples_to_int16(array_of_floats)

    piano_roll = create_image_from_note_sequence(est_ns)

    # Render engraved sheet music via music21 + MuseScore.
    parsed = converter.parse("./transcribed.mid")
    score = parsed.write("musicxml.png")

    return "./transcribed.mid", (SAMPLE_RATE, int16_data), piano_roll, score


title = "Transcribe music from YouTube videos using Transformers."
description = """
Gradio demo for Music Transcription with Transformers. Read more in the links below.
To use this demo, just add a YouTube link with the music you want to transcribe.
"""
# NOTE(review): the original HTML markup of `article` and the heading below was
# stripped when this file was mangled; reconstructed with the same visible text.
article = (
    "<p style='text-align: center'>"
    "<a href='https://magenta.tensorflow.org/transcription-with-transformers'"
    " target='_blank'>Blog: Music Transcription with Transformers</a> | "
    "<a href='https://github.com/magenta/mt3' target='_blank'>Github Repo</a>"
    "</p>"
)

# Create a block object
demo = gr.Blocks()

# Use your Block object as a context
with demo:
    gr.Markdown("<h1 style='text-align: center'>" + title + "</h1>")
    gr.Markdown(description)
    with gr.Box():
        with gr.Box():
            model_label = """
            What kind of model you want to use?
            The ismir2021 model transcribes piano only, with note velocities.
            The mt3 model transcribes multiple simultaneous instruments, but without velocities.
            """
            model = gr.Radio(["mt3"], label=model_label, value="mt3")
        with gr.Row():
            link = gr.Textbox(label="YouTube Link")
        with gr.Row():
            preview_btn = gr.Button("Preview")
        with gr.Box():
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                # BUG FIX: the component was named `title`, shadowing the
                # module-level page title string defined above.
                video_title = gr.Label(label="Video Title", placeholder="Title")
                img = gr.Image(label="Thumbnail")
            with gr.Row():
                yt_audio = gr.Audio()
                yt_audio_path = gr.Textbox(visible=False)
            preview_btn.click(
                fn=populate_metadata,
                inputs=[link],
                outputs=[img, video_title, yt_audio, yt_audio_path],
            )
        with gr.Row():
            btn = gr.Button("Transcribe music")
        with gr.Row():
            midi_file = gr.File()
            midi_audio = gr.Audio()
        with gr.Row():
            piano_roll = gr.Image()
            score = gr.Image()
        btn.click(
            inference,
            inputs=[yt_audio_path, model],
            outputs=[midi_file, midi_audio, piano_roll, score],
            api_name="transcribe_wav_to_midi",
        )
    gr.Markdown('''
    [![Twitter Follow](https://img.shields.io/twitter/follow/juancopi81?style=social)](https://twitter.com/juancopi81)
    ![visitors](https://visitor-badge.glitch.me/badge?page_id=Juancopi81.YoutubeMusicTranscribe)
    ''')
    gr.Markdown(article)

demo.launch()