import os os.system("python3 -m pip install -e .") import gradio as gr import note_seq from pytube import YouTube from pydub import AudioSegment from music21 import converter, environment from inferencemodel import InferenceModel from utils import upload_audio, create_image_from_note_sequence import nest_asyncio nest_asyncio.apply() SAMPLE_RATE = 16000 SF2_PATH = "SGM-v2.01-Sal-Guit-Bass-V1.3.sf2" # Set up music21 with musescore us = environment.UserSettings() us["musescoreDirectPNGPath"] = "/usr/bin/mscore3" os.putenv("QT_QPA_PLATFORM", "offscreen") os.putenv("XDG_RUNTIME_DIR", environment.Environment().getRootTempDir()) # Start inference model inference_model = InferenceModel("/home/user/app/checkpoints/mt3/", "mt3") current_model = "mt3" def change_model(model): global current_model global inference_model print("Inferece model", inference_model) print("Current model", current_model) checkpoint_path = f"/home/user/app/checkpoints/{model}/" if model == current_model: return inference_model = InferenceModel(checkpoint_path, model) current_model = model # Credits https://huggingface.co/spaces/rajesh1729/youtube-video-transcription-with-whisper def get_audio(url, start_second): yt = YouTube(url) video = yt.streams.filter(only_audio=True).first() out_file = video.download(output_path=".") base, ext = os.path.splitext(out_file) new_file = base + ".wav" os.rename(out_file, new_file) a = new_file wav_to_cut = AudioSegment.from_file(a) # pydub does things in milliseconds ten_seconds = 10 * 1000 start_second = start_second * 1000 first_10_seconds = wav_to_cut[start_second:start_second+ten_seconds] os.remove(new_file) first_10_seconds.export("final_audio.wav", format="wav") return "final_audio.wav" # Credits https://huggingface.co/spaces/jeffistyping/Youtube-Whisperer def populate_metadata(link, start_second): yt = YouTube(link) audio = get_audio(link, start_second) return yt.thumbnail_url, yt.title, audio, audio def inference(yt_audio_path): with open(yt_audio_path, 'rb') as fd: contents = fd.read() audio = upload_audio(contents,sample_rate=SAMPLE_RATE) est_ns = inference_model(audio) note_seq.sequence_proto_to_midi_file(est_ns, "./transcribed.mid") synth = note_seq.midi_synth.fluidsynth array_of_floats = synth(est_ns, sample_rate=SAMPLE_RATE, sf2_path=SF2_PATH) int16_data = note_seq.audio_io.float_samples_to_int16(array_of_floats) piano_roll = create_image_from_note_sequence(est_ns) parsed = converter.parse("./transcribed.mid") score = parsed.write("musicxml.png") return "./transcribed.mid", (SAMPLE_RATE, int16_data), piano_roll, score title = "Transcribe music from YouTube videos using Transformers." description = """ Gradio demo for Music Transcription with Transformers. Read more in the links below. To use this demo, just add a YouTube link with the music you want to transcribe. """ article = "

Blog: Music Transcription with Transformers | Github Repo

" # Create a block object demo = gr.Blocks() # Use your Block object as a context with demo: gr.Markdown("

" + title + "

") gr.Markdown(description) with gr.Box(): model_label = """ What kind of model you want to use? The ismir2021 model transcribes piano only, with note velocities. The mt3 model transcribes multiple simultaneous instruments, but without velocities. """ model = gr.Radio( ["mt3", "ismir2021"], label=model_label, value="mt3" ) model.change(fn=change_model, inputs=model, outputs=[]) with gr.Row(): link = gr.Textbox(label="YouTube Link") start_second = gr.Number(label="Select starting point (in seconds) for the transcription", value=0, precision=0) with gr.Row(): preview_btn = gr.Button("Preview") with gr.Row().style(mobile_collapse=False, equal_height=True): title = gr.Label(label="Video Title", placeholder="Title") img = gr.Image(label="Thumbnail") with gr.Row(): yt_audio = gr.Audio() yt_audio_path = gr.Textbox(visible=False) preview_btn.click(fn=populate_metadata, inputs=[link, start_second], outputs=[img, title, yt_audio, yt_audio_path]) with gr.Row(): btn = gr.Button("Transcribe music") with gr.Row(): midi_file = gr.File() midi_audio = gr.Audio() with gr.Row(): piano_roll = gr.Image() score = gr.Image() btn.click(inference, inputs=yt_audio_path, outputs=[midi_file, midi_audio, piano_roll, score]) gr.Markdown(''' [![Twitter Follow](https://img.shields.io/twitter/follow/juancopi81?style=social)](https://twitter.com/juancopi81) ![visitors](https://visitor-badge.glitch.me/badge?page_id=Juancopi81.YoutubeMusicTranscribe) ''') gr.Markdown(article) demo.launch()