"""Gradio demo: transcribe the first seconds of a YouTube video's audio to MIDI."""
import os

# Environment bootstrap; kept ahead of the remaining imports, as in the
# original script.
os.system("python3 -m pip install -e .")
os.system("add-apt-repository ppa:mscore-ubuntu/mscore3-stable")

import gradio as gr
import note_seq
from pytube import YouTube
from pydub import AudioSegment

from inferencemodel import InferenceModel
from utils import upload_audio, create_image_from_note_sequence

import nest_asyncio
nest_asyncio.apply()

SAMPLE_RATE = 16000
SF2_PATH = "SGM-v2.01-Sal-Guit-Bass-V1.3.sf2"

# Start inference model
inference_model = InferenceModel("/home/user/app/checkpoints/mt3/", "mt3")
current_model = "mt3"


def change_model(model):
    """Replace the global inference model with the checkpoint named ``model``.

    The current model objects are printed for debugging on every call. When
    ``model`` is already the active one, nothing is reloaded.
    """
    global current_model
    global inference_model
    print("Inferece model", inference_model)
    print("Current model", current_model)
    if model == current_model:
        return
    checkpoint_path = f"/home/user/app/checkpoints/{model}/"
    inference_model = InferenceModel(checkpoint_path, model)
    current_model = model


# Credits https://huggingface.co/spaces/rajesh1729/youtube-video-transcription-with-whisper
def get_audio(url):
    """Download a YouTube video's audio and export its first 10 seconds.

    Args:
        url: Link to the YouTube video.

    Returns:
        The path of the exported clip, always ``"final_audio.wav"``.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    yt = YouTube(url)
    video = yt.streams.filter(only_audio=True).first()
    if video is None:
        # .first() yields None on an empty query; fail with a clear message
        # instead of an AttributeError on the next line.
        raise ValueError(f"No audio-only stream available for {url!r}")

    out_file = video.download(output_path=".")
    base, _ext = os.path.splitext(out_file)
    new_file = base + ".wav"
    os.rename(out_file, new_file)
    try:
        wav_to_cut = AudioSegment.from_file(new_file)
    finally:
        # Always drop the downloaded file, even when decoding fails.
        os.remove(new_file)

    # pydub does things in milliseconds
    ten_seconds = 10 * 1000
    first_10_seconds = wav_to_cut[:ten_seconds]
    first_10_seconds.export("final_audio.wav", format="wav")
    return "final_audio.wav"


# Credits https://huggingface.co/spaces/jeffistyping/Youtube-Whisperer
def populate_metadata(link):
    """Return thumbnail URL, title and the clipped audio path (twice) for ``link``.

    The audio path is returned twice because it feeds both the audio player
    and the hidden textbox that carries the path to ``inference``.
    """
    yt = YouTube(link)
    audio = get_audio(link)
    return yt.thumbnail_url, yt.title, audio, audio


def inference(yt_audio_path):
    """Transcribe the audio file at ``yt_audio_path`` with the active model.

    Writes the transcription to ``./transcribed.mid`` and synthesizes it with
    fluidsynth using ``SF2_PATH``.

    Returns:
        A tuple ``(midi_path, (SAMPLE_RATE, int16_samples), piano_roll)``
        where ``piano_roll`` is whatever ``create_image_from_note_sequence``
        produces for the estimated note sequence.
    """
    with open(yt_audio_path, "rb") as fd:
        contents = fd.read()

    audio = upload_audio(contents, sample_rate=SAMPLE_RATE)
    est_ns = inference_model(audio)
    note_seq.sequence_proto_to_midi_file(est_ns, "./transcribed.mid")

    synth = note_seq.midi_synth.fluidsynth
    array_of_floats = synth(est_ns, sample_rate=SAMPLE_RATE, sf2_path=SF2_PATH)
    int16_data = note_seq.audio_io.float_samples_to_int16(array_of_floats)
    piano_roll = create_image_from_note_sequence(est_ns)
    return "./transcribed.mid", (SAMPLE_RATE, int16_data), piano_roll


title = "Transcribe music from YouTube videos using Transformers."
description = (
    " Gradio demo for Music Transcription with Transformers. "
    "Read more in the links below. "
)
# NOTE(review): this string (and the heading below) looks like it lost HTML
# markup/links at some point; only the surviving text is kept - confirm.
article = "\n\nBlog: Music Transcription with Transformers | Github Repo\n\n"

# Create a block object
demo = gr.Blocks()

# Use your Block object as a context
# NOTE(review): the original indentation was lost; the nesting below (in
# particular where the gr.Box() ends) is reconstructed - confirm the layout.
with demo:
    gr.Markdown("\n\n" + title + "\n\n")
    gr.Markdown(description)
    with gr.Box():
        model_label = (
            " What kind of model you want to use? "
            "The ismir2021 model transcribes piano only, with note velocities. "
            "The mt3 model transcribes multiple simultaneous instruments, "
            "but without velocities. "
        )
        model = gr.Radio(
            ["mt3", "ismir2021"],
            label=model_label,
            value="mt3"
        )
        model.change(fn=change_model, inputs=model, outputs=[])

        link = gr.Textbox(label="YouTube Link")
        with gr.Row().style(mobile_collapse=False, equal_height=True):
            # Named video_title so the page-title string above is not rebound.
            video_title = gr.Label(label="Video Title", placeholder="Title")
            img = gr.Image(label="Thumbnail")
        with gr.Row():
            yt_audio = gr.Audio()
            # Hidden field that carries the clip path to inference().
            yt_audio_path = gr.Textbox(visible=False)

        link.change(
            fn=populate_metadata,
            inputs=link,
            outputs=[img, video_title, yt_audio, yt_audio_path],
        )

    with gr.Row():
        btn = gr.Button("Transcribe music")
    with gr.Row():
        midi_file = gr.File()
        midi_audio = gr.Audio()
    with gr.Row():
        piano_roll = gr.Image()

    btn.click(
        inference,
        inputs=yt_audio_path,
        outputs=[midi_file, midi_audio, piano_roll],
    )
    gr.Markdown(article)

demo.launch()