import os os.system("python3 -m pip install -e .") import gradio as gr import note_seq from pytube import YouTube from pydub import AudioSegment from inferencemodel import InferenceModel from utils import upload_audio import nest_asyncio nest_asyncio.apply() SAMPLE_RATE = 16000 SF2_PATH = "SGM-v2.01-Sal-Guit-Bass-V1.3.sf2" # Start inference model inference_model = InferenceModel("/home/user/app/checkpoints/mt3/", "mt3") current_model = "mt3" def change_model(model): global current_model global inference_model checkpoint_path = f"/home/user/app/checkpoints/{model}/" if model == current_model: return inference_model = InferenceModel(checkpoint_path, model) current_model = model print("Inferece model", inference_model) print("Current model", current_model) # Credits https://huggingface.co/spaces/rajesh1729/youtube-video-transcription-with-whisper def get_audio(url): yt = YouTube(url) video = yt.streams.filter(only_audio=True).first() out_file = video.download(output_path=".") base, ext = os.path.splitext(out_file) new_file = base + ".wav" os.rename(out_file, new_file) a = new_file wav_to_cut = AudioSegment.from_file(a) # pydub does things in milliseconds ten_seconds = 10 * 1000 first_10_seconds = wav_to_cut[:ten_seconds] os.remove(new_file) first_10_seconds.export("final_audio.wav", format="wav") return "final_audio.wav" # Credits https://huggingface.co/spaces/jeffistyping/Youtube-Whisperer def populate_metadata(link): yt = YouTube(link) audio = get_audio(link) return yt.thumbnail_url, yt.title, audio def inference(yt_audio_path): with open(yt_audio_path, 'rb') as fd: contents = fd.read() audio = upload_audio(contents,sample_rate=SAMPLE_RATE) est_ns = inference_model(audio) note_seq.sequence_proto_to_midi_file(est_ns, "./transcribed.mid") synth = note_seq.midi_synth.fluidsynth array_of_floats = synth(est_ns, sample_rate=SAMPLE_RATE, sf2_path=SF2_PATH) int16_data = note_seq.audio_io.float_samples_to_int16(array_of_floats) # piano_roll = create_image_from_note_sequence(note_sequence) return "./transcribed.mid", (SAMPLE_RATE, int16_data) title = "Transcribe music from YouTube videos using Transformers." description = """ Gradio demo for Music Transcription with Transformers. Read more in the links below. """ article = "
Blog: Music Transcription with Transformers | Github Repo
" # Create a block object demo = gr.Blocks() # Use your Block object as a context with demo: gr.Markdown("