Spaces:

juancopi81
/

youtube-music-transcribe

Build error

App Files Files Community

youtube-music-transcribe / app.py

juancopi81

Add musescoreDirectPNGPath

d58023a over 2 years ago

raw

history blame

5.27 kB

	import os

	# Install the project found in the current working directory as an editable
	# package every time the app starts. The exit status of the shell command is
	# not checked, so a failed install only surfaces later as an import error.
	# NOTE(review): presumably needed so the local imports below resolve -- confirm.
	os.system("python3 -m pip install -e .")

	import gradio as gr

	import note_seq
	from pytube import YouTube
	from pydub import AudioSegment
	from music21 import converter, environment

	from inferencemodel import InferenceModel
	from utils import upload_audio, create_image_from_note_sequence

	import nest_asyncio
	nest_asyncio.apply()

	# Sample rate handed to upload_audio() and to the fluidsynth renderer.
	SAMPLE_RATE = 16000
	# SoundFont file used when synthesising the transcribed notes.
	SF2_PATH = "SGM-v2.01-Sal-Guit-Bass-V1.3.sf2"

	# Set up music21 with musescore
	us = environment.UserSettings()
	us["musescoreDirectPNGPath"] = "/usr/bin/mscore3"
	# Assign through os.environ rather than os.putenv(): the assignment still
	# updates the process environment, but it also keeps os.environ in sync so
	# code that reads or copies os.environ later sees the same values.
	os.environ["QT_QPA_PLATFORM"] = "offscreen"
	# os.environ only accepts str values, so convert explicitly in case
	# getRootTempDir() returns a path object.
	os.environ["XDG_RUNTIME_DIR"] = str(environment.Environment().getRootTempDir())

	# Start inference model
	inference_model = InferenceModel("/home/user/app/checkpoints/mt3/", "mt3")
	# Name of the model currently held in `inference_model`; change_model()
	# compares against it to skip redundant reloads.
	current_model = "mt3"

	def change_model(model):
	    """Make `model` the active transcription model.

	    Logs the currently loaded model, then rebuilds the module-level
	    `inference_model` from `/home/user/app/checkpoints/<model>/` unless
	    `model` is already the active one. Returns None.
	    """
	    global inference_model, current_model
	    print("Inferece model", inference_model)
	    print("Current model", current_model)
	    # Only reload when the selection actually changed.
	    if model != current_model:
	        inference_model = InferenceModel(
	            f"/home/user/app/checkpoints/{model}/", model
	        )
	        current_model = model

	# Credits https://huggingface.co/spaces/rajesh1729/youtube-video-transcription-with-whisper
	def get_audio(url, start_second):
	    """Download a video's audio track and save a 10-second excerpt as WAV.

	    Args:
	        url: YouTube URL passed to pytube.
	        start_second: Offset, in seconds, where the excerpt starts. `None`
	            (for example an empty number field) is treated as 0.

	    Returns:
	        The path of the exported excerpt, always "final_audio.wav".

	    Raises:
	        ValueError: If the video exposes no audio-only stream.
	    """
	    yt = YouTube(url)
	    stream = yt.streams.filter(only_audio=True).first()
	    if stream is None:
	        # .first() yields None when the filter matches nothing; fail with a
	        # clear message instead of an AttributeError on the next line.
	        raise ValueError(f"No audio-only stream available for {url}")

	    downloaded = stream.download(output_path=".")
	    try:
	        # No rename to ".wav": the download is not WAV data, and pydub
	        # detects the real container when no format is given.
	        full_track = AudioSegment.from_file(downloaded)
	    finally:
	        # Always delete the download, even when decoding fails.
	        os.remove(downloaded)

	    # pydub does things in milliseconds
	    clip_length_ms = 10 * 1000
	    start_ms = int(start_second or 0) * 1000
	    excerpt = full_track[start_ms:start_ms + clip_length_ms]
	    excerpt.export("final_audio.wav", format="wav")
	    return "final_audio.wav"

	# Credits https://huggingface.co/spaces/jeffistyping/Youtube-Whisperer
	def populate_metadata(link, start_second):
	    """Collect the preview data for a YouTube link.

	    Returns a 4-tuple: thumbnail URL, video title, and the excerpt path
	    produced by get_audio() twice (once per output component wired to it).
	    """
	    video = YouTube(link)
	    clip_path = get_audio(link, start_second)
	    thumbnail, video_title = video.thumbnail_url, video.title
	    return thumbnail, video_title, clip_path, clip_path

	def inference(yt_audio_path):
	    """Transcribe the audio file at `yt_audio_path` with the active model.

	    Returns a 4-tuple:
	      * path of the MIDI file written to "./transcribed.mid",
	      * `(SAMPLE_RATE, int16 samples)` of the fluidsynth rendering,
	      * the image built by create_image_from_note_sequence(),
	      * the value returned by music21's write("musicxml.png")
	        (presumably the rendered score's path -- TODO confirm).
	    """
	    midi_path = "./transcribed.mid"

	    with open(yt_audio_path, 'rb') as audio_file:
	        raw_bytes = audio_file.read()

	    samples = upload_audio(raw_bytes, sample_rate=SAMPLE_RATE)
	    note_sequence = inference_model(samples)

	    # Persist the transcription as MIDI; music21 re-reads this file below.
	    note_seq.sequence_proto_to_midi_file(note_sequence, midi_path)

	    rendered = note_seq.midi_synth.fluidsynth(
	        note_sequence, sample_rate=SAMPLE_RATE, sf2_path=SF2_PATH
	    )
	    pcm16 = note_seq.audio_io.float_samples_to_int16(rendered)
	    roll_image = create_image_from_note_sequence(note_sequence)

	    sheet = converter.parse(midi_path).write("musicxml.png")
	    return midi_path, (SAMPLE_RATE, pcm16), roll_image, sheet

	# Heading text rendered at the top of the page.
	title = "Transcribe music from YouTube videos using Transformers."
	# Introductory Markdown shown under the heading.
	description = """
	Gradio demo for Music Transcription with Transformers. Read more in the links below.
	To use this demo, just add a YouTube link with the music you want to transcribe.
	"""
	# Footer HTML with links to the blog post and the repository.
	# The backslash before the pipe is written as an explicit "\\" because "\|"
	# is an invalid escape sequence; the runtime string is unchanged.
	# NOTE(review): confirm the backslash is wanted in the rendered footer at all.
	article = "<p style='text-align: center'><a href='https://magenta.tensorflow.org/transcription-with-transformers' target='_blank'>Blog: Music Transcription with Transformers</a> \\| <a href='https://github.com/magenta/mt3' target='_blank'>Github Repo</a></p>"

	# Build the Gradio Blocks UI: model picker, YouTube preview, transcription outputs.
	# NOTE(review): indentation was lost in this copy of the file, so the nesting
	# of the `with` blocks below (in particular where gr.Box() ends) must be
	# confirmed against the original before editing the layout.
	demo = gr.Blocks()

	# Components created inside this context are attached to `demo`.
	with demo:
	gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>"
	+ title
	+ "</h1>")
	gr.Markdown(description)
	with gr.Box():
	# Label text for the model selector.
	model_label = """
	What kind of model you want to use?
	The ismir2021 model transcribes piano only, with note velocities.
	The mt3 model transcribes multiple simultaneous instruments, but without velocities.
	"""
	model = gr.Radio(
	["mt3", "ismir2021"],
	label=model_label,
	value="mt3"
	)
	# Changing the selection calls change_model() with the chosen name; no outputs.
	model.change(fn=change_model, inputs=model, outputs=[])

	# Inputs: the video link and the excerpt's start offset in seconds.
	with gr.Row():
	link = gr.Textbox(label="YouTube Link")
	start_second = gr.Number(label="Select starting point (in seconds) for the transcription",
	value=0,
	precision=0)
	with gr.Row():
	preview_btn = gr.Button("Preview")
	with gr.Row().style(mobile_collapse=False, equal_height=True):
	# NOTE(review): this rebinds the module-level `title` string to a Label
	# component; the heading above has already used the string by this point.
	title = gr.Label(label="Video Title", placeholder="Title")
	img = gr.Image(label="Thumbnail")
	with gr.Row():
	yt_audio = gr.Audio()
	# Hidden textbox that carries the excerpt's file path to inference().
	yt_audio_path = gr.Textbox(visible=False)

	# Preview: populate_metadata() fills thumbnail, title, audio player and hidden path.
	preview_btn.click(fn=populate_metadata,
	inputs=[link, start_second],
	outputs=[img, title, yt_audio, yt_audio_path])

	with gr.Row():
	btn = gr.Button("Transcribe music")

	# Output components, in the same order as inference()'s return tuple.
	with gr.Row():
	midi_file = gr.File()
	midi_audio = gr.Audio()
	with gr.Row():
	piano_roll = gr.Image()
	score = gr.Image()
	# Transcribe: run inference() on the hidden excerpt path.
	btn.click(inference,
	inputs=yt_audio_path,
	outputs=[midi_file, midi_audio, piano_roll, score])

	gr.Markdown(article)


	# Start serving the app.
	demo.launch()