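# NOTE: the next six lines are hosting-page residue captured with this file
# (avatar caption, commit message, commit hash, page links, file size).
# They are not part of the program.
# juancopi81's picture
# Use of Tableau color
# 11e63a0
# raw history blame
# No virus
# 4.33 kB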
import os
os.system("python3 -m pip install -e .")
import gradio as gr
import note_seq
from pytube import YouTube
from pydub import AudioSegment
from inferencemodel import InferenceModel
from utils import upload_audio, create_image_from_note_sequence
import nest_asyncio
nest_asyncio.apply()
# Sample rate handed to upload_audio and to the fluidsynth renderer, and
# returned alongside the synthesized audio (presumably Hz — TODO confirm).
SAMPLE_RATE = 16000
# SoundFont file passed as sf2_path when synthesizing the transcription.
SF2_PATH = "SGM-v2.01-Sal-Guit-Bass-V1.3.sf2"
# Start inference model
# Built once at module load from a hard-coded checkpoint directory;
# change_model replaces this object when a different model is selected.
inference_model = InferenceModel("/home/user/app/checkpoints/mt3/", "mt3")
# Name of the model currently held in inference_model; change_model compares
# against it to skip redundant reloads.
current_model = "mt3"
def change_model(model):
    """Switch the module-level inference model to ``model``.

    Does nothing when ``model`` is already the active one. Otherwise a new
    ``InferenceModel`` is built from ``/home/user/app/checkpoints/<model>/``
    and stored in the ``inference_model`` global, and ``current_model`` is
    updated to match.

    Args:
        model: Name of the model to activate (the UI radio offers "mt3" and
            "ismir2021").

    Returns:
        None.
    """
    global current_model
    global inference_model
    print("Inference model", inference_model)
    print("Current model", current_model)
    if model == current_model:
        # Already loaded; avoid rebuilding the model.
        return
    checkpoint_path = f"/home/user/app/checkpoints/{model}/"
    # Assign current_model only after construction succeeds so the two
    # globals never disagree if InferenceModel raises.
    inference_model = InferenceModel(checkpoint_path, model)
    current_model = model
# Credits https://huggingface.co/spaces/rajesh1729/youtube-video-transcription-with-whisper
def get_audio(url, clip_seconds=10):
    """Download a YouTube video's audio and keep only its opening seconds.

    The first audio-only stream reported by pytube is downloaded into the
    current directory, decoded with pydub, trimmed to ``clip_seconds`` and
    exported as ``final_audio.wav``. The downloaded source file is always
    removed, even when decoding fails.

    Args:
        url: YouTube video URL handed to pytube.
        clip_seconds: Length of the leading excerpt to keep, in seconds.

    Returns:
        The string "final_audio.wav", the path of the exported clip.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    stream = YouTube(url).streams.filter(only_audio=True).first()
    if stream is None:
        # first() yields None on an empty query; fail with a clear message
        # instead of an AttributeError on the next line.
        raise ValueError(f"No audio-only stream available for {url!r}")
    downloaded = stream.download(output_path=".")
    try:
        # The file is decoded as downloaded: renaming it to ".wav" would not
        # convert it. pydub slices in milliseconds.
        clip = AudioSegment.from_file(downloaded)[: int(clip_seconds * 1000)]
    finally:
        os.remove(downloaded)
    # NOTE(review): the fixed output name is shared by every request, so
    # concurrent users would overwrite each other's clip.
    clip.export("final_audio.wav", format="wav")
    return "final_audio.wav"
# Credits https://huggingface.co/spaces/jeffistyping/Youtube-Whisperer
def populate_metadata(link):
    """Collect the values shown in the UI for a YouTube link.

    Args:
        link: YouTube video URL.

    Returns:
        A 4-tuple ``(thumbnail_url, title, clip_path, clip_path)``; the clip
        path returned by ``get_audio`` appears twice.
    """
    video = YouTube(link)
    clip_path = get_audio(link)
    thumbnail = video.thumbnail_url
    video_title = video.title
    return thumbnail, video_title, clip_path, clip_path
def inference(yt_audio_path):
    """Transcribe an audio file and return MIDI, rendered audio and an image.

    The file's bytes are passed to ``upload_audio`` at ``SAMPLE_RATE`` and the
    result is fed to the module-level ``inference_model``. The model output is
    written to ``./transcribed.mid``, rendered with fluidsynth using
    ``SF2_PATH``, converted to int16 samples, and drawn with
    ``create_image_from_note_sequence``.

    Args:
        yt_audio_path: Path of the audio file to transcribe.

    Returns:
        A 3-tuple ``(midi_path, (SAMPLE_RATE, int16_samples), image)``.
    """
    with open(yt_audio_path, 'rb') as audio_file:
        raw_bytes = audio_file.read()
    samples = upload_audio(raw_bytes, sample_rate=SAMPLE_RATE)
    note_sequence = inference_model(samples)

    midi_path = "./transcribed.mid"
    note_seq.sequence_proto_to_midi_file(note_sequence, midi_path)

    rendered = note_seq.midi_synth.fluidsynth(
        note_sequence, sample_rate=SAMPLE_RATE, sf2_path=SF2_PATH
    )
    pcm_samples = note_seq.audio_io.float_samples_to_int16(rendered)
    roll_image = create_image_from_note_sequence(note_sequence)
    return midi_path, (SAMPLE_RATE, pcm_samples), roll_image
# Page heading; concatenated into the <h1> Markdown at the top of the UI.
# NOTE: the name `title` is later rebound to a gr.Label inside `with demo:`.
title = "Transcribe music from YouTube videos using Transformers."
# Short blurb rendered with gr.Markdown.
description = """
Gradio demo for Music Transcription with Transformers. Read more in the links below.
"""
# HTML footer linking to the blog post and the GitHub repository; rendered
# with gr.Markdown.
article = "<p style='text-align: center'><a href='https://magenta.tensorflow.org/transcription-with-transformers' target='_blank'>Blog: Music Transcription with Transformers</a> | <a href='https://github.com/magenta/mt3' target='_blank'>Github Repo</a></p>"
# Create a block object
demo = gr.Blocks()
# Use your Block object as a context
with demo:
# Page heading built from the module-level `title` string, then the blurb.
gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>"
+ title
+ "</h1>")
gr.Markdown(description)
with gr.Box():
# Label text for the model selector radio.
model_label = """
What kind of model you want to use?
The ismir2021 model transcribes piano only, with note velocities.
The mt3 model transcribes multiple simultaneous instruments, but without velocities.
"""
model = gr.Radio(
["mt3", "ismir2021"],
label=model_label,
value="mt3"
)
# Call change_model with the new selection; no components are updated.
model.change(fn=change_model, inputs=model, outputs=[])
link = gr.Textbox(label="YouTube Link")
with gr.Row().style(mobile_collapse=False, equal_height=True):
# NOTE: rebinds `title` (previously the heading string, already used above).
title = gr.Label(label="Video Title", placeholder="Title")
img = gr.Image(label="Thumbnail")
with gr.Row():
yt_audio = gr.Audio()
# Hidden textbox that carries the clip's file path to inference().
yt_audio_path = gr.Textbox(visible=False)
# populate_metadata returns (thumbnail_url, title, audio, audio), mapped onto
# these four outputs in order.
link.change(fn=populate_metadata, inputs=link, outputs=[img, title, yt_audio, yt_audio_path])
with gr.Row():
btn = gr.Button("Transcribe music")
with gr.Row():
midi_file = gr.File()
midi_audio = gr.Audio()
with gr.Row():
piano_roll = gr.Image()
# inference returns (MIDI path, (sample rate, samples), piano-roll image),
# mapped onto the three outputs in order.
btn.click(inference,
inputs=yt_audio_path,
outputs=[midi_file, midi_audio, piano_roll])
gr.Markdown(article)
# Start the Gradio app.
demo.launch()