import os
import subprocess

# Install the local package (MT3 and its helpers) before importing from it
# below. Hugging Face Spaces runs this file directly, so the editable install
# happens at startup. subprocess.run with an argument list avoids the shell
# string that os.system uses and surfaces failures in the build logs.
subprocess.run(["python3", "-m", "pip", "install", "-e", "."], check=False)

import gradio as gr
import note_seq
from pytube import YouTube
from pydub import AudioSegment
from music21 import converter
from inferencemodel import InferenceModel
from utils import upload_audio, create_image_from_note_sequence

import nest_asyncio
nest_asyncio.apply()

# Sample rate (Hz) the transcription model expects and the synth renders at.
SAMPLE_RATE = 16000
# SoundFont used by fluidsynth to render the transcribed MIDI back to audio.
SF2_PATH = "SGM-v2.01-Sal-Guit-Bass-V1.3.sf2"

# Start inference model ("mt3" = multi-instrument checkpoint; see change_model).
inference_model = InferenceModel("/home/user/app/checkpoints/mt3/", "mt3")
current_model = "mt3"
def change_model(model):
    """Reload the global inference model for the selected checkpoint.

    Args:
        model: Checkpoint name, one of "mt3" or "ismir2021".

    No-op when the requested checkpoint is already loaded.
    """
    global current_model
    global inference_model
    print("Inference model", inference_model)
    print("Current model", current_model)
    if model == current_model:
        return
    # Only build the path and reload when the selection actually changed.
    checkpoint_path = f"/home/user/app/checkpoints/{model}/"
    inference_model = InferenceModel(checkpoint_path, model)
    current_model = model
# Credits https://huggingface.co/spaces/rajesh1729/youtube-video-transcription-with-whisper
def get_audio(url):
    """Download a YouTube video's audio track and trim it to 10 seconds.

    Args:
        url: YouTube video URL.

    Returns:
        Path to "final_audio.wav", containing the first 10 seconds of the
        video's audio.
    """
    yt = YouTube(url)
    video = yt.streams.filter(only_audio=True).first()
    out_file = video.download(output_path=".")
    # NOTE(review): this renames the downloaded container (e.g. .mp4) to .wav
    # without transcoding; pydub's ffmpeg backend still sniffs the real format,
    # so from_file works in practice — confirm before changing extensions.
    base, _ext = os.path.splitext(out_file)
    new_file = base + ".wav"
    os.rename(out_file, new_file)
    wav_to_cut = AudioSegment.from_file(new_file)
    # pydub does things in milliseconds
    ten_seconds = 10 * 1000
    first_10_seconds = wav_to_cut[:ten_seconds]
    # Drop the full-length download; only the 10-second clip is kept.
    os.remove(new_file)
    first_10_seconds.export("final_audio.wav", format="wav")
    return "final_audio.wav"
# Credits https://huggingface.co/spaces/jeffistyping/Youtube-Whisperer
def populate_metadata(link):
    """Return (thumbnail_url, title, audio, audio) for a YouTube link.

    The trimmed audio path is returned twice because the UI feeds it to
    both an Audio player and a hidden Textbox.
    """
    video = YouTube(link)
    audio_path = get_audio(link)
    return video.thumbnail_url, video.title, audio_path, audio_path
def inference(yt_audio_path):
    """Transcribe an audio file to MIDI plus derived artifacts.

    Args:
        yt_audio_path: Path to the input audio file to transcribe.

    Returns:
        Tuple of (MIDI file path,
                  (SAMPLE_RATE, int16 ndarray) of the resynthesized audio,
                  piano-roll image,
                  path of the engraved score image).
    """
    with open(yt_audio_path, "rb") as fd:
        contents = fd.read()

    audio = upload_audio(contents, sample_rate=SAMPLE_RATE)

    est_ns = inference_model(audio)

    note_seq.sequence_proto_to_midi_file(est_ns, "./transcribed.mid")

    # Render the estimated note sequence back to audio for playback.
    synth = note_seq.midi_synth.fluidsynth
    array_of_floats = synth(est_ns, sample_rate=SAMPLE_RATE, sf2_path=SF2_PATH)
    int16_data = note_seq.audio_io.float_samples_to_int16(array_of_floats)

    piano_roll = create_image_from_note_sequence(est_ns)

    # Engrave the MIDI as sheet music via music21 (writes a "score*.png").
    parsed = converter.parse("./transcribed.mid")
    score = parsed.write("musicxml.png", fp="score")

    return "./transcribed.mid", (SAMPLE_RATE, int16_data), piano_roll, score
# Static copy rendered in the Gradio interface below.
title = "Transcribe music from YouTube videos using Transformers."
description = """
Gradio demo for Music Transcription with Transformers. Read more in the links below.
To use this demo, just add a YouTube link with the music you want to transcribe.
"""
# Footer links (blog post and upstream MT3 repository).
article = "<p style='text-align: center'><a href='https://magenta.tensorflow.org/transcription-with-transformers' target='_blank'>Blog: Music Transcription with Transformers</a> | <a href='https://github.com/magenta/mt3' target='_blank'>Github Repo</a></p>"
# Create a block object
demo = gr.Blocks()

# Use your Block object as a context
# NOTE(review): gr.Box(), Row().style(...), and Label(placeholder=...) are
# legacy gradio 3.x APIs — pin the gradio version before upgrading.
# NOTE(review): nesting reconstructed after indentation loss — verify layout.
with demo:
    gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>"
                + title
                + "</h1>")
    gr.Markdown(description)
    with gr.Box():
        model_label = """
What kind of model you want to use?
The ismir2021 model transcribes piano only, with note velocities.
The mt3 model transcribes multiple simultaneous instruments, but without velocities.
"""
        model = gr.Radio(
            ["mt3", "ismir2021"],
            label=model_label,
            value="mt3"
        )
        # Reloads the global inference model when the radio selection changes.
        model.change(fn=change_model, inputs=model, outputs=[])

        link = gr.Textbox(label="YouTube Link")
        with gr.Row().style(mobile_collapse=False, equal_height=True):
            # Rebinds the module-level `title` string (already consumed above)
            # to a UI component — intentional reuse, but easy to trip over.
            title = gr.Label(label="Video Title", placeholder="Title")
            img = gr.Image(label="Thumbnail")
        with gr.Row():
            yt_audio = gr.Audio()
            # Hidden textbox carries the audio file path into inference().
            yt_audio_path = gr.Textbox(visible=False)
        # Typing a link populates thumbnail, title, and trimmed audio.
        link.change(fn=populate_metadata, inputs=link, outputs=[img, title, yt_audio, yt_audio_path])
        with gr.Row():
            btn = gr.Button("Transcribe music")
        with gr.Row():
            midi_file = gr.File()
            midi_audio = gr.Audio()
        with gr.Row():
            piano_roll = gr.Image()
            score = gr.Image()
        btn.click(inference,
                  inputs=yt_audio_path,
                  outputs=[midi_file, midi_audio, piano_roll, score])
    gr.Markdown(article)
demo.launch()