import os

# Install the local package (editable) at startup so its modules are importable.
os.system("python3 -m pip install -e .")
import gradio as gr
import note_seq
from pytube import YouTube
from pydub import AudioSegment
from inferencemodel import InferenceModel
from utils import upload_audio
import nest_asyncio

# Patch asyncio so an event loop can be re-entered from within the already
# running application loop.
nest_asyncio.apply()
SAMPLE_RATE = 16000
SF2_PATH = "SGM-v2.01-Sal-Guit-Bass-V1.3.sf2"
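# The MT3 checkpoints expect 16 kHz input audio, hence SAMPLE_RATE = 16000.
# SF2_PATH is a SoundFont for fluidsynth, used below when rendering the
# transcribed MIDI back to audio for preview.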
# Load the default model (the mt3 checkpoint) via the Space's InferenceModel wrapper.
inference_model = InferenceModel("/home/user/app/checkpoints/mt3/", "mt3")
current_model = "mt3"
def change_model(model):
    """Reload the global inference model when the user picks a different checkpoint."""
    global current_model
    global inference_model
    if model == current_model:
        return
    checkpoint_path = f"/home/user/app/checkpoints/{model}/"
    inference_model = InferenceModel(checkpoint_path, model)
    current_model = model
    print("Inference model", inference_model)
    print("Current model", current_model)
# Credits https://huggingface.co/spaces/rajesh1729/youtube-video-transcription-with-whisper
def get_audio(url):
    yt = YouTube(url)
    video = yt.streams.filter(only_audio=True).first()
    out_file = video.download(output_path=".")
    base, ext = os.path.splitext(out_file)
    new_file = base + ".wav"
    os.rename(out_file, new_file)
    wav_to_cut = AudioSegment.from_file(new_file)
    # pydub does things in milliseconds
    ten_seconds = 10 * 1000
    first_10_seconds = wav_to_cut[:ten_seconds]
    os.remove(new_file)
    first_10_seconds.export("final_audio.wav", format="wav")
    return "final_audio.wav"
# Credits https://huggingface.co/spaces/jeffistyping/Youtube-Whisperer
def populate_metadata(link):
    yt = YouTube(link)
    audio = get_audio(link)
    return yt.thumbnail_url, yt.title, audio
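# Returns (thumbnail_url, title, audio_path), matching the [img, video_title,
# yt_audio] outputs it is wired to below.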
def inference(yt_audio):
    # yt_audio is a filepath string (gr.Audio is created with type="filepath").
    with open(yt_audio, "rb") as fd:
        contents = fd.read()
    audio = upload_audio(contents, sample_rate=SAMPLE_RATE)
    est_ns = inference_model(audio)
    note_seq.sequence_proto_to_midi_file(est_ns, "./transcribed.mid")
    # Render the estimated NoteSequence with fluidsynth for browser playback.
    synth = note_seq.midi_synth.fluidsynth
    array_of_floats = synth(est_ns, sample_rate=44100, sf2_path=SF2_PATH)
    int16_data = note_seq.audio_io.float_samples_to_int16(array_of_floats)
    # piano_roll = create_image_from_note_sequence(note_sequence)
    # Return the synthesis rate (44100 Hz), not SAMPLE_RATE, so playback speed is correct.
    return "./transcribed.mid", (44100, int16_data)
title = "Transcribe music from YouTube videos using Transformers."
description = """
Gradio demo for Music Transcription with Transformers. Read more in the links below.
"""
article = "<p style='text-align: center'><a href='https://magenta.tensorflow.org/transcription-with-transformers' target='_blank'>Blog: Music Transcription with Transformers</a> | <a href='https://github.com/magenta/mt3' target='_blank'>Github Repo</a></p>"
# Create a block object
demo = gr.Blocks()
# Use your Block object as a context
with demo:
    gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>"
                + title
                + "</h1>")
    gr.Markdown(description)
    with gr.Box():
        model_label = """
        Which model do you want to use?
        The ismir2021 model transcribes piano only, with note velocities.
        The mt3 model transcribes multiple simultaneous instruments, but without velocities.
        """
        model = gr.Radio(
            ["mt3"],
            label=model_label,
            value="mt3"
        )
        model.change(fn=change_model, inputs=model, outputs=[])
    link = gr.Textbox(label="YouTube Link")
    with gr.Row().style(mobile_collapse=False, equal_height=True):
        video_title = gr.Label(label="Video Title", placeholder="Title")
        img = gr.Image(label="Thumbnail")
    with gr.Row():
        yt_audio = gr.Audio(type="filepath", label="First 10 seconds")
    link.change(fn=populate_metadata, inputs=link, outputs=[img, video_title, yt_audio])
    with gr.Row():
        btn = gr.Button("Transcribe music")
    with gr.Row():
        midi_file = gr.File()
        midi_audio = gr.Audio()
    # Pass the function and its input to click(); calling inference(...) here
    # would run the transcription once at build time instead of on each press.
    btn.click(fn=inference,
              inputs=yt_audio,
              outputs=[midi_file, midi_audio])
    gr.Markdown(article)

demo.launch()
""" gr.Interface(
inference,
gr.inputs.Audio(type="filepath", label="Input"),
[gr.outputs.File(label="Output")],
title=title,
description=description,
article=article,
examples=examples,
).launch().queue() """ |