juancopi81 committed on
Commit
d9489e4
1 Parent(s): 7c17274

Add plotting function

Browse files
Files changed (2) hide show
  1. app.py +8 -5
  2. utils.py +43 -1
app.py CHANGED
@@ -11,6 +11,9 @@ from pydub import AudioSegment
11
  from inferencemodel import InferenceModel
12
  from utils import upload_audio
13
 
 
 
 
14
  SAMPLE_RATE = 16000
15
  SF2_PATH = "SGM-v2.01-Sal-Guit-Bass-V1.3.sf2"
16
 
@@ -20,13 +23,14 @@ current_model = "mt3"
20
 
21
  def change_model(model):
22
  global current_model
 
23
  checkpoint_path = f"/home/user/app/checkpoints/{model}/"
24
  if model == current_model:
25
  return
26
- global inference_model
27
  inference_model = InferenceModel(checkpoint_path, model)
28
  current_model = model
29
  print("Inferece model", inference_model)
 
30
 
31
  # Credits https://huggingface.co/spaces/rajesh1729/youtube-video-transcription-with-whisper
32
  def get_audio(url):
@@ -37,7 +41,6 @@ def get_audio(url):
37
  new_file = base + ".wav"
38
  os.rename(out_file, new_file)
39
  a = new_file
40
- print("file a is:", a)
41
  wav_to_cut = AudioSegment.from_file(a)
42
  # pydub does things in milliseconds
43
  ten_seconds = 10 * 1000
@@ -53,17 +56,17 @@ def populate_metadata(link):
53
  return yt.thumbnail_url, yt.title, audio
54
 
55
  def inference(yt_audio):
56
- with open(yt_audio, "rb") as fd:
57
  contents = fd.read()
58
 
59
- audio = upload_audio(contents,sample_rate=16000)
60
 
61
  est_ns = inference_model(audio)
62
 
63
  note_seq.sequence_proto_to_midi_file(est_ns, "./transcribed.mid")
64
  note_sequence = note_seq.midi_to_note_sequence("./transcribed.mid")
65
  synth = note_seq.midi_synth.fluidsynth
66
- array_of_floats = synth(note_sequence, sample_rate=SAMPLE_RATE)
67
  int16_data = note_seq.audio_io.float_samples_to_int16(array_of_floats)
68
  # piano_roll = create_image_from_note_sequence(note_sequence)
69
 
 
11
  from inferencemodel import InferenceModel
12
  from utils import upload_audio
13
 
14
+ import nest_asyncio
15
+ nest_asyncio.apply()
16
+
17
  SAMPLE_RATE = 16000
18
  SF2_PATH = "SGM-v2.01-Sal-Guit-Bass-V1.3.sf2"
19
 
 
23
 
24
def change_model(model):
    """Switch the globally active transcription model.

    Re-instantiates the ``InferenceModel`` only when the requested model
    differs from the one currently loaded, since loading a checkpoint is
    expensive.

    Args:
        model: Name of the model checkpoint directory (e.g. ``"mt3"``).
    """
    global current_model
    global inference_model
    # Nothing to do if the requested model is already active; return before
    # doing any path/instantiation work.
    if model == current_model:
        return
    checkpoint_path = f"/home/user/app/checkpoints/{model}/"
    inference_model = InferenceModel(checkpoint_path, model)
    current_model = model
    print("Inference model", inference_model)  # fixed typo: was "Inferece"
    print("Current model", current_model)
34
 
35
  # Credits https://huggingface.co/spaces/rajesh1729/youtube-video-transcription-with-whisper
36
  def get_audio(url):
 
41
  new_file = base + ".wav"
42
  os.rename(out_file, new_file)
43
  a = new_file
 
44
  wav_to_cut = AudioSegment.from_file(a)
45
  # pydub does things in milliseconds
46
  ten_seconds = 10 * 1000
 
56
  return yt.thumbnail_url, yt.title, audio
57
 
58
  def inference(yt_audio):
59
+ with open(yt_audio[1], "rb") as fd:
60
  contents = fd.read()
61
 
62
+ audio = upload_audio(contents,sample_rate=SAMPLE_RATE)
63
 
64
  est_ns = inference_model(audio)
65
 
66
  note_seq.sequence_proto_to_midi_file(est_ns, "./transcribed.mid")
67
  note_sequence = note_seq.midi_to_note_sequence("./transcribed.mid")
68
  synth = note_seq.midi_synth.fluidsynth
69
+ array_of_floats = synth(note_sequence, sample_rate=44100)
70
  int16_data = note_seq.audio_io.float_samples_to_int16(array_of_floats)
71
  # piano_roll = create_image_from_note_sequence(note_sequence)
72
 
utils.py CHANGED
@@ -1,8 +1,14 @@
1
 
2
  import tempfile
 
3
 
4
  import librosa
5
 
 
 
 
 
 
6
  class AudioIOReadError(BaseException): # pylint:disable=g-bad-exception-name
7
  pass
8
 
@@ -51,4 +57,40 @@ def load_audio(audio_filename, sample_rate, duration=10):
51
  y, unused_sr = librosa.load(audio_filename, sr=sample_rate, mono=True, duration=duration)
52
  except Exception as e: # pylint: disable=broad-except
53
  raise AudioIOReadError(e)
54
- return y
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
  import tempfile
3
+ import collections
4
 
5
  import librosa
6
 
7
+ import pandas as pd
8
+ import matplotlib.pyplot as plt
9
+ from matplotlib.patches import Rectangle
10
+ from PIL import Image
11
+
12
  class AudioIOReadError(BaseException): # pylint:disable=g-bad-exception-name
13
  pass
14
 
 
57
  y, unused_sr = librosa.load(audio_filename, sr=sample_rate, mono=True, duration=duration)
58
  except Exception as e: # pylint: disable=broad-except
59
  raise AudioIOReadError(e)
60
+ return y
61
+
62
+ # Generate piano_roll
63
def sequence_to_pandas_dataframe(sequence):
    """Flatten a NoteSequence's notes into a pandas DataFrame.

    Args:
        sequence: A NoteSequence-like object whose ``notes`` each expose
            ``start_time``, ``end_time`` and ``pitch``.

    Returns:
        DataFrame with columns ``start_time``, ``end_time``, ``duration``
        and ``pitch`` — one row per note.
    """
    columns = ("start_time", "end_time", "duration", "pitch")
    pd_dict = collections.defaultdict(list)
    for note in sequence.notes:
        values = (
            note.start_time,
            note.end_time,
            note.end_time - note.start_time,
            note.pitch,
        )
        for column, value in zip(columns, values):
            pd_dict[column].append(value)
    return pd.DataFrame(pd_dict)
72
+
73
def dataframe_to_pianoroll_img(df):
    """Draw a piano-roll figure from a note DataFrame.

    Args:
        df: DataFrame with ``start_time``, ``pitch`` and ``duration``
            columns (one row per note), as produced by
            ``sequence_to_pandas_dataframe``.

    Returns:
        A matplotlib Figure containing one black rectangle per note.
    """
    fig = plt.figure(figsize=(8, 5))
    ax = fig.add_subplot(111)
    # White (invisible) scatter makes matplotlib autoscale the axes to the
    # full extent of the notes before the rectangles are added.
    ax.scatter(df.start_time, df.pitch, c="white")
    for _, row in df.iterrows():
        ax.add_patch(
            Rectangle(
                (row["start_time"], row["pitch"] - 0.4),
                row["duration"],
                0.4,
                color="black",
            )
        )
    # Label through the Axes object rather than the pyplot state machine, so
    # the function does not depend on which figure happens to be "current".
    ax.set_xlabel("time (sec.)", fontsize=18)
    ax.set_ylabel("pitch (MIDI)", fontsize=16)
    return fig
82
+
83
def fig2img(fig):
    """Convert a Matplotlib figure to a PIL Image and return it.

    The figure is closed after rendering so that repeated calls (e.g. one
    per request in a long-running app) do not accumulate open figures.

    Args:
        fig: The matplotlib Figure to rasterize.

    Returns:
        A PIL Image holding the PNG rendering of ``fig``.
    """
    import io
    buf = io.BytesIO()
    fig.savefig(buf, format="png")
    buf.seek(0)
    img = Image.open(buf)
    # PIL opens images lazily; force the PNG to be decoded now so the image
    # is fully usable after the figure is released below.
    img.load()
    # Fix: the original leaked one open figure per call (matplotlib keeps a
    # global reference until plt.close is called).
    plt.close(fig)
    return img
91
+
92
def create_image_from_note_sequence(sequence):
    """Render a NoteSequence as a piano-roll PIL Image.

    Pipeline: notes -> DataFrame -> matplotlib figure -> PIL Image.
    """
    piano_roll_fig = dataframe_to_pianoroll_img(
        sequence_to_pandas_dataframe(sequence)
    )
    return fig2img(piano_roll_fig)