import os

import numpy as np
import audioread
import librosa
from mido import MidiFile

from piano_vad import (note_detection_with_onset_offset_regress,
    pedal_detection_with_onset_offset_regress)

import config


def create_folder(fd):
    if not os.path.exists(fd):
        os.makedirs(fd)


def get_filename(path):
    path = os.path.realpath(path)
    na_ext = path.split('/')[-1]
    na = os.path.splitext(na_ext)[0]
    return na


def note_to_freq(piano_note):
    return 2 ** ((piano_note - 39) / 12) * 440


def float32_to_int16(x):
    assert np.max(np.abs(x)) <= 1.
    return (x * 32767.).astype(np.int16)


def int16_to_float32(x):
    return (x / 32767.).astype(np.float32)
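# Worked example of the int16 <-> float32 helpers above (values computed by hand,
# not part of the original module): float32_to_int16(np.array([0., 0.5, -1.]))
# yields [0, 16383, -32767] as int16 (0.5 * 32767 = 16383.5 truncates to 16383),
# and int16_to_float32 maps that back to approximately [0., 0.5, -1.].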
def pad_truncate_sequence(x, max_len):
    if len(x) < max_len:
        return np.concatenate((x, np.zeros(max_len - len(x))))
    else:
        return x[0 : max_len]
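# Usage sketch (hand-made example, not from the original module): sequences shorter
# than max_len are right-padded with zeros and longer ones are truncated, e.g.
# pad_truncate_sequence(np.array([1., 2., 3.]), 5) gives [1., 2., 3., 0., 0.] and
# pad_truncate_sequence(np.arange(10.), 5) gives [0., 1., 2., 3., 4.].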
def read_midi(midi_path):
    """Parse MIDI file.

    Args:
      midi_path: str

    Returns:
      midi_dict: dict, e.g. {
        'midi_event': [
            'program_change channel=0 program=0 time=0',
            'control_change channel=0 control=64 value=127 time=0',
            'control_change channel=0 control=64 value=63 time=236',
            ...],
        'midi_event_time': [0., 0, 0.98307292, ...]}
    """
    midi_file = MidiFile(midi_path)
    ticks_per_beat = midi_file.ticks_per_beat

    assert len(midi_file.tracks) == 2
    """The first track contains tempo, time signature. The second track
    contains piano events."""

    microseconds_per_beat = midi_file.tracks[0][0].tempo
    beats_per_second = 1e6 / microseconds_per_beat
    ticks_per_second = ticks_per_beat * beats_per_second

    message_list = []

    ticks = 0
    time_in_second = []

    for message in midi_file.tracks[1]:
        message_list.append(str(message))
        ticks += message.time
        time_in_second.append(ticks / ticks_per_second)

    midi_dict = {
        'midi_event': np.array(message_list),
        'midi_event_time': np.array(time_in_second)}

    return midi_dict
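# Usage sketch (the file name is hypothetical, not part of the original module):
# >>> midi_dict = read_midi('example_performance.mid')
# >>> midi_dict['midi_event'][:2]       # mido message strings from track 1
# >>> midi_dict['midi_event_time'][:2]  # cumulative event times in seconds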
def write_events_to_midi(start_time, note_events, pedal_events, midi_path):
    """Write out note and pedal events to a MIDI file.

    Args:
      start_time: float
      note_events: list of dict, e.g. [
        {'midi_note': 51, 'onset_time': 696.63544, 'offset_time': 696.9948, 'velocity': 44},
        {'midi_note': 58, 'onset_time': 696.99585, 'offset_time': 697.18646, 'velocity': 50}
        ...]
      pedal_events: list of dict or None, e.g. [
        {'onset_time': 0.17, 'offset_time': 0.96},
        ...]
      midi_path: str
    """
    from mido import Message, MidiFile, MidiTrack, MetaMessage

    # This configuration matches the MIDI files in the MAESTRO dataset
    ticks_per_beat = 384
    beats_per_second = 2
    ticks_per_second = ticks_per_beat * beats_per_second
    microseconds_per_beat = int(1e6 // beats_per_second)

    midi_file = MidiFile()
    midi_file.ticks_per_beat = ticks_per_beat

    # Track 0: tempo and time signature
    track0 = MidiTrack()
    track0.append(MetaMessage('set_tempo', tempo=microseconds_per_beat, time=0))
    track0.append(MetaMessage('time_signature', numerator=4, denominator=4, time=0))
    track0.append(MetaMessage('end_of_track', time=1))
    midi_file.tracks.append(track0)

    # Track 1: note and pedal messages
    track1 = MidiTrack()

    # Collect all note and pedal messages into a single roll
    message_roll = []

    for note_event in note_events:
        # Onset
        message_roll.append({
            'time': note_event['onset_time'],
            'midi_note': note_event['midi_note'],
            'velocity': note_event['velocity']})

        # Offset
        message_roll.append({
            'time': note_event['offset_time'],
            'midi_note': note_event['midi_note'],
            'velocity': 0})

    if pedal_events:
        for pedal_event in pedal_events:
            message_roll.append({'time': pedal_event['onset_time'], 'control_change': 64, 'value': 127})
            message_roll.append({'time': pedal_event['offset_time'], 'control_change': 64, 'value': 0})

    # Sort MIDI messages by time
    message_roll.sort(key=lambda note_event: note_event['time'])

    previous_ticks = 0
    for message in message_roll:
        this_ticks = int((message['time'] - start_time) * ticks_per_second)
        if this_ticks >= 0:
            diff_ticks = this_ticks - previous_ticks
            previous_ticks = this_ticks
            if 'midi_note' in message.keys():
                track1.append(Message('note_on', note=message['midi_note'], velocity=message['velocity'], time=diff_ticks))
            elif 'control_change' in message.keys():
                track1.append(Message('control_change', channel=0, control=message['control_change'], value=message['value'], time=diff_ticks))

    track1.append(MetaMessage('end_of_track', time=1))
    midi_file.tracks.append(track1)

    midi_file.save(midi_path)
class RegressionPostProcessor(object):
    def __init__(self, frames_per_second, classes_num, onset_threshold,
        offset_threshold, frame_threshold, pedal_offset_threshold):
        """Postprocess the output probabilities of a transcription model to MIDI
        events.

        Args:
          frames_per_second: int
          classes_num: int
          onset_threshold: float
          offset_threshold: float
          frame_threshold: float
          pedal_offset_threshold: float
        """
        self.frames_per_second = frames_per_second
        self.classes_num = classes_num
        self.onset_threshold = onset_threshold
        self.offset_threshold = offset_threshold
        self.frame_threshold = frame_threshold
        self.pedal_offset_threshold = pedal_offset_threshold
        self.begin_note = config.begin_note
        self.velocity_scale = config.velocity_scale

    def output_dict_to_midi_events(self, output_dict):
        """Main function. Post process model outputs to MIDI events.

        Args:
          output_dict: {
            'reg_onset_output': (segment_frames, classes_num),
            'reg_offset_output': (segment_frames, classes_num),
            'frame_output': (segment_frames, classes_num),
            'velocity_output': (segment_frames, classes_num),
            'reg_pedal_onset_output': (segment_frames, 1),
            'reg_pedal_offset_output': (segment_frames, 1),
            'pedal_frame_output': (segment_frames, 1)}

        Outputs:
          est_note_events: list of dict, e.g. [
            {'onset_time': 39.74, 'offset_time': 39.87, 'midi_note': 27, 'velocity': 83},
            {'onset_time': 11.98, 'offset_time': 12.11, 'midi_note': 33, 'velocity': 88}]

          est_pedal_events: list of dict, e.g. [
            {'onset_time': 0.17, 'offset_time': 0.96},
            {'onset_time': 1.17, 'offset_time': 2.65}]
        """
        # Post process piano note outputs to piano note and pedal events information
        (est_on_off_note_vels, est_pedal_on_offs) = \
            self.output_dict_to_note_pedal_arrays(output_dict)
        """est_on_off_note_vels: (events_num, 4), the four columns are: [onset_time, offset_time, piano_note, velocity],
        est_pedal_on_offs: (pedal_events_num, 2), the two columns are: [onset_time, offset_time]"""

        # Reformat notes to MIDI events
        est_note_events = self.detected_notes_to_events(est_on_off_note_vels)

        if est_pedal_on_offs is None:
            est_pedal_events = None
        else:
            est_pedal_events = self.detected_pedals_to_events(est_pedal_on_offs)

        return est_note_events, est_pedal_events
    def output_dict_to_note_pedal_arrays(self, output_dict):
        """Postprocess the output probabilities of a transcription model to note
        and pedal arrays.

        Args:
          output_dict: dict, {
            'reg_onset_output': (frames_num, classes_num),
            'reg_offset_output': (frames_num, classes_num),
            'frame_output': (frames_num, classes_num),
            'velocity_output': (frames_num, classes_num),
            ...}

        Returns:
          est_on_off_note_vels: (events_num, 4), the 4 columns are onset_time,
            offset_time, piano_note and velocity. E.g. [
             [39.74, 39.87, 27, 0.65],
             [11.98, 12.11, 33, 0.69],
             ...]

          est_pedal_on_offs: (pedal_events_num, 2), the 2 columns are onset_time
            and offset_time. E.g. [
             [0.17, 0.96],
             [1.17, 2.65],
             ...]
        """
        # ------ 1. Process regression outputs to binarized outputs ------
        # For example, an onset or offset regression curve of
        # [0., 0., 0.15, 0.30, 0.40, 0.35, 0.20, 0.05, 0., 0.]
        # will be processed to [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]

        # Calculate binarized onset output from regression output
        (onset_output, onset_shift_output) = \
            self.get_binarized_output_from_regression(
                reg_output=output_dict['reg_onset_output'],
                threshold=self.onset_threshold, neighbour=2)

        output_dict['onset_output'] = onset_output  # Values are 0 or 1
        output_dict['onset_shift_output'] = onset_shift_output

        # Calculate binarized offset output from regression output
        (offset_output, offset_shift_output) = \
            self.get_binarized_output_from_regression(
                reg_output=output_dict['reg_offset_output'],
                threshold=self.offset_threshold, neighbour=4)

        output_dict['offset_output'] = offset_output  # Values are 0 or 1
        output_dict['offset_shift_output'] = offset_shift_output

        if 'reg_pedal_onset_output' in output_dict.keys():
            """Pedal onsets are not used in inference. Instead, frame-wise pedal
            predictions are used to detect onsets. We empirically found this to be
            more accurate for detecting pedal onsets."""
            pass

        if 'reg_pedal_offset_output' in output_dict.keys():
            # Calculate binarized pedal offset output from regression output
            (pedal_offset_output, pedal_offset_shift_output) = \
                self.get_binarized_output_from_regression(
                    reg_output=output_dict['reg_pedal_offset_output'],
                    threshold=self.pedal_offset_threshold, neighbour=4)

            output_dict['pedal_offset_output'] = pedal_offset_output  # Values are 0 or 1
            output_dict['pedal_offset_shift_output'] = pedal_offset_shift_output

        # ------ 2. Process matrix results to event results ------
        # Detect piano notes from output_dict
        est_on_off_note_vels = self.output_dict_to_detected_notes(output_dict)

        if 'reg_pedal_onset_output' in output_dict.keys():
            # Detect piano pedals from output_dict
            est_pedal_on_offs = self.output_dict_to_detected_pedals(output_dict)
        else:
            est_pedal_on_offs = None

        return est_on_off_note_vels, est_pedal_on_offs
    def get_binarized_output_from_regression(self, reg_output, threshold, neighbour):
        """Calculate binarized output and shifts of onsets or offsets from the
        regression results.

        Args:
          reg_output: (frames_num, classes_num)
          threshold: float
          neighbour: int

        Returns:
          binary_output: (frames_num, classes_num)
          shift_output: (frames_num, classes_num)
        """
        binary_output = np.zeros_like(reg_output)
        shift_output = np.zeros_like(reg_output)
        (frames_num, classes_num) = reg_output.shape

        for k in range(classes_num):
            x = reg_output[:, k]
            for n in range(neighbour, frames_num - neighbour):
                if x[n] > threshold and self.is_monotonic_neighbour(x, n, neighbour):
                    binary_output[n, k] = 1

                    """See Section III-D in [1] for the deduction.
                    [1] Q. Kong, et al., High-resolution Piano Transcription
                    with Pedals by Regressing Onset and Offset Times, 2020."""
                    if x[n - 1] > x[n + 1]:
                        shift = (x[n + 1] - x[n - 1]) / (x[n] - x[n + 1]) / 2
                    else:
                        shift = (x[n + 1] - x[n - 1]) / (x[n] - x[n - 1]) / 2
                    shift_output[n, k] = shift

        return binary_output, shift_output
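    # Worked example of the peak picking above (values computed by hand, not part of
    # the original module). For a regression curve
    #   x = [0., 0., 0.15, 0.30, 0.40, 0.35, 0.20, 0.05, 0., 0.]
    # with threshold=0.3 and neighbour=2, the only frame that exceeds the threshold
    # and is a monotonic local peak is n=4, so binary_output[4]=1. Since
    # x[3]=0.30 < x[5]=0.35, the else branch gives
    #   shift = (0.35 - 0.30) / (0.40 - 0.30) / 2 = 0.25,
    # i.e. the refined event time is (4 + 0.25) / frames_per_second seconds.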
    def is_monotonic_neighbour(self, x, n, neighbour):
        """Detect whether the values on both sides of x[n] decrease monotonically
        away from x[n].

        Args:
          x: (frames_num,)
          n: int
          neighbour: int

        Returns:
          monotonic: bool
        """
        monotonic = True
        for i in range(neighbour):
            if x[n - i] < x[n - i - 1]:
                monotonic = False
            if x[n + i] < x[n + i + 1]:
                monotonic = False

        return monotonic
    def output_dict_to_detected_notes(self, output_dict):
        """Postprocess output_dict to piano notes.

        Args:
          output_dict: dict, e.g. {
            'onset_output': (frames_num, classes_num),
            'onset_shift_output': (frames_num, classes_num),
            'offset_output': (frames_num, classes_num),
            'offset_shift_output': (frames_num, classes_num),
            'frame_output': (frames_num, classes_num),
            'velocity_output': (frames_num, classes_num),
            ...}

        Returns:
          est_on_off_note_vels: (notes, 4), the four columns are onsets, offsets,
            MIDI notes and velocities. E.g.,
            [[39.7375, 39.7500, 27., 0.6638],
             [11.9824, 12.5000, 33., 0.6892],
             ...]
        """
        est_tuples = []
        est_midi_notes = []
        classes_num = output_dict['frame_output'].shape[-1]

        for piano_note in range(classes_num):
            """Detect piano notes"""
            est_tuples_per_note = note_detection_with_onset_offset_regress(
                frame_output=output_dict['frame_output'][:, piano_note],
                onset_output=output_dict['onset_output'][:, piano_note],
                onset_shift_output=output_dict['onset_shift_output'][:, piano_note],
                offset_output=output_dict['offset_output'][:, piano_note],
                offset_shift_output=output_dict['offset_shift_output'][:, piano_note],
                velocity_output=output_dict['velocity_output'][:, piano_note],
                frame_threshold=self.frame_threshold)

            est_tuples += est_tuples_per_note
            est_midi_notes += [piano_note + self.begin_note] * len(est_tuples_per_note)

        est_tuples = np.array(est_tuples)   # (notes, 5)
        """(notes, 5), the five columns are onset, offset, onset_shift,
        offset_shift and normalized_velocity"""

        est_midi_notes = np.array(est_midi_notes)   # (notes,)

        if len(est_tuples) == 0:
            return np.array([])
        else:
            onset_times = (est_tuples[:, 0] + est_tuples[:, 2]) / self.frames_per_second
            offset_times = (est_tuples[:, 1] + est_tuples[:, 3]) / self.frames_per_second
            velocities = est_tuples[:, 4]

            est_on_off_note_vels = np.stack((onset_times, offset_times, est_midi_notes, velocities), axis=-1)
            """(notes, 4), the four columns are onset_times, offset_times, MIDI notes and velocities."""

            est_on_off_note_vels = est_on_off_note_vels.astype(np.float32)

            return est_on_off_note_vels
    def output_dict_to_detected_pedals(self, output_dict):
        """Postprocess output_dict to piano pedals.

        Args:
          output_dict: dict, e.g. {
            'pedal_frame_output': (frames_num, 1),
            'pedal_offset_output': (frames_num, 1),
            'pedal_offset_shift_output': (frames_num, 1),
            ...}

        Returns:
          est_on_off: (pedal_events_num, 2), the two columns are pedal onsets and
            pedal offsets. E.g.,
            [[0.1800, 0.9669],
             [1.1400, 2.6458],
             ...]
        """
        frames_num = output_dict['pedal_frame_output'].shape[0]

        est_tuples = pedal_detection_with_onset_offset_regress(
            frame_output=output_dict['pedal_frame_output'][:, 0],
            offset_output=output_dict['pedal_offset_output'][:, 0],
            offset_shift_output=output_dict['pedal_offset_shift_output'][:, 0],
            frame_threshold=0.5)

        est_tuples = np.array(est_tuples)
        """(pedal_events_num, 4), the columns are onset, offset, onset_shift and offset_shift"""

        if len(est_tuples) == 0:
            return np.array([])
        else:
            onset_times = (est_tuples[:, 0] + est_tuples[:, 2]) / self.frames_per_second
            offset_times = (est_tuples[:, 1] + est_tuples[:, 3]) / self.frames_per_second
            est_on_off = np.stack((onset_times, offset_times), axis=-1)
            est_on_off = est_on_off.astype(np.float32)
            return est_on_off
    def detected_notes_to_events(self, est_on_off_note_vels):
        """Reformat detected notes to MIDI events.

        Args:
          est_on_off_note_vels: (notes, 4), the four columns are onset_times,
            offset_times, MIDI notes and normalized velocities. E.g.
            [[39.7376, 39.7500, 27., 0.6638],
             [11.9824, 12.5000, 33., 0.6892],
             ...]

        Returns:
          midi_events: list of dict, e.g.,
            [{'onset_time': 39.7376, 'offset_time': 39.75, 'midi_note': 27, 'velocity': 84},
             {'onset_time': 11.9824, 'offset_time': 12.50, 'midi_note': 33, 'velocity': 88},
             ...]
        """
        midi_events = []
        for i in range(est_on_off_note_vels.shape[0]):
            midi_events.append({
                'onset_time': est_on_off_note_vels[i][0],
                'offset_time': est_on_off_note_vels[i][1],
                'midi_note': int(est_on_off_note_vels[i][2]),
                'velocity': int(est_on_off_note_vels[i][3] * self.velocity_scale)})

        return midi_events
    def detected_pedals_to_events(self, pedal_on_offs):
        """Reformat detected pedal onsets and offsets to events.

        Args:
          pedal_on_offs: (pedal_events_num, 2), the two columns are pedal onsets
            and pedal offsets. E.g.,
            [[0.1800, 0.9669],
             [1.1400, 2.6458],
             ...]

        Returns:
          pedal_events: list of dict, e.g.,
            [{'onset_time': 0.1800, 'offset_time': 0.9669},
             {'onset_time': 1.1400, 'offset_time': 2.6458},
             ...]
        """
        pedal_events = []
        for i in range(len(pedal_on_offs)):
            pedal_events.append({
                'onset_time': pedal_on_offs[i, 0],
                'offset_time': pedal_on_offs[i, 1]})

        return pedal_events
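# Usage sketch for the post-processor above. This is an assumed example: the numeric
# values (100 frames per second, 88 piano-note classes) and all thresholds are
# placeholders, and `output_dict` would normally come from a transcription model.
#
# >>> post_processor = RegressionPostProcessor(
# ...     frames_per_second=100, classes_num=88,
# ...     onset_threshold=0.3, offset_threshold=0.3,
# ...     frame_threshold=0.1, pedal_offset_threshold=0.2)
# >>> est_note_events, est_pedal_events = post_processor.output_dict_to_midi_events(output_dict)
# >>> write_events_to_midi(start_time=0., note_events=est_note_events,
# ...                      pedal_events=est_pedal_events, midi_path='est.mid')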
def load_audio(path, sr=22050, mono=True, offset=0.0, duration=None,
    dtype=np.float32, res_type='kaiser_best',
    backends=[audioread.ffdec.FFmpegAudioFile]):
    """Load audio. Copied from librosa.core.load() except that the ffmpeg backend
    is always used in this function."""

    y = []
    with audioread.audio_open(os.path.realpath(path), backends=backends) as input_file:
        sr_native = input_file.samplerate
        n_channels = input_file.channels

        s_start = int(np.round(sr_native * offset)) * n_channels

        if duration is None:
            s_end = np.inf
        else:
            s_end = s_start + (int(np.round(sr_native * duration))
                               * n_channels)

        n = 0

        for frame in input_file:
            frame = librosa.util.buf_to_float(frame, n_bytes=2, dtype=dtype)
            n_prev = n
            n = n + len(frame)

            if n < s_start:
                # offset is after the current frame
                # keep reading
                continue

            if s_end < n_prev:
                # we're off the end. stop reading
                break

            if s_end < n:
                # the end is in this frame. crop.
                frame = frame[:s_end - n_prev]

            if n_prev <= s_start <= n:
                # beginning is in this frame
                frame = frame[(s_start - n_prev):]

            # tack on the current frame
            y.append(frame)

    if y:
        y = np.concatenate(y)

        if n_channels > 1:
            y = y.reshape((-1, n_channels)).T
            if mono:
                y = librosa.to_mono(y)

        if sr is not None:
            y = librosa.resample(y, orig_sr=sr_native, target_sr=sr, res_type=res_type)
        else:
            sr = sr_native

    # Final cleanup for dtype and contiguity
    y = np.ascontiguousarray(y, dtype=dtype)

    return (y, sr)
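
# A minimal, self-contained smoke test of the MIDI write/read round trip above.
# This block is a sketch added for illustration (the note and pedal values are made
# up and are not part of the original module); it only runs when the file is
# executed directly, not on import.
if __name__ == '__main__':
    import tempfile

    demo_note_events = [
        {'midi_note': 60, 'onset_time': 0.50, 'offset_time': 1.00, 'velocity': 80},
        {'midi_note': 64, 'onset_time': 1.00, 'offset_time': 1.50, 'velocity': 70}]
    demo_pedal_events = [{'onset_time': 0.50, 'offset_time': 1.50}]

    # Write the synthetic events to a temporary MIDI file, then parse it back
    with tempfile.NamedTemporaryFile(suffix='.mid', delete=False) as f:
        demo_midi_path = f.name

    write_events_to_midi(start_time=0., note_events=demo_note_events,
        pedal_events=demo_pedal_events, midi_path=demo_midi_path)

    midi_dict = read_midi(demo_midi_path)
    print('Events written and read back:')
    for event, event_time in zip(midi_dict['midi_event'], midi_dict['midi_event_time']):
        print('{:.3f} s  {}'.format(event_time, event))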