import os

import numpy as np
import audioread
import librosa
from mido import MidiFile

from piano_vad import (note_detection_with_onset_offset_regress,
    pedal_detection_with_onset_offset_regress)

import config


def create_folder(fd):
    if not os.path.exists(fd):
        os.makedirs(fd)


def get_filename(path):
    path = os.path.realpath(path)
    na_ext = path.split('/')[-1]
    na = os.path.splitext(na_ext)[0]
    return na


def note_to_freq(piano_note):
    return 2 ** ((piano_note - 39) / 12) * 440


def float32_to_int16(x):
    assert np.max(np.abs(x)) <= 1.
    return (x * 32767.).astype(np.int16)


def int16_to_float32(x):
    return (x / 32767.).astype(np.float32)
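# Worked example of the int16 <-> float32 helpers above (values computed by hand,
# not part of the original module): float32_to_int16(np.array([0., 0.5, -1.]))
# yields [0, 16383, -32767] as int16 (0.5 * 32767 = 16383.5 truncates to 16383),
# and int16_to_float32 maps that back to approximately [0., 0.5, -1.].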
def pad_truncate_sequence(x, max_len):
    if len(x) < max_len:
        return np.concatenate((x, np.zeros(max_len - len(x))))
    else:
        return x[0 : max_len]
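# Usage sketch (hand-made example, not from the original module): sequences shorter
# than max_len are right-padded with zeros and longer ones are truncated, e.g.
# pad_truncate_sequence(np.array([1., 2., 3.]), 5) gives [1., 2., 3., 0., 0.] and
# pad_truncate_sequence(np.arange(10.), 5) gives [0., 1., 2., 3., 4.].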
def read_midi(midi_path):
    """Parse MIDI file.

    Args:
      midi_path: str

    Returns:
      midi_dict: dict, e.g. {
        'midi_event': [
            'program_change channel=0 program=0 time=0',
            'control_change channel=0 control=64 value=127 time=0',
            'control_change channel=0 control=64 value=63 time=236',
            ...],
        'midi_event_time': [0., 0, 0.98307292, ...]}
    """
    midi_file = MidiFile(midi_path)
    ticks_per_beat = midi_file.ticks_per_beat

    assert len(midi_file.tracks) == 2
    """The first track contains tempo, time signature. The second track
    contains piano events."""

    microseconds_per_beat = midi_file.tracks[0][0].tempo
    beats_per_second = 1e6 / microseconds_per_beat
    ticks_per_second = ticks_per_beat * beats_per_second

    message_list = []

    ticks = 0
    time_in_second = []

    for message in midi_file.tracks[1]:
        message_list.append(str(message))
        ticks += message.time
        time_in_second.append(ticks / ticks_per_second)

    midi_dict = {
        'midi_event': np.array(message_list),
        'midi_event_time': np.array(time_in_second)}

    return midi_dict
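# Usage sketch (the file name is hypothetical, not part of the original module):
# >>> midi_dict = read_midi('example_performance.mid')
# >>> midi_dict['midi_event'][:2]       # mido message strings from track 1
# >>> midi_dict['midi_event_time'][:2]  # cumulative event times in seconds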
def write_events_to_midi(start_time, note_events, pedal_events, midi_path):
    """Write out note and pedal events to a MIDI file.

    Args:
      start_time: float
      note_events: list of dict, e.g. [
        {'midi_note': 51, 'onset_time': 696.63544, 'offset_time': 696.9948, 'velocity': 44},
        {'midi_note': 58, 'onset_time': 696.99585, 'offset_time': 697.18646, 'velocity': 50}
        ...]
      pedal_events: list of dict or None, e.g. [
        {'onset_time': 0.17, 'offset_time': 0.96},
        ...]
      midi_path: str
    """
    from mido import Message, MidiFile, MidiTrack, MetaMessage

    # This configuration matches the MIDI files in the MAESTRO dataset
    ticks_per_beat = 384
    beats_per_second = 2
    ticks_per_second = ticks_per_beat * beats_per_second
    microseconds_per_beat = int(1e6 // beats_per_second)

    midi_file = MidiFile()
    midi_file.ticks_per_beat = ticks_per_beat

    # Track 0: tempo and time signature
    track0 = MidiTrack()
    track0.append(MetaMessage('set_tempo', tempo=microseconds_per_beat, time=0))
    track0.append(MetaMessage('time_signature', numerator=4, denominator=4, time=0))
    track0.append(MetaMessage('end_of_track', time=1))
    midi_file.tracks.append(track0)

    # Track 1: note and pedal messages
    track1 = MidiTrack()

    # Collect all note and pedal messages into a single roll
    message_roll = []

    for note_event in note_events:
        # Onset
        message_roll.append({
            'time': note_event['onset_time'],
            'midi_note': note_event['midi_note'],
            'velocity': note_event['velocity']})

        # Offset
        message_roll.append({
            'time': note_event['offset_time'],
            'midi_note': note_event['midi_note'],
            'velocity': 0})

    if pedal_events:
        for pedal_event in pedal_events:
            message_roll.append({'time': pedal_event['onset_time'], 'control_change': 64, 'value': 127})
            message_roll.append({'time': pedal_event['offset_time'], 'control_change': 64, 'value': 0})

    # Sort MIDI messages by time
    message_roll.sort(key=lambda note_event: note_event['time'])

    previous_ticks = 0
    for message in message_roll:
        this_ticks = int((message['time'] - start_time) * ticks_per_second)
        if this_ticks >= 0:
            diff_ticks = this_ticks - previous_ticks
            previous_ticks = this_ticks
            if 'midi_note' in message.keys():
                track1.append(Message('note_on', note=message['midi_note'], velocity=message['velocity'], time=diff_ticks))
            elif 'control_change' in message.keys():
                track1.append(Message('control_change', channel=0, control=message['control_change'], value=message['value'], time=diff_ticks))

    track1.append(MetaMessage('end_of_track', time=1))
    midi_file.tracks.append(track1)

    midi_file.save(midi_path)
class RegressionPostProcessor(object):
    def __init__(self, frames_per_second, classes_num, onset_threshold,
        offset_threshold, frame_threshold, pedal_offset_threshold):
        """Postprocess the output probabilities of a transcription model to MIDI
        events.

        Args:
          frames_per_second: int
          classes_num: int
          onset_threshold: float
          offset_threshold: float
          frame_threshold: float
          pedal_offset_threshold: float
        """
        self.frames_per_second = frames_per_second
        self.classes_num = classes_num
        self.onset_threshold = onset_threshold
        self.offset_threshold = offset_threshold
        self.frame_threshold = frame_threshold
        self.pedal_offset_threshold = pedal_offset_threshold
        self.begin_note = config.begin_note
        self.velocity_scale = config.velocity_scale

    def output_dict_to_midi_events(self, output_dict):
        """Main function. Post process model outputs to MIDI events.

        Args:
          output_dict: {
            'reg_onset_output': (segment_frames, classes_num),
            'reg_offset_output': (segment_frames, classes_num),
            'frame_output': (segment_frames, classes_num),
            'velocity_output': (segment_frames, classes_num),
            'reg_pedal_onset_output': (segment_frames, 1),
            'reg_pedal_offset_output': (segment_frames, 1),
            'pedal_frame_output': (segment_frames, 1)}

        Outputs:
          est_note_events: list of dict, e.g. [
            {'onset_time': 39.74, 'offset_time': 39.87, 'midi_note': 27, 'velocity': 83},
            {'onset_time': 11.98, 'offset_time': 12.11, 'midi_note': 33, 'velocity': 88}]

          est_pedal_events: list of dict, e.g. [
            {'onset_time': 0.17, 'offset_time': 0.96},
            {'onset_time': 1.17, 'offset_time': 2.65}]
        """
        # Post process piano note outputs to piano note and pedal events information
        (est_on_off_note_vels, est_pedal_on_offs) = \
            self.output_dict_to_note_pedal_arrays(output_dict)
        """est_on_off_note_vels: (events_num, 4), the four columns are: [onset_time, offset_time, piano_note, velocity],
        est_pedal_on_offs: (pedal_events_num, 2), the two columns are: [onset_time, offset_time]"""

        # Reformat notes to MIDI events
        est_note_events = self.detected_notes_to_events(est_on_off_note_vels)

        if est_pedal_on_offs is None:
            est_pedal_events = None
        else:
            est_pedal_events = self.detected_pedals_to_events(est_pedal_on_offs)

        return est_note_events, est_pedal_events
    def output_dict_to_note_pedal_arrays(self, output_dict):
        """Postprocess the output probabilities of a transcription model to note
        and pedal arrays.

        Args:
          output_dict: dict, {
            'reg_onset_output': (frames_num, classes_num),
            'reg_offset_output': (frames_num, classes_num),
            'frame_output': (frames_num, classes_num),
            'velocity_output': (frames_num, classes_num),
            ...}

        Returns:
          est_on_off_note_vels: (events_num, 4), the 4 columns are onset_time,
            offset_time, piano_note and velocity. E.g. [
             [39.74, 39.87, 27, 0.65],
             [11.98, 12.11, 33, 0.69],
             ...]

          est_pedal_on_offs: (pedal_events_num, 2), the 2 columns are onset_time
            and offset_time. E.g. [
             [0.17, 0.96],
             [1.17, 2.65],
             ...]
        """
        # ------ 1. Process regression outputs to binarized outputs ------
        # For example, an onset or offset regression curve of
        # [0., 0., 0.15, 0.30, 0.40, 0.35, 0.20, 0.05, 0., 0.]
        # will be processed to [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]

        # Calculate binarized onset output from regression output
        (onset_output, onset_shift_output) = \
            self.get_binarized_output_from_regression(
                reg_output=output_dict['reg_onset_output'],
                threshold=self.onset_threshold, neighbour=2)

        output_dict['onset_output'] = onset_output  # Values are 0 or 1
        output_dict['onset_shift_output'] = onset_shift_output

        # Calculate binarized offset output from regression output
        (offset_output, offset_shift_output) = \
            self.get_binarized_output_from_regression(
                reg_output=output_dict['reg_offset_output'],
                threshold=self.offset_threshold, neighbour=4)

        output_dict['offset_output'] = offset_output  # Values are 0 or 1
        output_dict['offset_shift_output'] = offset_shift_output

        if 'reg_pedal_onset_output' in output_dict.keys():
            """Pedal onsets are not used in inference. Instead, frame-wise pedal
            predictions are used to detect onsets. We empirically found this to be
            more accurate for detecting pedal onsets."""
            pass

        if 'reg_pedal_offset_output' in output_dict.keys():
            # Calculate binarized pedal offset output from regression output
            (pedal_offset_output, pedal_offset_shift_output) = \
                self.get_binarized_output_from_regression(
                    reg_output=output_dict['reg_pedal_offset_output'],
                    threshold=self.pedal_offset_threshold, neighbour=4)

            output_dict['pedal_offset_output'] = pedal_offset_output  # Values are 0 or 1
            output_dict['pedal_offset_shift_output'] = pedal_offset_shift_output

        # ------ 2. Process matrix results to event results ------
        # Detect piano notes from output_dict
        est_on_off_note_vels = self.output_dict_to_detected_notes(output_dict)

        if 'reg_pedal_onset_output' in output_dict.keys():
            # Detect piano pedals from output_dict
            est_pedal_on_offs = self.output_dict_to_detected_pedals(output_dict)
        else:
            est_pedal_on_offs = None

        return est_on_off_note_vels, est_pedal_on_offs
    def get_binarized_output_from_regression(self, reg_output, threshold, neighbour):
        """Calculate binarized output and shifts of onsets or offsets from the
        regression results.

        Args:
          reg_output: (frames_num, classes_num)
          threshold: float
          neighbour: int

        Returns:
          binary_output: (frames_num, classes_num)
          shift_output: (frames_num, classes_num)
        """
        binary_output = np.zeros_like(reg_output)
        shift_output = np.zeros_like(reg_output)
        (frames_num, classes_num) = reg_output.shape

        for k in range(classes_num):
            x = reg_output[:, k]
            for n in range(neighbour, frames_num - neighbour):
                if x[n] > threshold and self.is_monotonic_neighbour(x, n, neighbour):
                    binary_output[n, k] = 1

                    """See Section III-D in [1] for the deduction.
                    [1] Q. Kong, et al., High-resolution Piano Transcription
                    with Pedals by Regressing Onset and Offset Times, 2020."""
                    if x[n - 1] > x[n + 1]:
                        shift = (x[n + 1] - x[n - 1]) / (x[n] - x[n + 1]) / 2
                    else:
                        shift = (x[n + 1] - x[n - 1]) / (x[n] - x[n - 1]) / 2
                    shift_output[n, k] = shift

        return binary_output, shift_output
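    # Worked example of the peak picking above (values computed by hand, not part of
    # the original module). For a regression curve
    #   x = [0., 0., 0.15, 0.30, 0.40, 0.35, 0.20, 0.05, 0., 0.]
    # with threshold=0.3 and neighbour=2, the only frame that exceeds the threshold
    # and is a monotonic local peak is n=4, so binary_output[4]=1. Since
    # x[3]=0.30 < x[5]=0.35, the else branch gives
    #   shift = (0.35 - 0.30) / (0.40 - 0.30) / 2 = 0.25,
    # i.e. the refined event time is (4 + 0.25) / frames_per_second seconds.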
    def is_monotonic_neighbour(self, x, n, neighbour):
        """Detect whether the values on both sides of x[n] decrease monotonically
        away from x[n].

        Args:
          x: (frames_num,)
          n: int
          neighbour: int

        Returns:
          monotonic: bool
        """
        monotonic = True
        for i in range(neighbour):
            if x[n - i] < x[n - i - 1]:
                monotonic = False
            if x[n + i] < x[n + i + 1]:
                monotonic = False

        return monotonic
    def output_dict_to_detected_notes(self, output_dict):
        """Postprocess output_dict to piano notes.

        Args:
          output_dict: dict, e.g. {
            'onset_output': (frames_num, classes_num),
            'onset_shift_output': (frames_num, classes_num),
            'offset_output': (frames_num, classes_num),
            'offset_shift_output': (frames_num, classes_num),
            'frame_output': (frames_num, classes_num),
            'velocity_output': (frames_num, classes_num),
            ...}

        Returns:
          est_on_off_note_vels: (notes, 4), the four columns are onsets, offsets,
            MIDI notes and velocities. E.g.,
            [[39.7375, 39.7500, 27., 0.6638],
             [11.9824, 12.5000, 33., 0.6892],
             ...]
        """
        est_tuples = []
        est_midi_notes = []
        classes_num = output_dict['frame_output'].shape[-1]

        for piano_note in range(classes_num):
            """Detect piano notes"""
            est_tuples_per_note = note_detection_with_onset_offset_regress(
                frame_output=output_dict['frame_output'][:, piano_note],
                onset_output=output_dict['onset_output'][:, piano_note],
                onset_shift_output=output_dict['onset_shift_output'][:, piano_note],
                offset_output=output_dict['offset_output'][:, piano_note],
                offset_shift_output=output_dict['offset_shift_output'][:, piano_note],
                velocity_output=output_dict['velocity_output'][:, piano_note],
                frame_threshold=self.frame_threshold)

            est_tuples += est_tuples_per_note
            est_midi_notes += [piano_note + self.begin_note] * len(est_tuples_per_note)

        est_tuples = np.array(est_tuples)   # (notes, 5)
        """(notes, 5), the five columns are onset, offset, onset_shift,
        offset_shift and normalized_velocity"""

        est_midi_notes = np.array(est_midi_notes)   # (notes,)

        if len(est_tuples) == 0:
            return np.array([])
        else:
            onset_times = (est_tuples[:, 0] + est_tuples[:, 2]) / self.frames_per_second
            offset_times = (est_tuples[:, 1] + est_tuples[:, 3]) / self.frames_per_second
            velocities = est_tuples[:, 4]

            est_on_off_note_vels = np.stack((onset_times, offset_times, est_midi_notes, velocities), axis=-1)
            """(notes, 4), the four columns are onset_times, offset_times, MIDI notes and velocities."""

            est_on_off_note_vels = est_on_off_note_vels.astype(np.float32)

            return est_on_off_note_vels
    def output_dict_to_detected_pedals(self, output_dict):
        """Postprocess output_dict to piano pedals.

        Args:
          output_dict: dict, e.g. {
            'pedal_frame_output': (frames_num, 1),
            'pedal_offset_output': (frames_num, 1),
            'pedal_offset_shift_output': (frames_num, 1),
            ...}

        Returns:
          est_on_off: (pedal_events_num, 2), the two columns are pedal onsets and
            pedal offsets. E.g.,
            [[0.1800, 0.9669],
             [1.1400, 2.6458],
             ...]
        """
        frames_num = output_dict['pedal_frame_output'].shape[0]

        est_tuples = pedal_detection_with_onset_offset_regress(
            frame_output=output_dict['pedal_frame_output'][:, 0],
            offset_output=output_dict['pedal_offset_output'][:, 0],
            offset_shift_output=output_dict['pedal_offset_shift_output'][:, 0],
            frame_threshold=0.5)

        est_tuples = np.array(est_tuples)
        """(pedal_events_num, 4), the columns are onset, offset, onset_shift and offset_shift"""

        if len(est_tuples) == 0:
            return np.array([])
        else:
            onset_times = (est_tuples[:, 0] + est_tuples[:, 2]) / self.frames_per_second
            offset_times = (est_tuples[:, 1] + est_tuples[:, 3]) / self.frames_per_second
            est_on_off = np.stack((onset_times, offset_times), axis=-1)
            est_on_off = est_on_off.astype(np.float32)
            return est_on_off
    def detected_notes_to_events(self, est_on_off_note_vels):
        """Reformat detected notes to MIDI events.

        Args:
          est_on_off_note_vels: (notes, 4), the four columns are onset_times,
            offset_times, MIDI notes and normalized velocities. E.g.
            [[39.7376, 39.7500, 27., 0.6638],
             [11.9824, 12.5000, 33., 0.6892],
             ...]

        Returns:
          midi_events: list of dict, e.g.,
            [{'onset_time': 39.7376, 'offset_time': 39.75, 'midi_note': 27, 'velocity': 84},
             {'onset_time': 11.9824, 'offset_time': 12.50, 'midi_note': 33, 'velocity': 88},
             ...]
        """
        midi_events = []
        for i in range(est_on_off_note_vels.shape[0]):
            midi_events.append({
                'onset_time': est_on_off_note_vels[i][0],
                'offset_time': est_on_off_note_vels[i][1],
                'midi_note': int(est_on_off_note_vels[i][2]),
                'velocity': int(est_on_off_note_vels[i][3] * self.velocity_scale)})

        return midi_events
    def detected_pedals_to_events(self, pedal_on_offs):
        """Reformat detected pedal onsets and offsets to events.

        Args:
          pedal_on_offs: (pedal_events_num, 2), the two columns are pedal onsets
            and pedal offsets. E.g.,
            [[0.1800, 0.9669],
             [1.1400, 2.6458],
             ...]

        Returns:
          pedal_events: list of dict, e.g.,
            [{'onset_time': 0.1800, 'offset_time': 0.9669},
             {'onset_time': 1.1400, 'offset_time': 2.6458},
             ...]
        """
        pedal_events = []
        for i in range(len(pedal_on_offs)):
            pedal_events.append({
                'onset_time': pedal_on_offs[i, 0],
                'offset_time': pedal_on_offs[i, 1]})

        return pedal_events
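# Usage sketch for the post-processor above. This is an assumed example: the numeric
# values (100 frames per second, 88 piano-note classes) and all thresholds are
# placeholders, and `output_dict` would normally come from a transcription model.
#
# >>> post_processor = RegressionPostProcessor(
# ...     frames_per_second=100, classes_num=88,
# ...     onset_threshold=0.3, offset_threshold=0.3,
# ...     frame_threshold=0.1, pedal_offset_threshold=0.2)
# >>> est_note_events, est_pedal_events = post_processor.output_dict_to_midi_events(output_dict)
# >>> write_events_to_midi(start_time=0., note_events=est_note_events,
# ...                      pedal_events=est_pedal_events, midi_path='est.mid')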
def load_audio(path, sr=22050, mono=True, offset=0.0, duration=None,
    dtype=np.float32, res_type='kaiser_best',
    backends=[audioread.ffdec.FFmpegAudioFile]):
    """Load audio. Copied from librosa.core.load() except that the ffmpeg backend
    is always used in this function."""

    y = []
    with audioread.audio_open(os.path.realpath(path), backends=backends) as input_file:
        sr_native = input_file.samplerate
        n_channels = input_file.channels

        s_start = int(np.round(sr_native * offset)) * n_channels

        if duration is None:
            s_end = np.inf
        else:
            s_end = s_start + (int(np.round(sr_native * duration))
                               * n_channels)

        n = 0

        for frame in input_file:
            frame = librosa.util.buf_to_float(frame, n_bytes=2, dtype=dtype)
            n_prev = n
            n = n + len(frame)

            if n < s_start:
                # offset is after the current frame
                # keep reading
                continue

            if s_end < n_prev:
                # we're off the end. stop reading
                break

            if s_end < n:
                # the end is in this frame. crop.
                frame = frame[:s_end - n_prev]

            if n_prev <= s_start <= n:
                # beginning is in this frame
                frame = frame[(s_start - n_prev):]

            # tack on the current frame
            y.append(frame)

    if y:
        y = np.concatenate(y)

        if n_channels > 1:
            y = y.reshape((-1, n_channels)).T
            if mono:
                y = librosa.to_mono(y)

        if sr is not None:
            y = librosa.resample(y, orig_sr=sr_native, target_sr=sr, res_type=res_type)
        else:
            sr = sr_native

    # Final cleanup for dtype and contiguity
    y = np.ascontiguousarray(y, dtype=dtype)

    return (y, sr)
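
# A minimal, self-contained smoke test of the MIDI write/read round trip above.
# This block is a sketch added for illustration (the note and pedal values are made
# up and are not part of the original module); it only runs when the file is
# executed directly, not on import.
if __name__ == '__main__':
    import tempfile

    demo_note_events = [
        {'midi_note': 60, 'onset_time': 0.50, 'offset_time': 1.00, 'velocity': 80},
        {'midi_note': 64, 'onset_time': 1.00, 'offset_time': 1.50, 'velocity': 70}]
    demo_pedal_events = [{'onset_time': 0.50, 'offset_time': 1.50}]

    # Write the synthetic events to a temporary MIDI file, then parse it back
    with tempfile.NamedTemporaryFile(suffix='.mid', delete=False) as f:
        demo_midi_path = f.name

    write_events_to_midi(start_time=0., note_events=demo_note_events,
        pedal_events=demo_pedal_events, midi_path=demo_midi_path)

    midi_dict = read_midi(demo_midi_path)
    print('Events written and read back:')
    for event, event_time in zip(midi_dict['midi_event'], midi_dict['midi_event_time']):
        print('{:.3f} s  {}'.format(event_time, event))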