# Audio2Melody / utils_violin_transcript.py
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.cm import ScalarMappable
from matplotlib.patches import Rectangle
from librosa.sequence import viterbi_discriminative, dtw
from librosa import note_to_hz, midi_to_hz
from numba import jit
from scipy.stats import norm
from scipy.ndimage import gaussian_filter1d
from scipy.signal import medfilt, upfirdn, argrelmax
from torchaudio.models.conformer import ConformerLayer
from torchaudio import load as torchaudio_load
from torchaudio.functional import resample as torchaudio_functional_resample
from torch import (cat as torch_cat, load as torch_load, Tensor as torch_Tensor, from_numpy as torch_from_numpy,
                   no_grad as torch_no_grad, mean as torch_mean, std as torch_std, sigmoid as torch_sigmoid,
                   nan_to_num as torch_nan_to_num, nn)
from sklearn.metrics.pairwise import euclidean_distances
from mir_eval.melody import hz2cents
from pretty_midi import PrettyMIDI, Instrument, Note, PitchBend, instrument_name_to_program, note_name_to_number
from time import perf_counter
from collections import defaultdict
from typing import DefaultDict, Dict, List, Optional, Tuple
from pathlib import Path
from mido import MidiFile, MidiTrack
class PitchEstimator(nn.Module):
"""
This is the base class that everything else inherits from. The hierarchy is:
PitchEstimator -> Transcriber -> Synchronizer -> AutonomousAgent -> The n-Head Music Performance Analysis Models
PitchEstimator can handle reading the audio, predicting all the features,
estimating a single frame level f0 using viterbi, or
MIDI pitch bend creation for the predicted note events when used inside a Transcriber, or
score-informed f0 estimation when used inside a Synchronizer.
"""
def __init__(self, labeling, instrument='Violin', sr=16000, window_size=1024, hop_length=160):
super().__init__()
self.labeling = labeling
self.sr = sr
self.window_size = window_size
self.hop_length = hop_length
self.instrument = instrument
self.f0_bins_per_semitone = int(np.round(100/self.labeling.f0_granularity_c))
def read_audio(self, audio):
"""
Read and resample an audio file, convert to mono, and unfold into representation frames.
The time array represents the center of each small frame with 5.8ms hop length. This is different than the chunk
level frames. The chunk level frames represent the entire sequence the model sees. Whereas it predicts with the
small frames intervals (5.8ms).
:param audio: str, pathlib.Path, np.ndarray, or torch.Tensor
:return: frames: (n_big_frames, frame_length), times: (n_small_frames,)
"""
if isinstance(audio, str) or isinstance(audio, Path):
audio, sample_rate = torchaudio_load(audio, normalize=True)
audio = audio.mean(axis=0) # convert to mono
if sample_rate != self.sr:
audio = torchaudio_functional_resample(audio, sample_rate, self.sr)
elif isinstance(audio, np.ndarray):
audio = torch_from_numpy(audio)
else:
assert isinstance(audio, torch_Tensor)
len_audio = audio.shape[-1]
n_frames = int(np.ceil((len_audio + sum(self.frame_overlap)) / (self.hop_length * self.chunk_size)))
audio = nn.functional.pad(audio, (self.frame_overlap[0],
self.frame_overlap[1] + (n_frames * self.hop_length * self.chunk_size) - len_audio))
frames = audio.unfold(0, self.max_window_size, self.hop_length*self.chunk_size)
times = np.arange(0, len_audio, self.hop_length) / self.sr # not tensor, we don't compute anything with it
return frames, times
def predict(self, audio, batch_size):
frames, times = self.read_audio(audio)
performance = {'f0': [], 'note': [], 'onset': [], 'offset': []}
self.eval()
device = self.main.conv0.conv2d.weight.device
with torch_no_grad():
for i in range(0, len(frames), batch_size):
f = frames[i:min(i + batch_size, len(frames))].to(device)
f -= (torch_mean(f, axis=1).unsqueeze(-1))
f /= (torch_std(f, axis=1).unsqueeze(-1))
out = self.forward(f)
for key, value in out.items():
value = torch_sigmoid(value)
value = torch_nan_to_num(value) # the model outputs nan when the frame is silent (this is an expected behavior due to normalization)
value = value.view(-1, value.shape[-1])
value = value.detach().cpu().numpy()
performance[key].append(value)
performance = {key: np.concatenate(value, axis=0)[:len(times)] for key, value in performance.items()}
performance['time'] = times
return performance
def estimate_pitch(self, audio, batch_size, viterbi=False):
out = self.predict(audio, batch_size)
f0_hz = self.out2f0(out, viterbi)
return out['time'], f0_hz
def out2f0(self, out, viterbi=False):
"""
Monophonic f0 estimation from the model output. The viterbi postprocessing is specialized for the violin family.
"""
salience = out['f0']
if viterbi == 'constrained':
assert hasattr(self, 'out2note')
notes = spotify_create_notes( out["note"], out["onset"], note_low=self.labeling.midi_centers[0],
note_high=self.labeling.midi_centers[-1], onset_thresh=0.5, frame_thresh=0.3,
infer_onsets=True, melodia_trick=True,
min_note_len=int(np.round(127.70 / 1000 * (self.sr / self.hop_length))))
note_cents = self.get_pitch_bends(salience, notes, to_midi=False, timing_refinement_range=0)
cents = np.zeros_like(out['time'])
cents[note_cents[:,0].astype(int)] = note_cents[:,1]
elif viterbi:
# transition probabilities inducing continuous pitch
# big changes are penalized with one order of magnitude
transition = gaussian_filter1d(np.eye(self.labeling.f0_n_bins), 30) + 99 * gaussian_filter1d(
np.eye(self.labeling.f0_n_bins), 2)
transition = transition / np.sum(transition, axis=1)[:, None]
p = salience / salience.sum(axis=1)[:, None]
p[np.isnan(p.sum(axis=1)), :] = np.ones(self.labeling.f0_n_bins) * 1 / self.labeling.f0_n_bins
path = viterbi_discriminative(p.T, transition)
cents = np.array([self.labeling.f0_label2c(salience[i, :], path[i]) for i in range(len(path))])
else:
cents = self.labeling.f0_label2c(salience, center=None) # use argmax for center
f0_hz = self.labeling.f0_c2hz(cents)
f0_hz[np.isnan(f0_hz)] = 0
return f0_hz
def get_pitch_bends(
self,
contours: np.ndarray, note_events: List[Tuple[int, int, int, float]],
timing_refinement_range: int = 0, to_midi: bool = True,
) -> List[Tuple[int, int, int, float, Optional[List[int]]]]:
"""Modified version of an excellent script from Spotify/basic_pitch!! Thank you!!!!
Given note events and contours, estimate pitch bends per note.
Pitch bends are represented as a sequence of evenly spaced midi pitch bend control units.
The time stamps of each pitch bend can be inferred by computing an evenly spaced grid between
the start and end times of each note event.
Args:
contours: Matrix of estimated pitch contours
note_events: note event tuple
timing_refinement_range: if > 0, refine onset/offset boundaries with f0 confidence
to_midi: whether to convert pitch bends to midi pitch bends. If False, return pitch estimates in the format
[time (index), pitch (Hz), confidence in range [0, 1]].
Returns:
note events with pitch bends
"""
f0_matrix = [] # [time (index), pitch (Hz), confidence in range [0, 1]]
note_events_with_pitch_bends = []
for start_idx, end_idx, pitch_midi, amplitude in note_events:
if timing_refinement_range:
start_idx = np.max([0, start_idx - timing_refinement_range])
end_idx = np.min([contours.shape[0], end_idx + timing_refinement_range])
freq_idx = int(np.round(self.midi_pitch_to_contour_bin(pitch_midi)))
freq_start_idx = np.max([freq_idx - self.labeling.f0_tolerance_bins, 0])
freq_end_idx = np.min([self.labeling.f0_n_bins, freq_idx + self.labeling.f0_tolerance_bins + 1])
trans_start_idx = np.max([0, self.labeling.f0_tolerance_bins - freq_idx])
trans_end_idx = (2 * self.labeling.f0_tolerance_bins + 1) - \
np.max([0, freq_idx - (self.labeling.f0_n_bins - self.labeling.f0_tolerance_bins - 1)])
# apply regional viterbi to estimate the intonation
# observation probabilities come from the f0_roll matrix
observation = contours[start_idx:end_idx, freq_start_idx:freq_end_idx]
observation = observation / observation.sum(axis=1)[:, None]
observation[np.isnan(observation.sum(axis=1)), :] = np.ones(freq_end_idx - freq_start_idx) * 1 / (
freq_end_idx - freq_start_idx)
# transition probabilities assure continuity
transition = self.labeling.f0_transition_matrix[trans_start_idx:trans_end_idx,
trans_start_idx:trans_end_idx] + 1e-6
transition = transition / np.sum(transition, axis=1)[:, None]
path = viterbi_discriminative(observation.T / observation.sum(axis=1), transition) + freq_start_idx
cents = np.array([self.labeling.f0_label2c(contours[i + start_idx, :], path[i]) for i in range(len(path))])
bends = cents - self.labeling.midi_centers_c[pitch_midi - self.labeling.midi_centers[0]]
if to_midi:
bends = (bends * 4096 / 100).astype(int)
bends[bends > 8191] = 8191
bends[bends < -8192] = -8192
if timing_refinement_range:
confidences = np.array([contours[i + start_idx, path[i]] for i in range(len(path))])
threshold = np.median(confidences)
                    threshold = (np.median(confidences > threshold) + threshold) / 2  # blend the threshold with a majority-above-threshold indicator
                    median_kernel = 2 * (timing_refinement_range // 2) + 1  # force an odd median-filter kernel size
confidences = medfilt(confidences, kernel_size=median_kernel)
conf_bool = confidences > threshold
onset_idx = np.argmax(conf_bool)
offset_idx = len(confidences) - np.argmax(conf_bool[::-1])
bends = bends[onset_idx:offset_idx]
start_idx = start_idx + onset_idx
end_idx = start_idx + offset_idx
note_events_with_pitch_bends.append((start_idx, end_idx, pitch_midi, amplitude, bends))
else:
confidences = np.array([contours[i + start_idx, path[i]] for i in range(len(path))])
time_idx = np.arange(len(path)) + start_idx
# f0_hz = self.labeling.f0_c2hz(cents)
possible_f0s = np.array([time_idx, cents, confidences]).T
f0_matrix.append(possible_f0s[np.abs(bends)<100]) # filter out pitch bends that are too large
if not to_midi:
return np.vstack(f0_matrix)
else:
return note_events_with_pitch_bends
def midi_pitch_to_contour_bin(self, pitch_midi: int) -> np.array:
"""Convert midi pitch to corresponding index in contour matrix
Args:
pitch_midi: pitch in midi
Returns:
index in contour matrix
"""
pitch_hz = midi_to_hz(pitch_midi)
return np.argmin(np.abs(self.labeling.f0_centers_hz - pitch_hz))
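# Illustrative, self-contained sketch (not called anywhere in this module): the smoothed
# transition matrix and Viterbi decoding used in PitchEstimator.out2f0 above, run on a
# synthetic salience matrix. The bin count and salience values are assumptions made only for this demo.
def _demo_viterbi_f0_decoding(n_frames: int = 50, n_bins: int = 100) -> np.ndarray:
    rng = np.random.default_rng(0)
    salience = rng.random((n_frames, n_bins)) * 0.1
    salience[:, 40] = 1.0  # a synthetic, stable pitch track sitting in bin 40
    # continuity-inducing transitions: a wide Gaussian plus a heavily weighted narrow one
    transition = gaussian_filter1d(np.eye(n_bins), 30) + 99 * gaussian_filter1d(np.eye(n_bins), 2)
    transition = transition / transition.sum(axis=1)[:, None]
    p = salience / salience.sum(axis=1)[:, None]  # normalize each frame to a distribution
    return viterbi_discriminative(p.T, transition)  # most likely bin per frame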
# SPOTIFY
def get_inferred_onsets(onset_roll: np.array, note_roll: np.array, n_diff: int = 2) -> np.array:
"""
Infer onsets from large changes in note roll matrix amplitudes.
Modified from https://github.com/spotify/basic-pitch/blob/main/basic_pitch/note_creation.py
:param onset_roll: Onset activation matrix (n_times, n_freqs).
:param note_roll: Frame-level note activation matrix (n_times, n_freqs).
    :param n_diff: Number of time-lag differences used to detect onsets.
    :return: Onset activations: the element-wise maximum of the predicted onsets and the rescaled frame differences.
"""
diffs = []
for n in range(1, n_diff + 1):
frames_appended = np.concatenate([np.zeros((n, note_roll.shape[1])), note_roll])
diffs.append(frames_appended[n:, :] - frames_appended[:-n, :])
frame_diff = np.min(diffs, axis=0)
frame_diff[frame_diff < 0] = 0
frame_diff[:n_diff, :] = 0
frame_diff = np.max(onset_roll) * frame_diff / np.max(frame_diff) # rescale to have the same max as onsets
max_onsets_diff = np.max([onset_roll, frame_diff],
axis=0) # use the max of the predicted onsets and the differences
return max_onsets_diff
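# Minimal, self-contained sketch (not called in this module): get_inferred_onsets on a tiny
# synthetic note roll, where a sharp jump in frame activation produces an inferred onset.
# The shapes and values are assumptions chosen only for this demo.
def _demo_inferred_onsets() -> np.ndarray:
    note_roll = np.zeros((10, 3))
    note_roll[4:, 1] = 0.9                  # note activation appears abruptly at frame 4, bin 1
    onset_roll = np.zeros_like(note_roll)
    onset_roll[7, 0] = 0.3                  # a weak onset predicted elsewhere by the onset head
    # the jump at frame 4 is rescaled to the onset head's maximum and kept via the element-wise max
    return get_inferred_onsets(onset_roll, note_roll)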
def spotify_create_notes(
note_roll: np.array,
onset_roll: np.array,
onset_thresh: float,
frame_thresh: float,
min_note_len: int,
infer_onsets: bool,
note_low : int, #self.labeling.midi_centers[0]
note_high : int, #self.labeling.midi_centers[-1],
melodia_trick: bool = True,
energy_tol: int = 11,
) -> List[Tuple[int, int, int, float]]:
"""Decode raw model output to polyphonic note events
Modified from https://github.com/spotify/basic-pitch/blob/main/basic_pitch/note_creation.py
Args:
note_roll: Frame activation matrix (n_times, n_freqs).
onset_roll: Onset activation matrix (n_times, n_freqs).
onset_thresh: Minimum amplitude of an onset activation to be considered an onset.
frame_thresh: Minimum amplitude of a frame activation for a note to remain "on".
min_note_len: Minimum allowed note length in frames.
infer_onsets: If True, add additional onsets when there are large differences in frame amplitudes.
melodia_trick : Whether to use the melodia trick to better detect notes.
        energy_tol: Number of consecutive frames below the frame threshold tolerated before a note is ended.
Returns:
list of tuples [(start_time_frames, end_time_frames, pitch_midi, amplitude)]
representing the note events, where amplitude is a number between 0 and 1
"""
n_frames = note_roll.shape[0]
# use onsets inferred from frames in addition to the predicted onsets
if infer_onsets:
onset_roll = get_inferred_onsets(onset_roll, note_roll)
peak_thresh_mat = np.zeros(onset_roll.shape)
peaks = argrelmax(onset_roll, axis=0)
peak_thresh_mat[peaks] = onset_roll[peaks]
onset_idx = np.where(peak_thresh_mat >= onset_thresh)
onset_time_idx = onset_idx[0][::-1] # sort to go backwards in time
onset_freq_idx = onset_idx[1][::-1] # sort to go backwards in time
remaining_energy = np.zeros(note_roll.shape)
remaining_energy[:, :] = note_roll[:, :]
# loop over onsets
note_events = []
for note_start_idx, freq_idx in zip(onset_time_idx, onset_freq_idx):
# if we're too close to the end of the audio, continue
if note_start_idx >= n_frames - 1:
continue
# find time index at this frequency band where the frames drop below an energy threshold
i = note_start_idx + 1
k = 0 # number of frames since energy dropped below threshold
while i < n_frames - 1 and k < energy_tol:
if remaining_energy[i, freq_idx] < frame_thresh:
k += 1
else:
k = 0
i += 1
i -= k # go back to frame above threshold
# if the note is too short, skip it
if i - note_start_idx <= min_note_len:
continue
remaining_energy[note_start_idx:i, freq_idx] = 0
if freq_idx < note_high:
remaining_energy[note_start_idx:i, freq_idx + 1] = 0
if freq_idx > note_low:
remaining_energy[note_start_idx:i, freq_idx - 1] = 0
# add the note
amplitude = np.mean(note_roll[note_start_idx:i, freq_idx])
note_events.append(
(
note_start_idx,
i,
freq_idx + note_low,
amplitude,
)
)
if melodia_trick:
energy_shape = remaining_energy.shape
while np.max(remaining_energy) > frame_thresh:
i_mid, freq_idx = np.unravel_index(np.argmax(remaining_energy), energy_shape)
remaining_energy[i_mid, freq_idx] = 0
# forward pass
i = i_mid + 1
k = 0
while i < n_frames - 1 and k < energy_tol:
if remaining_energy[i, freq_idx] < frame_thresh:
k += 1
else:
k = 0
remaining_energy[i, freq_idx] = 0
if freq_idx < note_high:
remaining_energy[i, freq_idx + 1] = 0
if freq_idx > note_low:
remaining_energy[i, freq_idx - 1] = 0
i += 1
i_end = i - 1 - k # go back to frame above threshold
# backward pass
i = i_mid - 1
k = 0
while i > 0 and k < energy_tol:
if remaining_energy[i, freq_idx] < frame_thresh:
k += 1
else:
k = 0
remaining_energy[i, freq_idx] = 0
if freq_idx < note_high:
remaining_energy[i, freq_idx + 1] = 0
if freq_idx > note_low:
remaining_energy[i, freq_idx - 1] = 0
i -= 1
i_start = i + 1 + k # go back to frame above threshold
assert i_start >= 0, "{}".format(i_start)
assert i_end < n_frames
if i_end - i_start <= min_note_len:
# note is too short, skip it
continue
# add the note
amplitude = np.mean(note_roll[i_start:i_end, freq_idx])
note_events.append(
(
i_start,
i_end,
freq_idx + note_low,
amplitude,
)
)
return note_events
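# Illustrative sketch (never called here): decoding one synthetic note with spotify_create_notes.
# The thresholds, MIDI range and the 200x48 activation shape are assumptions chosen for the demo only.
def _demo_spotify_note_decoding() -> List[Tuple[int, int, int, float]]:
    n_frames, n_bins = 200, 48
    note_low, note_high = 55, 55 + n_bins - 1
    note_roll = np.zeros((n_frames, n_bins))
    onset_roll = np.zeros((n_frames, n_bins))
    onset_roll[50, 10] = 1.0          # a clear onset at frame 50 in bin 10
    note_roll[50:120, 10] = 0.9       # frames stay active until frame 120
    # expected result: a single note event (50, 120, 65, 0.9)
    return spotify_create_notes(note_roll, onset_roll, onset_thresh=0.5, frame_thresh=0.3,
                                min_note_len=11, infer_onsets=False, note_low=note_low,
                                note_high=note_high, melodia_trick=False)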
# TIKTOK
def note_detection_with_onset_offset_regress(frame_output, onset_output,
onset_shift_output, offset_output, offset_shift_output, velocity_output,
frame_threshold):
"""Process prediction matrices to note events information.
First, detect onsets with onset outputs. Then, detect offsets
with frame and offset outputs.
Args:
frame_output: (frames_num,)
onset_output: (frames_num,)
onset_shift_output: (frames_num,)
offset_output: (frames_num,)
offset_shift_output: (frames_num,)
velocity_output: (frames_num,)
frame_threshold: float
Returns:
output_tuples: list of [bgn, fin, onset_shift, offset_shift, normalized_velocity],
e.g., [
[1821, 1909, 0.47498, 0.3048533, 0.72119445],
[1909, 1947, 0.30730522, -0.45764327, 0.64200014],
...]
"""
output_tuples = []
bgn = None
frame_disappear = None
offset_occur = None
for i in range(onset_output.shape[0]):
if onset_output[i] == 1:
"""Onset detected"""
if bgn:
"""Consecutive onsets. E.g., pedal is not released, but two
consecutive notes being played."""
fin = max(i - 1, 0)
output_tuples.append([bgn, fin, onset_shift_output[bgn],
0, velocity_output[bgn]])
frame_disappear, offset_occur = None, None
bgn = i
if bgn and i > bgn:
"""If onset found, then search offset"""
if frame_output[i] <= frame_threshold and not frame_disappear:
"""Frame disappear detected"""
frame_disappear = i
if offset_output[i] == 1 and not offset_occur:
"""Offset detected"""
offset_occur = i
if frame_disappear:
if offset_occur and offset_occur - bgn > frame_disappear - offset_occur:
"""bgn --------- offset_occur --- frame_disappear"""
fin = offset_occur
else:
"""bgn --- offset_occur --------- frame_disappear"""
fin = frame_disappear
output_tuples.append([bgn, fin, onset_shift_output[bgn],
offset_shift_output[fin], velocity_output[bgn]])
bgn, frame_disappear, offset_occur = None, None, None
if bgn and (i - bgn >= 600 or i == onset_output.shape[0] - 1):
"""Offset not detected"""
fin = i
output_tuples.append([bgn, fin, onset_shift_output[bgn],
offset_shift_output[fin], velocity_output[bgn]])
bgn, frame_disappear, offset_occur = None, None, None
# Sort pairs by onsets
output_tuples.sort(key=lambda pair: pair[0])
return output_tuples
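# Minimal sketch (never called here): one synthetic note run through
# note_detection_with_onset_offset_regress. All array contents are demo assumptions.
def _demo_note_detection_regress() -> List[list]:
    n = 40
    frame_output = np.zeros(n)
    frame_output[10:25] = 0.9          # frames active between 10 and 24
    onset_output = np.zeros(n)
    onset_output[10] = 1               # binarized onset at frame 10
    offset_output = np.zeros(n)
    offset_output[24] = 1              # binarized offset at frame 24
    zeros = np.zeros(n)                # no onset/offset shifts for the demo
    # expected result: [[10, 24, 0.0, 0.0, 0.8]]
    return note_detection_with_onset_offset_regress(
        frame_output, onset_output, zeros, offset_output, zeros, np.full(n, 0.8), frame_threshold=0.5)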
class RegressionPostProcessor(object):
def __init__(self, frames_per_second, classes_num, onset_threshold,
offset_threshold, frame_threshold, pedal_offset_threshold,
begin_note):
"""Postprocess the output probabilities of a transription model to MIDI
events.
Args:
frames_per_second: float
classes_num: int
onset_threshold: float
offset_threshold: float
frame_threshold: float
pedal_offset_threshold: float
"""
self.frames_per_second = frames_per_second
self.classes_num = classes_num
self.onset_threshold = onset_threshold
self.offset_threshold = offset_threshold
self.frame_threshold = frame_threshold
self.pedal_offset_threshold = pedal_offset_threshold
self.begin_note = begin_note
self.velocity_scale = 128
def output_dict_to_midi_events(self, output_dict):
"""Main function. Post process model outputs to MIDI events.
Args:
output_dict: {
'reg_onset_output': (segment_frames, classes_num),
'reg_offset_output': (segment_frames, classes_num),
'frame_output': (segment_frames, classes_num),
'velocity_output': (segment_frames, classes_num),
'reg_pedal_onset_output': (segment_frames, 1),
'reg_pedal_offset_output': (segment_frames, 1),
'pedal_frame_output': (segment_frames, 1)}
Outputs:
est_note_events: list of dict, e.g. [
{'onset_time': 39.74, 'offset_time': 39.87, 'midi_note': 27, 'velocity': 83},
{'onset_time': 11.98, 'offset_time': 12.11, 'midi_note': 33, 'velocity': 88}]
est_pedal_events: list of dict, e.g. [
{'onset_time': 0.17, 'offset_time': 0.96},
            {'onset_time': 1.17, 'offset_time': 2.65}]
"""
output_dict['frame_output'] = output_dict['note']
output_dict['velocity_output'] = output_dict['note']
output_dict['reg_onset_output'] = output_dict['onset']
output_dict['reg_offset_output'] = output_dict['offset']
# Post process piano note outputs to piano note and pedal events information
(est_on_off_note_vels, est_pedal_on_offs) = \
self.output_dict_to_note_pedal_arrays(output_dict)
"""est_on_off_note_vels: (events_num, 4), the four columns are: [onset_time, offset_time, piano_note, velocity],
est_pedal_on_offs: (pedal_events_num, 2), the two columns are: [onset_time, offset_time]"""
# Reformat notes to MIDI events
est_note_events = self.detected_notes_to_events(est_on_off_note_vels)
if est_pedal_on_offs is None:
est_pedal_events = None
else:
est_pedal_events = self.detected_pedals_to_events(est_pedal_on_offs)
return est_note_events, est_pedal_events
def output_dict_to_note_pedal_arrays(self, output_dict):
"""Postprocess the output probabilities of a transription model to MIDI
events.
Args:
output_dict: dict, {
'reg_onset_output': (frames_num, classes_num),
'reg_offset_output': (frames_num, classes_num),
'frame_output': (frames_num, classes_num),
'velocity_output': (frames_num, classes_num),
...}
Returns:
est_on_off_note_vels: (events_num, 4), the 4 columns are onset_time,
offset_time, piano_note and velocity. E.g. [
[39.74, 39.87, 27, 0.65],
[11.98, 12.11, 33, 0.69],
...]
est_pedal_on_offs: (pedal_events_num, 2), the 2 columns are onset_time
and offset_time. E.g. [
[0.17, 0.96],
[1.17, 2.65],
...]
"""
# ------ 1. Process regression outputs to binarized outputs ------
# For example, onset or offset of [0., 0., 0.15, 0.30, 0.40, 0.35, 0.20, 0.05, 0., 0.]
# will be processed to [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]
# Calculate binarized onset output from regression output
(onset_output, onset_shift_output) = \
self.get_binarized_output_from_regression(
reg_output=output_dict['reg_onset_output'],
threshold=self.onset_threshold, neighbour=2)
output_dict['onset_output'] = onset_output # Values are 0 or 1
output_dict['onset_shift_output'] = onset_shift_output
# Calculate binarized offset output from regression output
(offset_output, offset_shift_output) = \
self.get_binarized_output_from_regression(
reg_output=output_dict['reg_offset_output'],
threshold=self.offset_threshold, neighbour=4)
output_dict['offset_output'] = offset_output # Values are 0 or 1
output_dict['offset_shift_output'] = offset_shift_output
if 'reg_pedal_onset_output' in output_dict.keys():
"""Pedal onsets are not used in inference. Instead, frame-wise pedal
predictions are used to detect onsets. We empirically found this is
more accurate to detect pedal onsets."""
pass
if 'reg_pedal_offset_output' in output_dict.keys():
# Calculate binarized pedal offset output from regression output
(pedal_offset_output, pedal_offset_shift_output) = \
self.get_binarized_output_from_regression(
reg_output=output_dict['reg_pedal_offset_output'],
threshold=self.pedal_offset_threshold, neighbour=4)
output_dict['pedal_offset_output'] = pedal_offset_output # Values are 0 or 1
output_dict['pedal_offset_shift_output'] = pedal_offset_shift_output
# ------ 2. Process matrices results to event results ------
# Detect piano notes from output_dict
est_on_off_note_vels = self.output_dict_to_detected_notes(output_dict)
est_pedal_on_offs = None
return est_on_off_note_vels, est_pedal_on_offs
def get_binarized_output_from_regression(self, reg_output, threshold, neighbour):
"""Calculate binarized output and shifts of onsets or offsets from the
regression results.
Args:
reg_output: (frames_num, classes_num)
threshold: float
neighbour: int
Returns:
binary_output: (frames_num, classes_num)
shift_output: (frames_num, classes_num)
"""
binary_output = np.zeros_like(reg_output)
shift_output = np.zeros_like(reg_output)
(frames_num, classes_num) = reg_output.shape
for k in range(classes_num):
x = reg_output[:, k]
for n in range(neighbour, frames_num - neighbour):
if x[n] > threshold and self.is_monotonic_neighbour(x, n, neighbour):
binary_output[n, k] = 1
"""See Section III-D in [1] for deduction.
[1] Q. Kong, et al., High-resolution Piano Transcription
with Pedals by Regressing Onsets and Offsets Times, 2020."""
if x[n - 1] > x[n + 1]:
shift = (x[n + 1] - x[n - 1]) / (x[n] - x[n + 1]) / 2
else:
shift = (x[n + 1] - x[n - 1]) / (x[n] - x[n - 1]) / 2
shift_output[n, k] = shift
return binary_output, shift_output
def is_monotonic_neighbour(self, x, n, neighbour):
"""Detect if values are monotonic in both side of x[n].
Args:
x: (frames_num,)
n: int
neighbour: int
Returns:
monotonic: bool
"""
monotonic = True
for i in range(neighbour):
if x[n - i] < x[n - i - 1]:
monotonic = False
if x[n + i] < x[n + i + 1]:
monotonic = False
return monotonic
def output_dict_to_detected_notes(self, output_dict):
"""Postprocess output_dict to piano notes.
Args:
output_dict: dict, e.g. {
'onset_output': (frames_num, classes_num),
'onset_shift_output': (frames_num, classes_num),
'offset_output': (frames_num, classes_num),
'offset_shift_output': (frames_num, classes_num),
'frame_output': (frames_num, classes_num),
            'velocity_output': (frames_num, classes_num),
...}
Returns:
est_on_off_note_vels: (notes, 4), the four columns are onsets, offsets,
MIDI notes and velocities. E.g.,
[[39.7375, 39.7500, 27., 0.6638],
[11.9824, 12.5000, 33., 0.6892],
...]
"""
est_tuples = []
est_midi_notes = []
classes_num = output_dict['frame_output'].shape[-1]
for piano_note in range(classes_num):
"""Detect piano notes"""
est_tuples_per_note = note_detection_with_onset_offset_regress(
frame_output=output_dict['frame_output'][:, piano_note],
onset_output=output_dict['onset_output'][:, piano_note],
onset_shift_output=output_dict['onset_shift_output'][:, piano_note],
offset_output=output_dict['offset_output'][:, piano_note],
offset_shift_output=output_dict['offset_shift_output'][:, piano_note],
velocity_output=output_dict['velocity_output'][:, piano_note],
frame_threshold=self.frame_threshold)
est_tuples += est_tuples_per_note
est_midi_notes += [piano_note + self.begin_note] * len(est_tuples_per_note)
est_tuples = np.array(est_tuples) # (notes, 5)
"""(notes, 5), the five columns are onset, offset, onset_shift,
offset_shift and normalized_velocity"""
est_midi_notes = np.array(est_midi_notes) # (notes,)
onset_times = (est_tuples[:, 0] + est_tuples[:, 2]) / self.frames_per_second
offset_times = (est_tuples[:, 1] + est_tuples[:, 3]) / self.frames_per_second
velocities = est_tuples[:, 4]
est_on_off_note_vels = np.stack((onset_times, offset_times, est_midi_notes, velocities), axis=-1)
"""(notes, 3), the three columns are onset_times, offset_times and velocity."""
est_on_off_note_vels = est_on_off_note_vels.astype(np.float32)
return est_on_off_note_vels
def detected_notes_to_events(self, est_on_off_note_vels):
"""Reformat detected notes to midi events.
Args:
            est_on_off_note_vels: (notes, 4), the four columns are onset_times,
                offset_times, midi_notes and velocities (see the example in
                output_dict_to_detected_notes above).
Returns:
midi_events, list, e.g.,
[{'onset_time': 39.7376, 'offset_time': 39.75, 'midi_note': 27, 'velocity': 84},
{'onset_time': 11.9824, 'offset_time': 12.50, 'midi_note': 33, 'velocity': 88},
...]
"""
midi_events = []
for i in range(est_on_off_note_vels.shape[0]):
midi_events.append({
'onset_time': est_on_off_note_vels[i][0],
'offset_time': est_on_off_note_vels[i][1],
'midi_note': int(est_on_off_note_vels[i][2]),
'velocity': int(est_on_off_note_vels[i][3] * self.velocity_scale)})
return midi_events
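# Hedged usage sketch (never called here) for the TikTok-style RegressionPostProcessor:
# a single synthetic note in a 3-class roll is turned into MIDI event dictionaries.
# The thresholds, frame rate and roll shapes are demo assumptions, not model defaults.
def _demo_regression_postprocessor() -> list:
    n_frames, n_classes = 60, 3
    note = np.zeros((n_frames, n_classes))
    onset = np.zeros((n_frames, n_classes))
    offset = np.zeros((n_frames, n_classes))
    note[10:25, 1] = 0.9                                    # active frames for one note
    onset[8:13, 1] = np.array([0.1, 0.4, 0.9, 0.4, 0.1])    # regression-style onset bump peaking at frame 10
    postprocessor = RegressionPostProcessor(frames_per_second=100, classes_num=n_classes,
                                            onset_threshold=0.2, offset_threshold=0.2,
                                            frame_threshold=0.3, pedal_offset_threshold=0.5,
                                            begin_note=60)
    note_events, _ = postprocessor.output_dict_to_midi_events(
        {'note': note, 'onset': onset, 'offset': offset})
    return note_events  # e.g. one event around 0.10 s - 0.25 s with midi_note 61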
def sync_visualize_step1(cost_matrices: List,
num_rows: int,
num_cols: int,
anchors: np.ndarray,
wp: np.ndarray) -> Tuple[plt.Figure, plt.Axes]:
fig, ax = plt.subplots(1, 1, dpi=72)
ax = __visualize_cost_matrices(ax, cost_matrices)
__visualize_constraint_rectangles(anchors[[1, 0], :],
edgecolor='firebrick')
__visualize_path_in_matrix(ax=ax,
wp=wp,
axisX=np.arange(0, num_rows),
axisY=np.arange(0, num_cols),
path_color='firebrick')
return fig, ax
def sync_visualize_step2(ax: plt.Axes,
cost_matrices: list,
wp_step2: np.ndarray,
wp_step1: np.ndarray,
num_rows_step1: int,
num_cols_step1: int,
anchors_step1: np.ndarray,
neighboring_anchors: np.ndarray,
plot_title: str = ""):
offset_x = neighboring_anchors[0, 0] - 1
offset_y = neighboring_anchors[1, 0] - 1
ax = __visualize_cost_matrices(ax=ax,
cost_matrices=cost_matrices,
offset_x=offset_x,
offset_y=offset_y)
__visualize_constraint_rectangles(anchors_step1[[1, 0], :],
edgecolor='firebrick')
__visualize_path_in_matrix(ax=ax,
wp=wp_step1,
axisX=np.arange(0, num_rows_step1),
axisY=np.arange(0, num_cols_step1),
path_color='firebrick')
__visualize_constraint_rectangles(neighboring_anchors[[1, 0], :] - 1,
edgecolor='orangered',
linestyle='--')
__visualize_path_in_matrix(ax=ax,
wp=wp_step2,
axisX=np.arange(0, num_rows_step1),
axisY=np.arange(0, num_cols_step1),
path_color='orangered')
ax.set_title(plot_title)
ax.set_ylabel("Version 1 (frames)")
ax.set_xlabel("Version 2 (frames)")
ax = plt.gca() # get the current axes
pcm = None
for pcm in ax.get_children():
if isinstance(pcm, ScalarMappable):
break
plt.colorbar(pcm, ax=ax)
plt.tight_layout()
plt.show()
def __size_dtw_matrices(dtw_matrices: List) -> Tuple[List[np.ndarray], List[np.ndarray]]:
"""Gives information about the dimensionality of a DTW matrix
given in form of a list matrix
Parameters
----------
dtw_matrices: list
The DTW matrix (cost matrix or accumulated cost matrix) given in form a list.
Returns
-------
    axis_x_list: list
A list containing a horizontal axis for each of the sub matrices
which specifies the horizontal position of the respective submatrix
in the overall cost matrix.
axis_y_list: list
A list containing a vertical axis for each of the
sub matrices which specifies the vertical position of the
respective submatrix in the overall cost matrix.
"""
num_matrices = len(dtw_matrices)
size_list = [dtw_mat.shape for dtw_mat in dtw_matrices]
axis_x_list = list()
axis_y_list = list()
x_acc = 0
y_acc = 0
for i in range(num_matrices):
curr_size_list = size_list[i]
axis_x_list.append(np.arange(x_acc, x_acc + curr_size_list[0]))
axis_y_list.append(np.arange(y_acc, y_acc + curr_size_list[1]))
x_acc += curr_size_list[0] - 1
y_acc += curr_size_list[1] - 1
return axis_x_list, axis_y_list
def __visualize_cost_matrices(ax: plt.Axes,
cost_matrices: list = None,
offset_x: float = 0.0,
offset_y: float = 0.0) -> plt.Axes:
"""Visualizes cost matrices
Parameters
----------
ax : axes
The Axes instance to plot on
cost_matrices : list
List of DTW cost matrices.
offset_x : float
Offset on the x axis.
offset_y : float
Offset on the y axis.
Returns
-------
ax: axes
The Axes instance to plot on
"""
x_ax, y_ax = __size_dtw_matrices(dtw_matrices=cost_matrices)
    for i, cur_cost in enumerate(cost_matrices):
        curr_x_ax = x_ax[i] + offset_x
        curr_y_ax = y_ax[i] + offset_y
ax.imshow(cur_cost, cmap='gray_r', aspect='auto', origin='lower',
extent=[curr_y_ax[0], curr_y_ax[-1], curr_x_ax[0], curr_x_ax[-1]])
return ax
def __visualize_path_in_matrix(ax,
wp: np.ndarray = None,
axisX: np.ndarray = None,
axisY: np.ndarray = None,
path_color: str = 'r'):
"""Plots a warping path on top of a given matrix. The matrix is
usually an accumulated cost matrix.
Parameters
----------
ax : axes
The Axes instance to plot on
wp : np.ndarray
Warping path
axisX : np.ndarray
Array of X axis
axisY : np.ndarray
Array of Y axis
path_color : str
Color of the warping path to be plotted. (default: r)
"""
assert axisX is not None and isinstance(axisX, np.ndarray), 'axisX must be a numpy array!'
assert axisY is not None and isinstance(axisY, np.ndarray), 'axisY must be a numpy array!'
wp = wp.astype(int)
ax.plot(axisY[wp[1, :]], axisX[wp[0, :]], '-k', linewidth=5)
ax.plot(axisY[wp[1, :]], axisX[wp[0, :]], color=path_color, linewidth=3)
def __visualize_constraint_rectangles(anchors: np.ndarray,
linestyle: str = '-',
edgecolor: str = 'royalblue',
linewidth: float = 1.0):
for k in range(anchors.shape[1]-1):
a1 = anchors[:, k]
a2 = anchors[:, k + 1]
# a rectangle is defined by [x y width height]
x = a1[0]
y = a1[1]
w = a2[0] - a1[0] + np.finfo(float).eps
h = a2[1] - a1[1] + np.finfo(float).eps
rect = Rectangle((x, y), w, h,
linewidth=linewidth,
edgecolor=edgecolor,
linestyle=linestyle,
facecolor='none')
plt.gca().add_patch(rect)
def project_alignment_on_a_new_feature_rate(alignment: np.ndarray,
feature_rate_old: int,
feature_rate_new: int,
cost_matrix_size_old: tuple = (),
cost_matrix_size_new: tuple = ()) -> np.ndarray:
"""Projects an alignment computed for a cost matrix on a certain
feature resolution on a cost matrix having a different feature
resolution.
Parameters
----------
alignment : np.ndarray [shape=(2, N)]
Alignment matrix
feature_rate_old : int
Feature rate of the old cost matrix
feature_rate_new : int
Feature rate of the new cost matrix
cost_matrix_size_old : tuple
Size of the old cost matrix. Possibly needed to deal with border cases
cost_matrix_size_new : tuple
Size of the new cost matrix. Possibly needed to deal with border cases
Returns
-------
np.ndarray [shape=(2, N)]
Anchor sequence for the new cost matrix
"""
# Project the alignment on the new feature rate
fac = feature_rate_new / feature_rate_old
anchors = np.round(alignment * fac) + 1
# In case the sizes of the cost matrices are given explicitly and the
# alignment specifies to align the first and last elements, handle this case
# separately since this might cause problems in the general projection
# procedure.
if cost_matrix_size_old is not None and cost_matrix_size_new is not None:
if np.array_equal(alignment[:, 0], np.array([0, 0])):
anchors[:, 0] = np.array([1, 1])
if np.array_equal(alignment[:, -1], np.array(cost_matrix_size_old) - 1):
anchors[:, -1] = np.array(cost_matrix_size_new)
return anchors - 1
def derive_anchors_from_projected_alignment(projected_alignment: np.ndarray,
threshold: int) -> np.ndarray:
"""Derive anchors from a projected alignment such that the area of the rectangle
defined by two subsequent anchors a1 and a2 is below a given threshold.
Parameters
----------
projected_alignment : np.ndarray [shape=(2, N)]
Projected alignment array
threshold : int
Maximum area of the constraint rectangle
Returns
-------
anchors_res : np.ndarray [shape=(2, M)]
Resulting anchor sequence
"""
L = projected_alignment.shape[1]
a1 = np.array(projected_alignment[:, 0], copy=True).reshape(-1, 1)
a2 = np.array(projected_alignment[:, -1], copy=True).reshape(-1, 1)
if __compute_area(a1, a2) <= threshold:
anchors_res = np.concatenate([a1, a2], axis=1)
elif L > 2:
center = int(np.floor(L/2 + 1))
a1 = np.array(projected_alignment[:, 0], copy=True).reshape(-1, 1)
a2 = np.array(projected_alignment[:, center - 1], copy=True).reshape(-1, 1)
a3 = np.array(projected_alignment[:, -1], copy=True).reshape(-1, 1)
if __compute_area(a1, a2) > threshold:
anchors_1 = derive_anchors_from_projected_alignment(projected_alignment[:, 0:center], threshold)
else:
anchors_1 = np.concatenate([a1, a2], axis=1)
if __compute_area(a2, a3) > threshold:
anchors_2 = derive_anchors_from_projected_alignment(projected_alignment[:, center - 1:], threshold)
else:
anchors_2 = np.concatenate([a2, a3], axis=1)
anchors_res = np.concatenate([anchors_1, anchors_2[:, 1:]], axis=1)
else:
if __compute_area(a1, a2) > threshold:
            print('Only two anchor points are given, and they do not fulfill the area constraint.')
anchors_res = np.concatenate([a1, a2], axis=1)
return anchors_res
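# Small runnable sketch (never called here): project a coarse alignment onto a finer feature
# rate, then derive anchors whose constraint rectangles stay below the area threshold.
# The alignment values, feature rates and threshold are assumptions made for the demo.
def _demo_anchor_derivation() -> np.ndarray:
    coarse_alignment = np.array([[0, 10, 20, 30],
                                 [0, 12, 25, 33]])      # shape (2, N), coarse-level warping path
    projected = project_alignment_on_a_new_feature_rate(alignment=coarse_alignment,
                                                        feature_rate_old=10, feature_rate_new=50,
                                                        cost_matrix_size_old=(31, 34),
                                                        cost_matrix_size_new=(151, 166))
    # anchors whose pairwise constraint rectangles each stay below the area threshold
    return derive_anchors_from_projected_alignment(projected, threshold=4000)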
def derive_neighboring_anchors(warping_path: np.ndarray,
anchor_indices: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
"""Compute anchor points in the neighborhood of previous anchor points.
Parameters
----------
warping_path : np.ndarray [shape=(2, N)]
Warping path
anchor_indices : np.ndarray
Indices corresponding to the anchor points in the ``warping_path``
Returns
-------
neighboring_anchors : np.ndarray [shape=(2, N-1)]
Sequence of neighboring anchors
neighboring_anchor_indices : np.ndarray
Indices into ``warping path`` corresponding to ``neighboring_anchors``
"""
L = anchor_indices.shape[0]
neighboring_anchor_indices = np.zeros(L-1, dtype=int)
neighboring_anchors = np.zeros((2, L-1), dtype=int)
for k in range(1, L):
i1 = anchor_indices[k-1]
i2 = anchor_indices[k]
neighboring_anchor_indices[k-1] = i1 + np.floor((i2 - i1) / 2)
neighboring_anchors[:, k-1] = warping_path[:, neighboring_anchor_indices[k - 1]]
return neighboring_anchors, neighboring_anchor_indices
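# Tiny runnable sketch (never called here): midpoints between consecutive anchors on a toy
# warping path, as used for the refinement pass in sync_via_mrmsdtw. Values are demo assumptions.
def _demo_neighboring_anchors():
    wp = np.array([np.arange(11), np.arange(11)])           # a diagonal toy warping path, shape (2, 11)
    anchor_indices = np.array([0, 4, 10])                    # indices of existing anchors on the path
    return derive_neighboring_anchors(wp, anchor_indices)    # new anchors at path indices 2 and 7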
@jit(nopython=True)
def __compute_area(a: tuple,
b: tuple):
"""Computes the area between two points, given as tuples"""
return (b[0] - a[0] + 1) * (b[1] - a[1] + 1)
class Transcriber(PitchEstimator):
def __init__(self, labeling, instrument='Violin', sr=16000, window_size=1024, hop_length=160):
super().__init__(labeling, instrument=instrument, sr=sr, window_size=window_size, hop_length=hop_length)
def transcribe(self, audio, batch_size=128, postprocessing='spotify', include_pitch_bends=True, to_midi=True,
debug=False):
"""
Transcribe an audio file or mono waveform in numpy or torch into MIDI with pitch bends.
:param audio: str, pathlib.Path, np.ndarray, or torch.Tensor
:param batch_size: frames to process at once
        :param postprocessing: note creation method: 'spotify' (default), 'rebab', or 'tiktok'
:param include_pitch_bends: whether to include pitch bends in the MIDI file
:param to_midi: whether to return a MIDI file or a list of note events (as tuple)
:return: transcribed MIDI file as a pretty_midi.PrettyMIDI object
"""
out = self.predict(audio, batch_size)
if debug:
plt.imshow(out['f0'].T, aspect='auto', origin='lower')
plt.show()
plt.imshow(out['note'].T, aspect='auto', origin='lower')
plt.show()
plt.imshow(out['onset'].T, aspect='auto', origin='lower')
plt.show()
plt.imshow(out['offset'].T, aspect='auto', origin='lower')
plt.show()
if to_midi:
return self.out2midi(out, postprocessing, include_pitch_bends)
else:
return self.out2note(out, postprocessing, include_pitch_bends)
def out2note(self, output: Dict[str, np.array], postprocessing='spotify',
include_pitch_bends: bool = True,
) -> List[Tuple[float, float, int, float, Optional[List[int]]]]:
"""Convert model output to notes
"""
if postprocessing == 'spotify':
estimated_notes = spotify_create_notes(
output["note"],
output["onset"],
note_low=self.labeling.midi_centers[0],
note_high=self.labeling.midi_centers[-1],
onset_thresh=0.5,
frame_thresh=0.3,
infer_onsets=True,
min_note_len=int(np.round(127.70 / 1000 * (self.sr / self.hop_length))), #127.70
melodia_trick=True,
)
        elif postprocessing == 'rebab':
estimated_notes = spotify_create_notes(
output["note"],
output["onset"],
note_low=self.labeling.midi_centers[0],
note_high=self.labeling.midi_centers[-1],
onset_thresh=0.2,
frame_thresh=0.2,
infer_onsets=True,
min_note_len=int(np.round(127.70 / 1000 * (self.sr / self.hop_length))), #127.70
melodia_trick=True,
)
elif postprocessing == 'tiktok':
postprocessor = RegressionPostProcessor(
frames_per_second=self.sr / self.hop_length,
classes_num=self.labeling.midi_centers.shape[0],
begin_note=self.labeling.midi_centers[0],
onset_threshold=0.2,
offset_threshold=0.2,
frame_threshold=0.3,
pedal_offset_threshold=0.5,
)
tiktok_note_dict, _ = postprocessor.output_dict_to_midi_events(output)
estimated_notes = []
for list_item in tiktok_note_dict:
if list_item['offset_time'] > 0.6 + list_item['onset_time']:
estimated_notes.append((int(np.floor(list_item['onset_time']/(output['time'][1]))),
int(np.ceil(list_item['offset_time']/(output['time'][1]))),
list_item['midi_note'], list_item['velocity']/128))
if include_pitch_bends:
estimated_notes_with_pitch_bend = self.get_pitch_bends(output["f0"], estimated_notes)
else:
estimated_notes_with_pitch_bend = [(note[0], note[1], note[2], note[3], None) for note in estimated_notes]
times_s = output['time']
estimated_notes_time_seconds = [
(times_s[note[0]], times_s[note[1]], note[2], note[3], note[4]) for note in estimated_notes_with_pitch_bend
]
return estimated_notes_time_seconds
def out2midi(self, output: Dict[str, np.array], postprocessing: str = 'spotify', include_pitch_bends: bool = True,
) -> PrettyMIDI:
"""Convert model output to MIDI
Args:
output: A dictionary with shape
{
'frame': array of shape (n_times, n_freqs),
'onset': array of shape (n_times, n_freqs),
'contour': array of shape (n_times, 3*n_freqs)
}
representing the output of the basic pitch model.
postprocessing: spotify or tiktok postprocessing.
include_pitch_bends: If True, include pitch bends.
Returns:
note_events: A list of note event tuples (start_time_s, end_time_s, pitch_midi, amplitude)
"""
estimated_notes_time_seconds = self.out2note(output, postprocessing, include_pitch_bends)
midi_tempo = 120 # todo: infer tempo from the onsets
return self.note2midi(estimated_notes_time_seconds, midi_tempo)
def note2midi(
self, note_events_with_pitch_bends: List[Tuple[float, float, int, float, Optional[List[int]]]],
midi_tempo: float = 120,
):
"""Create a pretty_midi object from note events
:param note_events_with_pitch_bends: list of tuples
[(start_time_seconds, end_time_seconds, pitch_midi, amplitude)]
:param midi_tempo: #todo: infer tempo from the onsets
:return: transcribed MIDI file as a pretty_midi.PrettyMIDI object
"""
mid = PrettyMIDI(initial_tempo=midi_tempo)
program = instrument_name_to_program(self.instrument)
instruments: DefaultDict[int, Instrument] = defaultdict(
lambda: Instrument(program=program)
)
for start_time, end_time, note_number, amplitude, pitch_bend in note_events_with_pitch_bends:
instrument = instruments[note_number]
note = Note(
velocity=int(np.round(127 * amplitude)),
pitch=note_number,
start=start_time,
end=end_time,
)
instrument.notes.append(note)
if not isinstance(pitch_bend, np.ndarray):
continue
pitch_bend_times = np.linspace(start_time, end_time, len(pitch_bend))
for pb_time, pb_midi in zip(pitch_bend_times, pitch_bend):
instrument.pitch_bends.append(PitchBend(pb_midi, pb_time))
mid.instruments.extend(instruments.values())
return mid
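# Hedged usage sketch (never called here): turning note events into a PrettyMIDI object via
# Transcriber.note2midi. It assumes the PerformanceLabel class defined later in this module
# provides the default violin labeling; no network weights are needed for this code path.
def _demo_note2midi() -> PrettyMIDI:
    transcriber = Transcriber(PerformanceLabel(), instrument='Violin')
    # (start_s, end_s, midi_pitch, amplitude, pitch_bends or None)
    note_events = [(0.0, 0.5, 69, 0.8, None), (0.5, 1.0, 71, 0.7, None)]
    return transcriber.note2midi(note_events, midi_tempo=120)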
def sync_via_mrmsdtw_with_anchors(f_chroma1: np.ndarray,
f_chroma2: np.ndarray,
f_onset1: np.ndarray = None,
f_onset2: np.ndarray = None,
input_feature_rate: float = 50,
step_sizes: np.ndarray = np.array([[1, 0], [0, 1], [1, 1]], np.int32),
step_weights: np.ndarray = np.array([1.0, 1.0, 1.0], np.float64),
threshold_rec: int = 10000,
win_len_smooth: np.ndarray = np.array([201, 101, 21, 1]),
downsamp_smooth: np.ndarray = np.array([50, 25, 5, 1]),
verbose: bool = False,
dtw_implementation: str = 'synctoolbox',
normalize_chroma: bool = True,
chroma_norm_ord: int = 2,
chroma_norm_threshold: float = 0.001,
visualization_title: str = "MrMsDTW result",
anchor_pairs: List[Tuple] = None,
linear_inp_idx: List[int] = [],
alpha=0.5) -> np.ndarray:
"""Compute memory-restricted multi-scale DTW (MrMsDTW) using chroma and (optionally) onset features.
MrMsDTW is performed on multiple levels that get progressively finer, with rectangular constraint
regions defined by the alignment found on the previous, coarser level.
If onset features are provided, these are used on the finest level in addition to chroma
to provide higher synchronization accuracy.
Parameters
----------
f_chroma1 : np.ndarray [shape=(12, N)]
Chroma feature matrix of the first sequence
f_chroma2 : np.ndarray [shape=(12, M)]
Chroma feature matrix of the second sequence
f_onset1 : np.ndarray [shape=(L, N)]
Onset feature matrix of the first sequence (optional, default: None)
f_onset2 : np.ndarray [shape=(L, M)]
Onset feature matrix of the second sequence (optional, default: None)
input_feature_rate: int
Input feature rate of the chroma features (default: 50)
step_sizes: np.ndarray
DTW step sizes (default: np.array([[1, 0], [0, 1], [1, 1]]))
step_weights: np.ndarray
DTW step weights (np.array([1.0, 1.0, 1.0]))
threshold_rec: int
Defines the maximum area that is spanned by the rectangle of two
consecutive elements in the alignment (default: 10000)
win_len_smooth : np.ndarray
Window lengths for chroma feature smoothing (default: np.array([201, 101, 21, 1]))
downsamp_smooth : np.ndarray
Downsampling factors (default: np.array([50, 25, 5, 1]))
verbose : bool
Set `True` for visualization (default: False)
dtw_implementation : str
DTW implementation, librosa or synctoolbox (default: synctoolbox)
normalize_chroma : bool
Set `True` to normalize input chroma features after each downsampling
and smoothing operation.
chroma_norm_ord: int
Order of chroma normalization, relevant if ``normalize_chroma`` is True.
(default: 2)
chroma_norm_threshold: float
If the norm falls below threshold for a feature vector, then the
normalized feature vector is set to be the unit vector. Relevant, if
``normalize_chroma`` is True (default: 0.001)
visualization_title : str
Title for the visualization plots. Only relevant if 'verbose' is True
(default: "MrMsDTW result")
anchor_pairs: List[Tuple]
Anchor pairs given in seconds. Note that
* (0, 0) and (<audio-len1>, <audio-len2>) are not allowed.
            * Anchors must be monotonically increasing.
linear_inp_idx: List[int]
List of the indices of intervals created by anchor pairs, for which
MrMsDTW shouldn't be run, e.g., if the interval only involves silence.
0 ap1 ap2 ap3
| | | |
| idx0 | idx1 | idx2 | idx3 OR idx-1
| | | |
Note that index -1 corresponds to the last interval, which begins with
the last anchor pair until the end of the audio files.
alpha: float
Coefficient for the Chroma cost matrix in the finest scale of the MrMsDTW algorithm.
C = alpha * C_Chroma + (1 - alpha) * C_act (default: 0.5)
Returns
-------
wp : np.ndarray [shape=(2, T)]
Resulting warping path which indicates synchronized indices.
"""
if anchor_pairs is None:
wp = sync_via_mrmsdtw(f_chroma1=f_chroma1,
f_chroma2=f_chroma2,
f_onset1=f_onset1,
f_onset2=f_onset2,
input_feature_rate=input_feature_rate,
step_sizes=step_sizes,
step_weights=step_weights,
threshold_rec=threshold_rec,
win_len_smooth=win_len_smooth,
downsamp_smooth=downsamp_smooth,
verbose=verbose,
dtw_implementation=dtw_implementation,
normalize_chroma=normalize_chroma,
chroma_norm_ord=chroma_norm_ord,
chroma_norm_threshold=chroma_norm_threshold,
visualization_title=visualization_title,
alpha=alpha)
else:
# constant_intervals = [((0, x1), (0, y1), False),
# ((x1, x2), (y1, y2), True),
# ((x2, -1), (y2, -1), False)]
wp = None
if verbose:
print('Anchor points are given!')
__check_anchor_pairs(anchor_pairs, f_chroma1.shape[1], f_chroma2.shape[1], input_feature_rate)
# Add ending as the anchor point
anchor_pairs.append((-1, -1))
prev_a1 = 0
prev_a2 = 0
for idx, anchor_pair in enumerate(anchor_pairs):
cur_a1, cur_a2 = anchor_pair
# Split the features
f_chroma1_split, f_onset1_split, f_chroma2_split, f_onset2_split = __split_features(f_chroma1,
f_onset1,
f_chroma2,
f_onset2,
cur_a1,
cur_a2,
prev_a1,
prev_a2,
input_feature_rate)
if idx in linear_inp_idx or idx == len(anchor_pairs) - 1 and -1 in linear_inp_idx:
                # Generate a diagonal warping path if the algorithm is not supposed to be executed.
# A typical scenario is the silence breaks which are enclosed by two anchor points.
if verbose:
print('A diagonal warping path is generated for the interval \n\t Feature sequence 1: %.2f - %.2f'
'\n\t Feature sequence 2: %.2f - %.2f\n' % (prev_a1, cur_a1, prev_a2, cur_a2))
wp_cur = __diagonal_warping_path(f_chroma1_split, f_chroma2_split)
else:
if verbose:
if cur_a1 != -1 and cur_a2 != -1:
print('MrMsDTW is applied for the interval \n\t Feature sequence 1: %.2f - %.2f'
'\n\t Feature sequence 2: %.2f - %.2f\n' % (prev_a1, cur_a1, prev_a2, cur_a2))
else:
print('MrMsDTW is applied for the interval \n\t Feature sequence 1: %.2f - end'
'\n\t Feature sequence 2: %.2f - end\n' % (prev_a1, prev_a2))
wp_cur = sync_via_mrmsdtw(f_chroma1=f_chroma1_split,
f_chroma2=f_chroma2_split,
f_onset1=f_onset1_split,
f_onset2=f_onset2_split,
input_feature_rate=input_feature_rate,
step_sizes=step_sizes,
step_weights=step_weights,
threshold_rec=threshold_rec,
win_len_smooth=win_len_smooth,
downsamp_smooth=downsamp_smooth,
verbose=verbose,
dtw_implementation=dtw_implementation,
normalize_chroma=normalize_chroma,
chroma_norm_ord=chroma_norm_ord,
chroma_norm_threshold=chroma_norm_threshold,
alpha=alpha)
if wp is None:
wp = np.array(wp_cur, copy=True)
# Concatenate warping paths
else:
wp = np.concatenate([wp, wp_cur + wp[:, -1].reshape(2, 1) + 1], axis=1)
prev_a1 = cur_a1
prev_a2 = cur_a2
anchor_pairs.pop()
return wp
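# Hedged usage sketch (never called in this module): aligning two chroma sequences with one
# anchor pair given in seconds. It assumes the feature/DTW helpers called inside sync_via_mrmsdtw
# (smoothing, cost-matrix and path construction) are available, e.g. from synctoolbox or later in
# this module. The 50 Hz feature rate and the (2.0 s, 2.5 s) anchor are demo assumptions and
# require both sequences to be at least that long.
def _demo_sync_with_anchor_pairs(f_chroma1: np.ndarray, f_chroma2: np.ndarray) -> np.ndarray:
    return sync_via_mrmsdtw_with_anchors(f_chroma1=f_chroma1, f_chroma2=f_chroma2,
                                         input_feature_rate=50,
                                         anchor_pairs=[(2.0, 2.5)],
                                         linear_inp_idx=[],  # run MrMsDTW on every interval
                                         alpha=0.5)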
def sync_via_mrmsdtw(f_chroma1: np.ndarray,
f_chroma2: np.ndarray,
f_onset1: np.ndarray = None,
f_onset2: np.ndarray = None,
input_feature_rate: float = 50,
step_sizes: np.ndarray = np.array([[1, 0], [0, 1], [1, 1]], np.int32),
step_weights: np.ndarray = np.array([1.0, 1.0, 1.0], np.float64),
threshold_rec: int = 10000,
win_len_smooth: np.ndarray = np.array([201, 101, 21, 1]),
downsamp_smooth: np.ndarray = np.array([50, 25, 5, 1]),
verbose: bool = False,
dtw_implementation: str = 'synctoolbox',
normalize_chroma: bool = True,
chroma_norm_ord: int = 2,
chroma_norm_threshold: float = 0.001,
visualization_title: str = "MrMsDTW result",
alpha=0.5) -> np.ndarray:
"""Compute memory-restricted multi-scale DTW (MrMsDTW) using chroma and (optionally) onset features.
MrMsDTW is performed on multiple levels that get progressively finer, with rectangular constraint
regions defined by the alignment found on the previous, coarser level.
If onset features are provided, these are used on the finest level in addition to chroma
to provide higher synchronization accuracy.
Parameters
----------
f_chroma1 : np.ndarray [shape=(12, N)]
Chroma feature matrix of the first sequence
f_chroma2 : np.ndarray [shape=(12, M)]
Chroma feature matrix of the second sequence
f_onset1 : np.ndarray [shape=(L, N)]
Onset feature matrix of the first sequence (optional, default: None)
f_onset2 : np.ndarray [shape=(L, M)]
Onset feature matrix of the second sequence (optional, default: None)
input_feature_rate: int
Input feature rate of the chroma features (default: 50)
step_sizes: np.ndarray
DTW step sizes (default: np.array([[1, 0], [0, 1], [1, 1]]))
step_weights: np.ndarray
DTW step weights (np.array([1.0, 1.0, 1.0]))
threshold_rec: int
Defines the maximum area that is spanned by the rectangle of two
consecutive elements in the alignment (default: 10000)
win_len_smooth : np.ndarray
Window lengths for chroma feature smoothing (default: np.array([201, 101, 21, 1]))
downsamp_smooth : np.ndarray
Downsampling factors (default: np.array([50, 25, 5, 1]))
verbose : bool
Set `True` for visualization (default: False)
dtw_implementation : str
DTW implementation, librosa or synctoolbox (default: synctoolbox)
normalize_chroma : bool
Set `True` to normalize input chroma features after each downsampling
and smoothing operation.
chroma_norm_ord: int
Order of chroma normalization, relevant if ``normalize_chroma`` is True.
(default: 2)
chroma_norm_threshold: float
If the norm falls below threshold for a feature vector, then the
normalized feature vector is set to be the unit vector. Relevant, if
``normalize_chroma`` is True (default: 0.001)
visualization_title : str
Title for the visualization plots. Only relevant if 'verbose' is True
(default: "MrMsDTW result")
alpha: float
Coefficient for the Chroma cost matrix in the finest scale of the MrMsDTW algorithm.
C = alpha * C_Chroma + (1 - alpha) * C_act (default: 0.5)
Returns
-------
alignment: np.ndarray [shape=(2, T)]
Resulting warping path which indicates synchronized indices.
"""
# If onset features are given as input, high resolution MrMsDTW is activated.
high_res = False
if f_onset1 is not None and f_onset2 is not None:
high_res = True
if high_res and (f_chroma1.shape[1] != f_onset1.shape[1] or f_chroma2.shape[1] != f_onset2.shape[1]):
raise ValueError('Chroma and onset features must be of the same length.')
if downsamp_smooth[-1] != 1 or win_len_smooth[-1] != 1:
        raise ValueError('The downsampling factor of the last iteration must be equal to 1, i.e. '
                         'at the last iteration, it is computed at the input feature rate!')
num_iterations = win_len_smooth.shape[0]
cost_matrix_size_old = tuple()
feature_rate_old = input_feature_rate / downsamp_smooth[0]
alignment = None
total_computation_time = 0.0
# If the area is less than the threshold_rec, don't apply the multiscale DTW.
    it = (num_iterations - 1) if __compute_cost_matrix_area(f_chroma1, f_chroma2) < threshold_rec else 0
while it < num_iterations:
tic1 = perf_counter()
# Smooth and downsample given raw features
f_chroma1_cur, _ = smooth_downsample_feature(f_chroma1,
input_feature_rate=input_feature_rate,
win_len_smooth=win_len_smooth[it],
downsamp_smooth=downsamp_smooth[it])
f_chroma2_cur, feature_rate_new = smooth_downsample_feature(f_chroma2,
input_feature_rate=input_feature_rate,
win_len_smooth=win_len_smooth[it],
downsamp_smooth=downsamp_smooth[it])
if normalize_chroma:
f_chroma1_cur = normalize_feature(f_chroma1_cur,
norm_ord=chroma_norm_ord,
threshold=chroma_norm_threshold)
f_chroma2_cur = normalize_feature(f_chroma2_cur,
norm_ord=chroma_norm_ord,
threshold=chroma_norm_threshold)
# Project path onto new resolution
cost_matrix_size_new = (f_chroma1_cur.shape[1], f_chroma2_cur.shape[1])
if alignment is None:
# Initialize the alignment with the start and end frames of the feature sequence
anchors = np.array([[0, f_chroma1_cur.shape[1] - 1], [0, f_chroma2_cur.shape[1] - 1]])
else:
projected_alignment = project_alignment_on_a_new_feature_rate(alignment=alignment,
feature_rate_old=feature_rate_old,
feature_rate_new=feature_rate_new,
cost_matrix_size_old=cost_matrix_size_old,
cost_matrix_size_new=cost_matrix_size_new)
anchors = derive_anchors_from_projected_alignment(projected_alignment=projected_alignment,
threshold=threshold_rec)
# Cost matrix and warping path computation
if high_res and it == num_iterations - 1:
# Compute cost considering chroma and pitch onset features and alignment only in the last iteration,
# where the features are at the finest level.
cost_matrices_step1 = compute_cost_matrices_between_anchors(f_chroma1=f_chroma1_cur,
f_chroma2=f_chroma2_cur,
f_onset1=f_onset1,
f_onset2=f_onset2,
anchors=anchors,
alpha=alpha)
else:
cost_matrices_step1 = compute_cost_matrices_between_anchors(f_chroma1=f_chroma1_cur,
f_chroma2=f_chroma2_cur,
anchors=anchors,
alpha=alpha)
wp_list = compute_warping_paths_from_cost_matrices(cost_matrices_step1,
step_sizes=step_sizes,
step_weights=step_weights,
implementation=dtw_implementation)
# Concatenate warping paths
wp = build_path_from_warping_paths(warping_paths=wp_list,
anchors=anchors)
anchors_step1 = None
wp_step1 = None
num_rows_step1 = 0
num_cols_step1 = 0
ax = None
toc1 = perf_counter()
if verbose and cost_matrices_step1 is not None:
anchors_step1 = np.array(anchors, copy=True)
wp_step1 = np.array(wp, copy=True)
num_rows_step1, num_cols_step1 = np.sum(np.array([dtw_mat.shape for dtw_mat in cost_matrices_step1], int),
axis=0)
fig, ax = sync_visualize_step1(cost_matrices_step1,
num_rows_step1,
num_cols_step1,
anchors,
wp)
tic2 = perf_counter()
# Compute neighboring anchors and refine alignment using local path between neighboring anchors
anchor_indices_in_warping_path = find_anchor_indices_in_warping_path(wp, anchors=anchors)
# Compute neighboring anchors for refinement
neighboring_anchors, neighboring_anchor_indices = \
derive_neighboring_anchors(wp, anchor_indices=anchor_indices_in_warping_path)
if neighboring_anchor_indices.shape[0] > 1 \
and it == num_iterations - 1 and high_res:
cost_matrices_step2 = compute_cost_matrices_between_anchors(f_chroma1=f_chroma1_cur,
f_chroma2=f_chroma2_cur,
f_onset1=f_onset1,
f_onset2=f_onset2,
anchors=neighboring_anchors,
alpha=alpha)
else:
cost_matrices_step2 = compute_cost_matrices_between_anchors(f_chroma1=f_chroma1_cur,
f_chroma2=f_chroma2_cur,
anchors=neighboring_anchors,
alpha=alpha)
wp_list_refine = compute_warping_paths_from_cost_matrices(cost_matrices=cost_matrices_step2,
step_sizes=step_sizes,
step_weights=step_weights,
implementation=dtw_implementation)
wp = __refine_wp(wp, anchors, wp_list_refine, neighboring_anchors, neighboring_anchor_indices)
toc2 = perf_counter()
computation_time_it = toc2 - tic2 + toc1 - tic1
total_computation_time += computation_time_it
alignment = wp
feature_rate_old = feature_rate_new
cost_matrix_size_old = cost_matrix_size_new
if verbose and cost_matrices_step2 is not None:
sync_visualize_step2(ax,
cost_matrices_step2,
wp,
wp_step1,
num_rows_step1,
num_cols_step1,
anchors_step1,
neighboring_anchors,
plot_title=f"{visualization_title} - Level {it + 1}")
print('Level {} computation time: {:.2f} seconds'.format(it, computation_time_it))
it += 1
if verbose:
print('Computation time of MrMsDTW: {:.2f} seconds'.format(total_computation_time))
return alignment
def __diagonal_warping_path(f1: np.ndarray,
f2: np.ndarray) -> np.ndarray:
"""Generates a diagonal warping path given two feature sequences.
Parameters
----------
f1: np.ndarray [shape=(_, N)]
First feature sequence
f2: np.ndarray [shape=(_, M)]
Second feature sequence
Returns
-------
np.ndarray: Diagonal warping path [shape=(2, T)]
"""
max_size = np.maximum(f1.shape[1], f2.shape[1])
min_size = np.minimum(f1.shape[1], f2.shape[1])
if min_size == 1:
return np.array([max_size - 1, 0]).reshape(-1, 1)
elif max_size == f1.shape[1]:
return np.array([np.round(np.linspace(0, max_size - 1, min_size)), np.linspace(0, min_size - 1, min_size)])
else:
return np.array([np.linspace(0, min_size-1, min_size), np.round(np.linspace(0, max_size - 1, min_size))])
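# Tiny runnable sketch (never called here): a diagonal warping path between an 8-frame and a
# 5-frame feature sequence. The zero feature contents are irrelevant; only the lengths matter.
def _demo_diagonal_warping_path() -> np.ndarray:
    f1 = np.zeros((12, 8))
    f2 = np.zeros((12, 5))
    return __diagonal_warping_path(f1, f2)  # shape (2, 5): row 0 indexes f1 frames, row 1 indexes f2 frames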
# Renamed from __compute_area: a second module-level __compute_area would shadow the
# anchor-area helper defined earlier and break derive_anchors_from_projected_alignment.
@jit(nopython=True)
def __compute_cost_matrix_area(f1, f2):
"""Computes the area of the cost matrix given two feature sequences
Parameters
----------
f1: np.ndarray
First feature sequence
f2: np.ndarray
Second feature sequence
Returns
-------
int: Area of the cost matrix
"""
return f1.shape[1] * f2.shape[1]
def __split_features(f_chroma1: np.ndarray,
f_onset1: np.ndarray,
f_chroma2: np.ndarray,
f_onset2: np.ndarray,
cur_a1: float,
cur_a2: float,
prev_a1: float,
prev_a2: float,
feature_rate: int) -> Tuple[np.ndarray, Optional[np.ndarray], np.ndarray, Optional[np.ndarray]]:
if cur_a1 == -1:
f_chroma1_split = f_chroma1[:, int(prev_a1 * feature_rate):]
if f_onset1 is not None:
f_onset1_split = f_onset1[:, int(prev_a1 * feature_rate):]
else:
f_onset1_split = None
else:
# Split the features
f_chroma1_split = f_chroma1[:, int(prev_a1 * feature_rate):int(cur_a1 * feature_rate)]
if f_onset1 is not None:
f_onset1_split = f_onset1[:, int(prev_a1 * feature_rate):int(cur_a1 * feature_rate)]
else:
f_onset1_split = None
if cur_a2 == -1:
f_chroma2_split = f_chroma2[:, int(prev_a2 * feature_rate):]
if f_onset2 is not None:
f_onset2_split = f_onset2[:, int(prev_a2 * feature_rate):]
else:
f_onset2_split = None
else:
f_chroma2_split = f_chroma2[:, int(prev_a2 * feature_rate):int(cur_a2 * feature_rate)]
if f_onset2 is not None:
f_onset2_split = f_onset2[:, int(prev_a2 * feature_rate):int(cur_a2 * feature_rate)]
else:
f_onset2_split = None
return f_chroma1_split, f_onset1_split, f_chroma2_split, f_onset2_split
def __refine_wp(wp: np.ndarray,
anchors: np.ndarray,
wp_list_refine: List,
neighboring_anchors: np.ndarray,
neighboring_anchor_indices: np.ndarray) -> np.ndarray:
wp_length = wp[:, neighboring_anchor_indices[-1]:].shape[1]
last_list = wp[:, neighboring_anchor_indices[-1]:] - np.tile(
wp[:, neighboring_anchor_indices[-1]].reshape(-1, 1), wp_length)
wp_list_tmp = [wp[:, :neighboring_anchor_indices[0] + 1]] + wp_list_refine + [last_list]
A_tmp = np.concatenate([anchors[:, 0].reshape(-1, 1), neighboring_anchors, anchors[:, -1].reshape(-1, 1)],
axis=1)
wp_res = build_path_from_warping_paths(warping_paths=wp_list_tmp,
anchors=A_tmp)
return wp_res
def __check_anchor_pairs(anchor_pairs: List,
f_len1: int,
f_len2: int,
feature_rate: int):
"""Ensures that the anchors satisfy the conditions
Parameters
----------
anchor_pairs: List[Tuple]
List of anchor pairs
f_len1: int
Length of the first feature sequence
f_len2: int
Length of the second feature sequence
feature_rate: int
Input feature rate of the features
"""
prev_a1 = 0
prev_a2 = 0
for anchor_pair in anchor_pairs:
a1, a2 = anchor_pair
if a1 <= 0 or a2 <= 0:
raise ValueError('Starting point must be a positive number!')
if a1 > f_len1 / feature_rate or a2 > f_len2 / feature_rate:
raise ValueError('Anchor points cannot be greater than the length of the input audio files!')
if a1 == f_len1 and a2 == f_len2:
raise ValueError('Both anchor points cannot be equal to the length of the audio files.')
if a1 == prev_a1 and a2 == prev_a2:
raise ValueError('Duplicate anchor pairs are not allowed!')
if a1 < prev_a1 or a2 < prev_a2:
            raise ValueError('Anchor points must be monotonically increasing.')
prev_a1 = a1
prev_a2 = a2
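# Quick sanity sketch (illustrative): anchor pairs are given in seconds and must be
# positive, lie inside both feature sequences, be unique, and increase monotonically.
# The feature lengths and rate below are hypothetical.
def _example_check_anchor_pairs():
    feature_rate = 50           # Hz
    f_len1, f_len2 = 500, 600   # 10 s and 12 s of features
    __check_anchor_pairs([(1.0, 1.2), (8.0, 9.5)], f_len1, f_len2, feature_rate)  # passes silently
    try:
        __check_anchor_pairs([(8.0, 9.5), (1.0, 1.2)], f_len1, f_len2, feature_rate)
    except ValueError as err:
        return err  # decreasing anchor pairs are rejected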
class PerformanceLabel:
"""
    The dataset labeling class for performance representations. It currently includes onset, note, and fine-grained
    f0 representations. The note_min, note_max, and f0_bins_per_semitone values should be set per instrument; the
    defaults are for violin performance analysis. Fretted instruments may not require such a fine f0 resolution per
    semitone.
"""
def __init__(self, note_min='F#3', note_max='C8', f0_bins_per_semitone=9, f0_smooth_std_c=None,
onset_smooth_std=0.7, f0_tolerance_c=200):
midi_min = note_name_to_number(note_min)
midi_max = note_name_to_number(note_max)
self.midi_centers = np.arange(midi_min, midi_max)
self.onset_smooth_std=onset_smooth_std # onset smoothing along time axis (compensate for alignment)
f0_hz_range = note_to_hz([note_min, note_max])
f0_c_min, f0_c_max = hz2cents(f0_hz_range)
self.f0_granularity_c = 100/f0_bins_per_semitone
if not f0_smooth_std_c:
f0_smooth_std_c = self.f0_granularity_c * 5/4 # Keep the ratio from the CREPE paper (20 cents and 25 cents)
self.f0_smooth_std_c = f0_smooth_std_c
self.f0_centers_c = np.arange(f0_c_min, f0_c_max, self.f0_granularity_c)
self.f0_centers_hz = 10 * 2 ** (self.f0_centers_c / 1200)
self.f0_n_bins = len(self.f0_centers_c)
self.pdf_normalizer = norm.pdf(0)
self.f0_c2hz = lambda c: 10*2**(c/1200)
self.f0_hz2c = hz2cents
self.midi_centers_c = self.f0_hz2c(midi_to_hz(self.midi_centers))
self.f0_tolerance_bins = int(f0_tolerance_c/self.f0_granularity_c)
self.f0_transition_matrix = gaussian_filter1d(np.eye(2*self.f0_tolerance_bins + 1), 25/self.f0_granularity_c)
def f0_c2label(self, pitch_c):
"""
        Convert a single f0 value in cents to a one-hot label vector with smoothing (i.e., create a gaussian blur
        around the target f0 bin for regularization and training stability). The blur is controlled by
        self.f0_smooth_std_c.
:param pitch_c: a single pitch value in cents
:return: one-hot label vector with frequency blur
"""
result = norm.pdf((self.f0_centers_c - pitch_c) / self.f0_smooth_std_c).astype(np.float32)
result /= self.pdf_normalizer
return result
def f0_label2c(self, salience, center=None):
"""
Convert the salience predictions to monophonic f0 in cents. Only outputs a single f0 value per frame!
:param salience: f0 activations
:param center: f0 center bin to calculate the weighted average. Use argmax if empty
:return: f0 array per frame (in cents).
"""
if salience.ndim == 1:
if center is None:
center = int(np.argmax(salience))
start = max(0, center - 4)
end = min(len(salience), center + 5)
salience = salience[start:end]
product_sum = np.sum(salience * self.f0_centers_c[start:end])
weight_sum = np.sum(salience)
return product_sum / np.clip(weight_sum, 1e-8, None)
if salience.ndim == 2:
return np.array([self.f0_label2c(salience[i, :]) for i in range(salience.shape[0])])
raise Exception("label should be either 1d or 2d ndarray")
def fill_onset_matrix(self, onsets, window, feature_rate):
"""
Create a sparse onset matrix from window and onsets (per-semitone). Apply a gaussian smoothing (along time)
        so that we can better tolerate alignment problems. This is similar to the frequency smoothing for the f0.
The temporal smoothing is controlled by the parameter self.onset_smooth_std
:param onsets: A 2d np.array of individual note onsets with their respective time values
(Nx2: time in seconds - midi number)
:param window: Timestamps for the frame centers of the sparse matrix
        :param feature_rate: Window timestamps are integers; this is used to convert them to seconds
:return: onset_roll: A sparse matrix filled with temporally blurred onsets.
"""
onsets = self.get_window_feats(onsets, window, feature_rate)
onset_roll = np.zeros((len(window), len(self.midi_centers)))
for onset in onsets:
onset, note = onset # it was a pair with time and midi note
if self.midi_centers[0] < note < self.midi_centers[-1]: # midi note should be in the range defined
note = int(note) - self.midi_centers[0] # find the note index in our range
onset = (onset*feature_rate)-window[0] # onset index (as float but in frames, not in seconds!)
start = max(0, int(onset) - 3)
end = min(len(window) - 1, int(onset) + 3)
try:
vals = norm.pdf(np.linspace(start - onset, end - onset, end - start + 1) / self.onset_smooth_std)
# if you increase 0.7 you smooth the peak
# if you decrease it, e.g., 0.1, it becomes too peaky! around 0.5-0.7 seems ok
vals /= self.pdf_normalizer
onset_roll[start:end + 1, note] += vals
except ValueError:
print('start',start, 'onset', onset, 'end', end)
return onset_roll, onsets
def fill_note_matrix(self, notes, window, feature_rate):
"""
Create the note matrix (piano roll) from window timestamps and note values per frame.
:param notes: A 2d np.array of individual notes with their active time values Nx2
:param window: Timestamps for the frame centers of the output
        :param feature_rate: Window timestamps are integers; this is used to convert them to seconds
:return note_roll: The piano roll in the defined range of [note_min, note_max).
"""
notes = self.get_window_feats(notes, window, feature_rate)
# take the notes in the midi range defined
notes = notes[np.logical_and(notes[:,1]>=self.midi_centers[0], notes[:,1]<=self.midi_centers[-1]),:]
times = (notes[:,0]*feature_rate - window[0]).astype(int) # in feature samples (fs:self.hop/self.sr)
notes = (notes[:,1] - self.midi_centers[0]).astype(int)
note_roll = np.zeros((len(window), len(self.midi_centers)))
note_roll[(times, notes)] = 1
return note_roll, notes
def fill_f0_matrix(self, f0s, window, feature_rate):
"""
        Unlike the labels for onsets and notes, the f0 label is only relevant for strictly monophonic regions! Thus,
        this function returns a boolean array which represents where to apply the given values.
        Never back-propagate without the boolean! Empty frames mean that the label is not that reliable.
        :param f0s: A 2d np.array of f0 values with the time they belong to (Nx2: time in seconds - f0 in Hz)
:param window: Timestamps for the frame centers of the output
        :param feature_rate: Window timestamps are integers; this is used to convert them to seconds
:return f0_roll: f0 label matrix and
f0_hz: f0 values in Hz
annotation_bool: A boolean array representing which frames have reliable f0 annotations.
"""
f0s = self.get_window_feats(f0s, window, feature_rate)
f0_cents = np.zeros_like(window, dtype=float)
f0s[:,1] = self.f0_hz2c(f0s[:,1]) # convert f0 in hz to cents
annotation_bool = np.zeros_like(window, dtype=bool)
f0_roll = np.zeros((len(window), len(self.f0_centers_c)))
times_in_frame = f0s[:, 0]*feature_rate - window[0]
for t, f0 in enumerate(f0s):
t = times_in_frame[t]
            if t%1 < 0.25: # only consider it an annotation if the f0 value is really close to the frame center
t = int(np.round(t))
f0_roll[t] = self.f0_c2label(f0[1])
annotation_bool[t] = True
f0_cents[t] = f0[1]
return f0_roll, f0_cents, annotation_bool
@staticmethod
def get_window_feats(time_feature_matrix, window, feature_rate):
"""
Restrict the feature matrix to the features that are inside the window
:param window: Timestamps for the frame centers of the output
        :param time_feature_matrix: A 2d array (Nx2) of time-stamped features covering the entire file.
        :param feature_rate: Window timestamps are integers; this is used to convert them to seconds
:return: window_features: the features inside the given window
"""
start = time_feature_matrix[:,0]>(window[0]-0.5)/feature_rate
end = time_feature_matrix[:,0]<(window[-1]+0.5)/feature_rate
window_features = np.logical_and(start, end)
window_features = np.array(time_feature_matrix[window_features,:])
return window_features
def represent_midi(self, midi, feature_rate):
"""
Represent a midi file as sparse matrices of onsets, offsets, and notes. No f0 is included.
:param midi: A midi file (either a path or a pretty_midi.PrettyMIDI object)
:param feature_rate: The feature rate in Hz
:return: dict {onset, offset, note, time}: Same format with the model's learning and outputs
"""
def _get_onsets_offsets_frames(midi_content):
if isinstance(midi_content, str):
midi_content = PrettyMIDI(midi_content)
onsets = []
offsets = []
frames = []
for instrument in midi_content.instruments:
for note in instrument.notes:
start = int(np.round(note.start * feature_rate))
end = int(np.round(note.end * feature_rate))
note_times = (np.arange(start, end+0.5)/feature_rate)[:, np.newaxis]
note_pitch = np.full_like(note_times, fill_value=note.pitch)
onsets.append([note.start, note.pitch])
offsets.append([note.end, note.pitch])
frames.append(np.hstack([note_times, note_pitch]))
onsets = np.vstack(onsets)
offsets = np.vstack(offsets)
frames = np.vstack(frames)
return onsets, offsets, frames, midi_content
onset_array, offset_array, frame_array, midi_object = _get_onsets_offsets_frames(midi)
window = np.arange(frame_array[0, 0]*feature_rate, frame_array[-1, 0]*feature_rate, dtype=int)
onset_roll, _ = self.fill_onset_matrix(onset_array, window, feature_rate)
offset_roll, _ = self.fill_onset_matrix(offset_array, window, feature_rate)
note_roll, _ = self.fill_note_matrix(frame_array, window, feature_rate)
start_anchor = onset_array[onset_array[:, 0]==np.min(onset_array[:, 0])]
end_anchor = offset_array[offset_array[:, 0]==np.max(offset_array[:, 0])]
return {
'midi': midi_object,
'note': note_roll,
'onset': onset_roll,
'offset': offset_roll,
'time': window/feature_rate,
'start_anchor': start_anchor,
'end_anchor': end_anchor
}
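# A minimal round-trip sketch (illustrative): a pitch in cents is blurred into a soft
# label vector by f0_c2label and recovered by f0_label2c with a local weighted average.
def _example_performance_label_roundtrip():
    labeling = PerformanceLabel()  # default violin range, 9 f0 bins per semitone
    pitch_c = labeling.f0_hz2c(np.array([440.0]))[0]  # A4, in cents above the 10 Hz reference
    label = labeling.f0_c2label(pitch_c)              # soft one-hot over labeling.f0_centers_c
    recovered_c = labeling.f0_label2c(label)          # weighted average around the peak bin
    return pitch_c, recovered_c                       # the two values agree within a few cents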
class Synchronizer(Transcriber):
def __init__(self, labeling, instrument='Violin', sr=16000, window_size=1024, hop_length=160):
super().__init__(labeling, instrument=instrument, sr=sr, window_size=window_size, hop_length=hop_length)
def synchronize(self, audio, midi, batch_size=128, include_pitch_bends=True, to_midi=True, debug=False,
include_velocity=False, alignment_padding=50, timing_refinement_range_with_f0s=0):
"""
Synchronize an audio file or mono waveform in numpy or torch with a MIDI file.
:param audio: str, pathlib.Path, np.ndarray, or torch.Tensor
:param midi: str, pathlib.Path, or pretty_midi.PrettyMIDI
:param batch_size: frames to process at once
:param include_pitch_bends: whether to include pitch bends in the MIDI file
:param to_midi: whether to return a MIDI file or a list of note events (as tuple)
:param debug: whether to plot the alignment path and compare the alignment with the predicted notes
:param include_velocity: whether to embed the note confidence in place of the velocity in the MIDI file
:param alignment_padding: how many frames to pad the audio and MIDI representations with
:param timing_refinement_range_with_f0s: how many frames to refine the alignment with the f0 confidence
:return: aligned MIDI file as a pretty_midi.PrettyMIDI object
        """
audio = self.predict(audio, batch_size)
notes_and_midi = self.out2sync(audio, midi, include_velocity=include_velocity,
alignment_padding=alignment_padding)
if notes_and_midi: # it might be none
notes, midi = notes_and_midi
if debug:
import pandas as pd
estimated_notes = self.out2note(audio, postprocessing='spotify', include_pitch_bends=True)
est_df = pd.DataFrame(estimated_notes).sort_values(by=0)
note_df = pd.DataFrame(notes).sort_values(by=0)
fig, ax = plt.subplots(figsize=(20, 10))
for row in notes:
t_start = row[0] # sec
t_end = row[1] # sec
freq = row[2] # Hz
ax.hlines(freq, t_start, t_end, color='k', linewidth=3, zorder=2, alpha=0.5)
for row in estimated_notes:
t_start = row[0] # sec
t_end = row[1] # sec
freq = row[2] # Hz
ax.hlines(freq, t_start, t_end, color='r', linewidth=3, zorder=2, alpha=0.5)
fig.suptitle('alignment (black) vs. estimated (red)')
fig.show()
if not include_pitch_bends:
if to_midi:
return midi['midi']
else:
return notes
else:
notes = [(np.argmin(np.abs(audio['time']-note[0])),
np.argmin(np.abs(audio['time']-note[1])),
note[2], note[3]) for note in notes]
notes = self.get_pitch_bends(audio["f0"], notes, timing_refinement_range_with_f0s)
notes = [
(audio['time'][note[0]], audio['time'][note[1]], note[2], note[3], note[4]) for note in
notes
]
if to_midi:
return self.note2midi(notes, 120) #int(midi['midi'].estimate_tempo()))
else:
return notes
def out2sync_old(self, out: Dict[str, np.array], midi, include_velocity=False, alignment_padding=50, debug=False):
"""
Synchronizes the output of the model with the MIDI file.
Args:
out: Model output dictionary
midi: Path to the MIDI file or PrettyMIDI object
include_velocity: Whether to encode the note confidence in place of velocity
alignment_padding: Number of frames to pad the MIDI features with zeros
debug: Visualize the alignment
Returns:
note events and the aligned PrettyMIDI object
"""
midi = self.labeling.represent_midi(midi, self.sr/self.hop_length)
audio_midi_anchors = self.prepare_for_synchronization(out, midi, feature_rate=self.sr/self.hop_length,
pad_length=alignment_padding)
if isinstance(audio_midi_anchors, str):
print(audio_midi_anchors)
return None # the file is corrupted! no possible alignment at all
else:
audio, midi, anchor_pairs = audio_midi_anchors
ALPHA = 0.6 # This is the coefficient of onsets, 1 - ALPHA for offsets
wp = sync_via_mrmsdtw_with_anchors(f_chroma1=audio['note'].T,
f_onset1=np.hstack([ALPHA * audio['onset'],
(1 - ALPHA) * audio['offset']]).T,
f_chroma2=midi['note'].T,
f_onset2=np.hstack([ALPHA * midi['onset'],
(1 - ALPHA) * midi['offset']]).T,
input_feature_rate=self.sr/self.hop_length,
step_weights=np.array([1.5, 1.5, 2.0]),
threshold_rec=10 ** 6,
verbose=debug, normalize_chroma=False,
anchor_pairs=anchor_pairs)
wp = make_path_strictly_monotonic(wp).astype(int)
audio_time = np.take(audio['time'], wp[0])
midi_time = np.take(midi['time'], wp[1])
notes = []
for instrument in midi['midi'].instruments:
for note in instrument.notes:
note.start = np.interp(note.start, midi_time, audio_time)
note.end = np.interp(note.end, midi_time, audio_time)
if note.end - note.start <= 0.012: # notes should be at least 12 ms (i.e. 2 frames)
note.start = note.start - 0.003
note.end = note.start + 0.012
if include_velocity: # encode the note confidence in place of velocity
velocity = np.median(audio['note'][np.argmin(np.abs(audio['time']-note.start)):
np.argmin(np.abs(audio['time']-note.end)),
note.pitch-self.labeling.midi_centers[0]])
note.velocity = max(1, velocity*127) # velocity should be at least 1 otherwise midi removes the note
else:
velocity = note.velocity/127
notes.append((note.start, note.end, note.pitch, velocity))
return notes, midi
def out2sync(self, out: Dict[str, np.array], midi, include_velocity=False, alignment_padding=50, debug=False):
"""
Synchronizes the output of the model with the MIDI file.
Args:
out: Model output dictionary
midi: Path to the MIDI file or PrettyMIDI object
include_velocity: Whether to encode the note confidence in place of velocity
alignment_padding: Number of frames to pad the MIDI features with zeros
debug: Visualize the alignment
Returns:
note events and the aligned PrettyMIDI object
"""
midi = self.labeling.represent_midi(midi, self.sr/self.hop_length)
audio_midi_anchors = self.prepare_for_synchronization(out, midi, feature_rate=self.sr/self.hop_length,
pad_length=alignment_padding)
if isinstance(audio_midi_anchors, str):
print(audio_midi_anchors)
return None # the file is corrupted! no possible alignment at all
else:
audio, midi, anchor_pairs = audio_midi_anchors
ALPHA = 0.6 # This is the coefficient of onsets, 1 - ALPHA for offsets
starts = (np.array(anchor_pairs[0])*self.sr/self.hop_length).astype(int)
ends = (np.array(anchor_pairs[1])*self.sr/self.hop_length).astype(int)
wp = sync_via_mrmsdtw_with_anchors(f_chroma1=audio['note'].T[:, starts[0]:ends[0]],
f_onset1=np.hstack([ALPHA * audio['onset'],
(1 - ALPHA) * audio['offset']]).T[:, starts[0]:ends[0]],
f_chroma2=midi['note'].T[:, starts[1]:ends[1]],
f_onset2=np.hstack([ALPHA * midi['onset'],
(1 - ALPHA) * midi['offset']]).T[:, starts[1]:ends[1]],
input_feature_rate=self.sr/self.hop_length,
step_weights=np.array([1.5, 1.5, 2.0]),
threshold_rec=10 ** 6,
verbose=debug, normalize_chroma=False,
anchor_pairs=None)
wp = make_path_strictly_monotonic(wp).astype(int)
wp[0] += starts[0]
wp[1] += starts[1]
wp = np.hstack((wp, ends[:,np.newaxis]))
audio_time = np.take(audio['time'], wp[0])
midi_time = np.take(midi['time'], wp[1])
notes = []
for instrument in midi['midi'].instruments:
for note in instrument.notes:
note.start = np.interp(note.start, midi_time, audio_time)
note.end = np.interp(note.end, midi_time, audio_time)
if note.end - note.start <= 0.012: # notes should be at least 12 ms (i.e. 2 frames)
note.start = note.start - 0.003
note.end = note.start + 0.012
if include_velocity: # encode the note confidence in place of velocity
velocity = np.median(audio['note'][np.argmin(np.abs(audio['time']-note.start)):
np.argmin(np.abs(audio['time']-note.end)),
note.pitch-self.labeling.midi_centers[0]])
note.velocity = max(1, velocity*127) # velocity should be at least 1 otherwise midi removes the note
else:
velocity = note.velocity/127
notes.append((note.start, note.end, note.pitch, velocity))
return notes, midi
@staticmethod
def pad_representations(dict_of_representations, pad_length=10):
"""
Pad the representations so that the DTW does not enforce them to encompass the entire duration.
Args:
dict_of_representations: audio or midi representations
pad_length: how many frames to pad
Returns:
padded representations
"""
for key, value in dict_of_representations.items():
if key == 'time':
padded_time = dict_of_representations[key]
padded_time = np.concatenate([padded_time[:2*pad_length], padded_time+padded_time[2*pad_length]])
dict_of_representations[key] = padded_time - padded_time[pad_length] # this is to ensure that the
# first frame times are negative until the real zero time
elif key in ['onset', 'offset', 'note']:
dict_of_representations[key] = np.pad(value, ((pad_length, pad_length), (0, 0)))
elif key in ['start_anchor', 'end_anchor']:
anchor_time = dict_of_representations[key][0][0]
anchor_time = np.argmin(np.abs(dict_of_representations['time'] - anchor_time))
dict_of_representations[key][:,0] = anchor_time
                dict_of_representations[key] = dict_of_representations[key].astype(int)  # np.int is removed in recent numpy
return dict_of_representations
def prepare_for_synchronization(self, audio, midi, feature_rate=44100/256, pad_length=100):
"""
        MrMsDTW works better with start and end anchors. This function finds the start and end anchors for the audio
        based on the MIDI notes. MIDI files most often start and end with an active note, so without padding the DTW
        would stretch those notes over the leading and trailing silence of the audio. To avoid this, the MIDI
        representations are padded with a few frames of silence at the beginning and the end.
Args:
audio:
midi:
feature_rate:
pad_length:
Returns:
"""
# first pad the MIDI
midi = self.pad_representations(midi, pad_length)
# sometimes f0s are more reliable than the notes. So, we use both the f0s and the notes together to find the
# start and end anchors. f0 lookup bins is the number of bins to look around the f0 to assign a note to it.
f0_lookup_bins = int(100//(2*self.labeling.f0_granularity_c))
# find the start anchor for the audio
# first decide on which notes to use for the start anchor (take the entire chord where the MIDI file starts)
anchor_notes = midi['start_anchor'][:, 1] - self.labeling.midi_centers[0]
# now find which f0 bins to look at for the start anchor
anchor_f0s = [self.midi_pitch_to_contour_bin(an+self.labeling.midi_centers[0]) for an in anchor_notes]
anchor_f0s = np.array([list(range(f0-f0_lookup_bins, f0+f0_lookup_bins+1)) for f0 in anchor_f0s]).reshape(-1)
# first start anchor proposals come from the notes
anchor_vals = np.any(audio['note'][:, anchor_notes]>0.5, axis=1)
# now the f0s
anchor_vals_f0 = np.any(audio['f0'][:, anchor_f0s]>0.5, axis=1)
# combine the two
anchor_vals = np.logical_or(anchor_vals, anchor_vals_f0)
if not any(anchor_vals):
return 'corrupted' # do not consider the file if we cannot find the start anchor
audio_start = np.argmax(anchor_vals)
# now the end anchor (most string instruments use chords in cadences: in general the end anchor is polyphonic)
anchor_notes = midi['end_anchor'][:, 1] - self.labeling.midi_centers[0]
anchor_f0s = [self.midi_pitch_to_contour_bin(an+self.labeling.midi_centers[0]) for an in anchor_notes]
anchor_f0s = np.array([list(range(f0-f0_lookup_bins, f0+f0_lookup_bins+1)) for f0 in anchor_f0s]).reshape(-1)
# the same procedure as above
anchor_vals = np.any(audio['note'][::-1, anchor_notes]>0.5, axis=1)
anchor_vals_f0 = np.any(audio['f0'][::-1, anchor_f0s]>0.5, axis=1)
anchor_vals = np.logical_or(anchor_vals, anchor_vals_f0)
if not any(anchor_vals):
return 'corrupted' # do not consider the file if we cannot find the end anchor
audio_end = audio['note'].shape[0] - np.argmax(anchor_vals)
if audio_end - audio_start < (midi['end_anchor'][0][0] - midi['start_anchor'][0][0])/10: # no one plays x10 faster
            return 'corrupted' # do not consider the file if the interval between anchors is too short
anchor_pairs = [(audio_start - 5, midi['start_anchor'][0][0] - 5),
(audio_end + 5, midi['end_anchor'][0][0] + 5)]
if anchor_pairs[0][0] < 1:
anchor_pairs[0] = (1, midi['start_anchor'][0][0])
if anchor_pairs[1][0] > audio['note'].shape[0] - 1:
anchor_pairs[1] = (audio['note'].shape[0] - 1, midi['end_anchor'][0][0])
return audio, midi, [(anchor_pairs[0][0]/feature_rate, anchor_pairs[0][1]/feature_rate),
(anchor_pairs[1][0]/feature_rate, anchor_pairs[1][1]/feature_rate)]
class ConvBlock(nn.Module):
def __init__(self, f, w, s, d, in_channels):
super().__init__()
p1 = d*(w - 1) // 2
p2 = d*(w - 1) - p1
self.pad = nn.ZeroPad2d((0, 0, p1, p2))
self.conv2d = nn.Conv2d(in_channels=in_channels, out_channels=f, kernel_size=(w, 1), stride=(s, 1), dilation=(d, 1))
self.relu = nn.ReLU()
self.bn = nn.BatchNorm2d(f)
self.pool = nn.MaxPool2d(kernel_size=(2, 1))
self.dropout = nn.Dropout(0.25)
def forward(self, x):
x = self.pad(x)
x = self.conv2d(x)
x = self.relu(x)
x = self.bn(x)
x = self.pool(x)
x = self.dropout(x)
return x
class NoPadConvBlock(nn.Module):
def __init__(self, f, w, s, d, in_channels):
super().__init__()
self.conv2d = nn.Conv2d(in_channels=in_channels, out_channels=f, kernel_size=(w, 1), stride=(s, 1),
dilation=(d, 1))
self.relu = nn.ReLU()
self.bn = nn.BatchNorm2d(f)
self.pool = nn.MaxPool2d(kernel_size=(2, 1))
self.dropout = nn.Dropout(0.25)
def forward(self, x):
x = self.conv2d(x)
x = self.relu(x)
x = self.bn(x)
x = self.pool(x)
x = self.dropout(x)
return x
class TinyPathway(nn.Module):
def __init__(self, dilation=1, hop=256, localize=False,
model_capacity="full", n_layers=6, chunk_size=256):
super().__init__()
capacity_multiplier = {
'tiny': 4, 'small': 8, 'medium': 16, 'large': 24, 'full': 32
}[model_capacity]
self.layers = [1, 2, 3, 4, 5, 6]
self.layers = self.layers[:n_layers]
filters = [n * capacity_multiplier for n in [32, 8, 8, 8, 8, 8]]
filters = [1] + filters
widths = [512, 64, 64, 64, 32, 32]
strides = self.deter_dilations(hop//(4*(2**n_layers)), localize=localize)
strides[0] = strides[0]*4 # apply 4 times more stride at the first layer
dilations = self.deter_dilations(dilation)
for i in range(len(self.layers)):
f, w, s, d, in_channel = filters[i + 1], widths[i], strides[i], dilations[i], filters[i]
self.add_module("conv%d" % i, NoPadConvBlock(f, w, s, d, in_channel))
self.chunk_size = chunk_size
self.input_window, self.hop = self.find_input_size_for_pathway()
self.out_dim = filters[n_layers]
def find_input_size_for_pathway(self):
def find_input_size(output_size, kernel_size, stride, dilation, padding):
num = (stride*(output_size-1)) + 1
input_size = num - 2*padding + dilation*(kernel_size-1)
return input_size
conv_calc, n = {}, 0
for i in self.layers:
layer = self.__getattr__("conv%d" % (i-1))
for mm in layer.modules():
if hasattr(mm, 'kernel_size'):
try:
d = mm.dilation[0]
except TypeError:
d = mm.dilation
conv_calc[n] = [mm.kernel_size[0], mm.stride[0], 0, d]
n += 1
out = self.chunk_size
hop = 1
for n in sorted(conv_calc.keys())[::-1]:
kernel_size_n, stride_n, padding_n, dilation_n = conv_calc[n]
out = find_input_size(out, kernel_size_n, stride_n, dilation_n, padding_n)
hop = hop*stride_n
return out, hop
def deter_dilations(self, total_dilation, localize=False):
n_layers = len(self.layers)
if localize: # e.g., 32*1023 window and 3 layers -> [1, 1, 32]
a = [total_dilation] + [1 for _ in range(n_layers-1)]
else: # e.g., 32*1023 window and 3 layers -> [4, 4, 2]
total_dilation = int(np.log2(total_dilation))
a = []
for layer in range(n_layers):
this_dilation = int(np.ceil(total_dilation/(n_layers-layer)))
a.append(2**this_dilation)
total_dilation = total_dilation - this_dilation
return a[::-1]
def forward(self, x):
x = x.view(x.shape[0], 1, -1, 1)
for i in range(len(self.layers)):
x = self.__getattr__("conv%d" % i)(x)
x = x.permute(0, 3, 2, 1)
return x
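# Shape sanity sketch (illustrative): a small TinyPathway reports the raw-audio window it
# expects (`input_window`) and the hop between output frames; a dummy forward pass yields
# one embedding per chunk frame. The layer/chunk settings below are arbitrary.
def _example_tiny_pathway_shapes():
    pathway = TinyPathway(dilation=1, hop=256, localize=True, n_layers=2, chunk_size=16)
    pathway.eval()
    dummy = torch_from_numpy(np.zeros((1, pathway.input_window), dtype=np.float32))
    with torch_no_grad():
        out = pathway(dummy)
    return out.shape  # (batch, 1, chunk_size, pathway.out_dim)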
#@jit(nopython=True)
def cosine_distance(f1, f2, cos_meas_max=2.0, cos_meas_min=1.0):
"""For all pairs of vectors f1' and f2' in f1 and f2, computes 1 - (f1.f2),
where '.' is the dot product, and rescales the results to lie in the
range [cos_meas_min, cos_meas_max].
Corresponds to regular cosine distance if f1' and f2' are normalized and
cos_meas_min==0.0 and cos_meas_max==1.0."""
return (1 - f1.T @ f2) * (cos_meas_max - cos_meas_min) + cos_meas_min
#@jit(nopython=True)
def euclidean_distance(f1, f2, l2_meas_max=1.0, l2_meas_min=0.0):
"""Computes euclidean distances between the vectors in f1 and f2, and
rescales the results to lie in the range [cos_meas_min, cos_meas_max]."""
#S1 = np.zeros((f1.shape[1], f2.shape[1]))
#for n in range(f2.shape[1]):
# S1[:, n] = np.sqrt(np.sum((f1.T - f2[:, n]) ** 2, axis=1))
S1 = euclidean_distances(f1.T, f2.T)
return S1 * (l2_meas_max - l2_meas_min) + l2_meas_min
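# A small sketch (illustrative): with non-negative unit-norm columns, cosine_distance
# stays inside [cos_meas_min, cos_meas_max] (default [1.0, 2.0]); euclidean_distance
# returns the raw pairwise distances scaled by (l2_meas_max - l2_meas_min) and shifted
# by l2_meas_min.
def _example_pairwise_distances():
    rng = np.random.default_rng(0)
    f1 = rng.random((12, 5))
    f2 = rng.random((12, 7))
    f1 /= np.linalg.norm(f1, axis=0, keepdims=True)
    f2 /= np.linalg.norm(f2, axis=0, keepdims=True)
    cos = cosine_distance(f1, f2)     # shape (5, 7)
    euc = euclidean_distance(f1, f2)  # shape (5, 7)
    return cos, euc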
def compute_high_res_cost_matrix(f_chroma1: np.ndarray,
f_chroma2: np.ndarray,
f_onset1: np.ndarray,
f_onset2: np.ndarray,
weights: np.ndarray = np.array([1.0, 1.0]),
cos_meas_min: float = 1.0,
cos_meas_max: float = 2.0,
l2_meas_min: float = 0.0,
l2_meas_max: float = 1.0):
"""Computes cost matrix of two sequences using two feature matrices
for each sequence. Cosine distance is used for the chroma sequences and
euclidean distance is used for the DLNCO sequences.
Parameters
----------
f_chroma1 : np.ndarray [shape=(12, N)]
Chroma feature matrix of the first sequence (assumed to be normalized).
f_chroma2 : np.ndarray [shape=(12, M)]
Chroma feature matrix of the second sequence (assumed to be normalized).
f_onset1 : np.ndarray [shape=(12, N)]
DLNCO feature matrix of the first sequence
f_onset2 : np.ndarray [shape=(12, M)]
DLNCO feature matrix of the second sequence
weights : np.ndarray [shape=[2,]]
Weights array for the high-resolution cost computation.
weights[0] * cosine_distance + weights[1] * euclidean_distance
cos_meas_min : float
Cosine distances are shifted to be at least ``cos_meas_min``
cos_meas_max : float
Cosine distances are scaled to be at most ``cos_meas_max``
l2_meas_min : float
Euclidean distances are shifted to be at least ``l2_meas_min``
l2_meas_max : float
Euclidean distances are scaled to be at most ``l2_meas_max``
Returns
-------
C: np.ndarray [shape=(N, M)]
Cost matrix
"""
cos_dis = cosine_distance(f_chroma1, f_chroma2, cos_meas_min=cos_meas_min, cos_meas_max=cos_meas_max)
euc_dis = euclidean_distance(f_onset1, f_onset2, l2_meas_min=l2_meas_min, l2_meas_max=l2_meas_max)
return weights[0] * cos_dis + weights[1] * euc_dis
@jit(nopython=True, cache=True)
def __C_to_DE(C: np.ndarray = None,
dn: np.ndarray = np.array([1, 1, 0], np.int64),
dm: np.ndarray = np.array([1, 0, 1], np.int64),
dw: np.ndarray = np.array([1.0, 1.0, 1.0], np.float64),
sub_sequence: bool = False) -> tuple[np.ndarray, np.ndarray]:
"""This function computes the accumulated cost matrix D and the step index
matrix E.
Parameters
----------
C : np.ndarray (np.float32 / np.float64) [shape=(N, M)]
Cost matrix
dn : np.ndarray (np.int64) [shape=(1, S)]
Integer array defining valid steps (N direction of C), default: [1, 1, 0]
dm : np.ndarray (np.int64) [shape=(1, S)]
Integer array defining valid steps (M direction of C), default: [1, 0, 1]
dw : np.ndarray (np.float64) [shape=(1, S)]
        Double array defining the weight of each step, default: [1.0, 1.0, 1.0]
sub_sequence : bool
Set `True` for SubSequence DTW, default: False
Returns
-------
D : np.ndarray (np.float64) [shape=(N, M)]
Accumulated cost matrix of type double
E : np.ndarray (np.int64) [shape=(N, M)]
Step index matrix.
        E[n, m] holds the index of the step taken to determine the value of D[n, m].
If E[n, m] is zero, no valid step was possible.
NaNs in the cost matrix are preserved, invalid fields in the cost matrix are NaNs.
"""
if C is None:
raise ValueError('C must be a 2D numpy array.')
N, M = C.shape
S = dn.size
if S != dm.size or S != dw.size:
        raise ValueError('The parameters dn, dm, and dw must be of equal length.')
# calc bounding box size of steps
sbbn = np.max(dn)
sbbm = np.max(dm)
# initialize E
E = np.zeros((N, M), np.int64) - 1
# initialize extended D matrix
D = np.ones((sbbn + N, sbbm + M), np.float64) * np.inf
if sub_sequence:
for m in range(M):
D[sbbn, sbbm + m] = C[0, m]
else:
D[sbbn, sbbm] = C[0, 0]
# accumulate
for m in range(sbbm, M + sbbm):
for n in range(sbbn, N + sbbn):
for s in range(S):
cost = D[n - dn[s], m - dm[s]] + C[n - sbbn, m - sbbm] * dw[s]
if cost < D[n, m]:
D[n, m] = cost
E[n - sbbn, m - sbbm] = s
D = D[sbbn: N + sbbn, sbbm: M + sbbm]
return D, E
@jit(nopython=True, cache=True)
def __E_to_warping_path(E: np.ndarray,
dn: np.ndarray = np.array([1, 1, 0], np.int64),
dm: np.ndarray = np.array([1, 0, 1], np.int64),
sub_sequence: bool = False,
end_index: int = -1) -> np.ndarray:
"""This function computes a warping path based on the provided matrix E
and the allowed steps.
Parameters
----------
E : np.ndarray (np.int64) [shape=(N, M)]
Step index matrix
dn : np.ndarray (np.int64) [shape=(1, S)]
Integer array defining valid steps (N direction of C), default: [1, 1, 0]
dm : np.ndarray (np.int64) [shape=(1, S)]
Integer array defining valid steps (M direction of C), default: [1, 0, 1]
sub_sequence : bool
Set `True` for SubSequence DTW, default: False
end_index : int
In case of SubSequence DTW
Returns
-------
warping_path : np.ndarray (np.int64) [shape=(2, M)]
Resulting optimal warping path
"""
N, M = E.shape
if not sub_sequence and end_index == -1:
end_index = M - 1
m = end_index
n = N - 1
warping_path = np.zeros((2, n + m + 1))
index = 0
def _loop(m, n, index):
warping_path[:, index] = np.array([n, m])
step_index = E[n, m]
m -= dm[step_index]
n -= dn[step_index]
index += 1
return m, n, index
if sub_sequence:
while n > 0:
m, n, index = _loop(m, n, index)
else:
while m > 0 or n > 0:
m, n, index = _loop(m, n, index)
warping_path[:, index] = np.array([n, m])
warping_path = warping_path[:, index::-1]
return warping_path
def compute_warping_path(C: np.ndarray,
step_sizes: np.ndarray = np.array([[1, 0], [0, 1], [1, 1]], np.int64),
step_weights: np.ndarray = np.array([1.0, 1.0, 1.0], np.float64),
implementation: str = 'synctoolbox'):
"""Applies DTW on cost matrix C.
Parameters
----------
C : np.ndarray (np.float32 / np.float64) [shape=(N, M)]
Cost matrix
step_sizes : np.ndarray (np.int64) [shape=(2, S)]
Array of step sizes
step_weights : np.ndarray (np.float64) [shape=(2, S)]
Array of step weights
implementation: str
Choose among ``synctoolbox`` and ``librosa``. (default: ``synctoolbox``)
Returns
-------
D : np.ndarray (np.float64) [shape=(N, M)]
Accumulated cost matrix
E : np.ndarray (np.int64) [shape=(N, M)]
Step index matrix
wp : np.ndarray (np.int64) [shape=(2, M)]
Warping path
"""
if implementation == 'librosa':
D, wp, E = dtw(C=C,
step_sizes_sigma=step_sizes,
weights_add=np.array([0, 0, 0]),
weights_mul=step_weights,
return_steps=True,
subseq=False)
wp = wp[::-1].T
elif implementation == 'synctoolbox':
dn = step_sizes[:, 0]
dm = step_sizes[:, 1]
D, E = __C_to_DE(C,
dn=dn,
dm=dm,
dw=step_weights,
sub_sequence=False)
wp = __E_to_warping_path(E=E,
dn=dn,
dm=dm,
sub_sequence=False)
else:
raise NotImplementedError(f'No implementation found called {implementation}')
return D, E, wp
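# A tiny DTW sketch (illustrative): run the default synctoolbox-style DTW on a 3x3 cost
# matrix; the optimal path follows the zero-cost diagonal from (0, 0) to (2, 2).
def _example_compute_warping_path():
    C = np.array([[0.0, 1.0, 2.0],
                  [1.0, 0.0, 1.0],
                  [2.0, 1.0, 0.0]])
    D, E, wp = compute_warping_path(C, implementation='synctoolbox')
    return wp.astype(int)  # array([[0, 1, 2], [0, 1, 2]])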
def compute_warping_paths_from_cost_matrices(cost_matrices: List,
step_sizes: np.array = np.array([[1, 0], [0, 1], [1, 1]], int),
step_weights: np.array = np.array([1.0, 1.0, 1.0], np.float64),
implementation: str = 'synctoolbox') -> List:
"""Computes a path via DTW on each matrix in cost_matrices
Parameters
----------
cost_matrices : list
List of cost matrices
step_sizes : np.ndarray
DTW step sizes (default: np.array([[1, 0], [0, 1], [1, 1]]))
step_weights : np.ndarray
DTW step weights (default: np.array([1.0, 1.0, 1.0]))
implementation : str
Choose among 'synctoolbox' and 'librosa' (default: 'synctoolbox')
Returns
-------
wp_list : list
List of warping paths
"""
return [compute_warping_path(C=C,
step_sizes=step_sizes,
step_weights=step_weights,
implementation=implementation)[2] for C in cost_matrices]
def compute_cost_matrices_between_anchors(f_chroma1: np.ndarray,
f_chroma2: np.ndarray,
anchors: np.ndarray,
f_onset1: np.ndarray = None,
f_onset2: np.ndarray = None,
alpha: float = 0.5) -> List:
"""Computes cost matrices for the given features between subsequent
    pairs of anchor points.
Parameters
----------
f_chroma1 : np.ndarray [shape=(12, N)]
Chroma feature matrix of the first sequence
f_chroma2 : np.ndarray [shape=(12, M)]
Chroma feature matrix of the second sequence
anchors : np.ndarray [shape=(2, R)]
Anchor sequence
f_onset1 : np.ndarray [shape=(L, N)]
Onset feature matrix of the first sequence
f_onset2 : np.ndarray [shape=(L, M)]
Onset feature matrix of the second sequence
alpha: float
Alpha parameter to weight the cost functions.
Returns
-------
cost_matrices: list
List containing cost matrices
"""
high_res = False
if f_onset1 is not None and f_onset2 is not None:
high_res = True
cost_matrices = list()
for k in range(anchors.shape[1] - 1):
a1 = np.array(anchors[:, k].astype(int), copy=True)
a2 = np.array(anchors[:, k + 1].astype(int), copy=True)
if high_res:
cost_matrices.append(compute_high_res_cost_matrix(f_chroma1[:, a1[0]: a2[0] + 1],
f_chroma2[:, a1[1]: a2[1] + 1],
f_onset1[:, a1[0]: a2[0] + 1],
f_onset2[:, a1[1]: a2[1] + 1],
weights=np.array([alpha, 1-alpha])))
else:
cost_matrices.append(cosine_distance(f_chroma1[:, a1[0]: a2[0] + 1],
f_chroma2[:, a1[1]: a2[1] + 1]))
return cost_matrices
def build_path_from_warping_paths(warping_paths: List,
anchors: np.ndarray = None) -> np.ndarray:
"""The function builds a path from a given list of warping paths
and the anchors used to obtain these paths. The indices of the original
warping paths are adapted such that they cross the anchors.
Parameters
----------
warping_paths : list
List of warping paths
anchors : np.ndarray [shape=(2, N)]
Anchor sequence
Returns
-------
path : np.ndarray [shape=(2, M)]
Merged path
"""
if anchors is None:
# When no anchor points are given, we can construct them from the
# subpaths in the wp_list
# To do this, we assume that the first path's element is the starting
# anchor
        # Keep the anchors two-dimensional so the column-wise cumsum correction below works
        anchors = warping_paths[0][:, 0].reshape(-1, 1)
        # Retrieve the last element of each path
        anchors_tmp = np.zeros((2, len(warping_paths)), np.float32)
        for idx, x in enumerate(warping_paths):
            anchors_tmp[:, idx] = x[:, -1]
# Correct indices, such that the indices of the anchors are given on a
# common path. Each anchor a_l = [Nnew_[l+1];Mnew_[l+1]]
# Nnew_[l+1] = N_l + N_[l+1] -1
# Mnew_[l+1] = M_l + M_[l+1] -1
anchors_tmp = np.cumsum(anchors_tmp, axis=1)
anchors_tmp[:, 1:] = anchors_tmp[:, 1:] - [np.arange(1, anchors_tmp.shape[1]),
np.arange(1, anchors_tmp.shape[1])]
anchors = np.concatenate([anchors, anchors_tmp], axis=1)
L = len(warping_paths) + 1
path = None
wp = None
for anchor_idx in range(1, L):
anchor1 = anchors[:, anchor_idx - 1]
anchor2 = anchors[:, anchor_idx]
wp = np.array(warping_paths[anchor_idx - 1], copy=True)
# correct indices in warpingPath
wp += np.repeat(anchor1.reshape(-1, 1), wp.shape[1], axis=1).astype(wp.dtype)
# consistency checks
assert np.array_equal(wp[:, 0], anchor1), 'First entry of warping path does not coincide with anchor point'
assert np.array_equal(wp[:, -1], anchor2), 'Last entry of warping path does not coincide with anchor point'
if path is None:
path = np.array(wp[:, :-1], copy=True)
else:
path = np.concatenate([path, wp[:, :-1]], axis=1)
# append last index of warping path
path = np.concatenate([path, wp[:, -1].reshape(-1, 1)], axis=1)
return path
def find_anchor_indices_in_warping_path(warping_path: np.ndarray,
anchors: np.ndarray) -> np.ndarray:
"""Compute the indices in the warping path that corresponds
to the elements in 'anchors'
Parameters
----------
warping_path : np.ndarray [shape=(2, N)]
Warping path
anchors : np.ndarray [shape=(2, M)]
Anchor sequence
Returns
-------
indices : np.ndarray [shape=(2, M)]
Anchor indices in the ``warping_path``
"""
indices = np.zeros(anchors.shape[1])
for k in range(anchors.shape[1]):
a = anchors[:, k]
indices[k] = np.where((a[0] == warping_path[0, :]) & (a[1] == warping_path[1, :]))[0]
return indices
def make_path_strictly_monotonic(P: np.ndarray) -> np.ndarray:
"""Compute strict alignment path from a warping path
Wrapper around "compute_strict_alignment_path_mask" from libfmp.
Parameters
----------
P: np.ndarray [shape=(2, N)]
Warping path
Returns
-------
P_mod: np.ndarray [shape=(2, M)]
Strict alignment path, M <= N
"""
P_mod = compute_strict_alignment_path_mask(P.T)
return P_mod.T
def compute_strict_alignment_path_mask(P):
"""Compute strict alignment path from a warping path
Notebook: C3/C3S3_MusicAppTempoCurve.ipynb
Args:
        P (list or np.ndarray): Warping path
Returns:
P_mod (list or np.ndarray): Strict alignment path
"""
P = np.array(P, copy=True)
N, M = P[-1]
# Get indices for strict monotonicity
keep_mask = (P[1:, 0] > P[:-1, 0]) & (P[1:, 1] > P[:-1, 1])
# Add first index to enforce start boundary condition
keep_mask = np.concatenate(([True], keep_mask))
    # Remove all indices of the last row or column
keep_mask[(P[:, 0] == N) | (P[:, 1] == M)] = False
# Add last index to enforce end boundary condition
keep_mask[-1] = True
P_mod = P[keep_mask, :]
return P_mod
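# A small sketch (illustrative): a warping path containing horizontal/vertical steps
# (repeated indices) is reduced to a strictly monotonic alignment path.
def _example_make_path_strictly_monotonic():
    wp = np.array([[0, 1, 1, 2, 3],
                   [0, 1, 2, 2, 3]])
    return make_path_strictly_monotonic(wp)  # array([[0, 1, 3], [0, 1, 3]])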
def evaluate_synchronized_positions(ground_truth_positions: np.ndarray,
synchronized_positions: np.ndarray,
tolerances: List = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 250]):
"""Compute standard evaluation measures for evaluating the quality of synchronized (musical) positions.
When synchronizing two versions of a piece of music, one can evaluate the quality of the resulting alignment
by comparing errors at musical positions (e.g. beats or measures) that appear in both versions.
This function implements two measures: mean absolute error at positions and the percentage of correctly transferred
measures given a threshold.
Parameters
----------
ground_truth_positions: np.ndarray [shape=N]
Positions (e.g. beat or measure positions) annotated in the target version of a piece of music, in milliseconds.
synchronized_positions: np.ndarray [shape=N]
The same musical positions as in 'ground_truth_positions' obtained by transfer using music synchronization,
in milliseconds.
tolerances: list of integers
        Tolerances (in milliseconds) used for comparing annotated and synchronized positions.
Returns
-------
mean_absolute_error: float
        Mean absolute error for synchronized positions, in milliseconds.
accuracy_at_tolerances: list of floats
Percentages of correctly transferred measures, for each entry in 'tolerances'.
"""
absolute_errors_at_positions = np.abs(synchronized_positions - ground_truth_positions)
print('Measure transfer from recording 1 to 2 yielded:')
mean_absolute_error = np.mean(absolute_errors_at_positions)
print('\nMean absolute error (MAE): %.2fms (standard deviation: %.2fms)' % (mean_absolute_error,
np.std(absolute_errors_at_positions)))
print('\nAccuracy of transferred positions at different tolerances:')
print('\t\t\tAccuracy')
print('################################')
accuracy_at_tolerances = []
for tolerance in tolerances:
accuracy = np.mean((absolute_errors_at_positions < tolerance)) * 100.0
accuracy_at_tolerances.append(accuracy)
print('Tolerance: {} ms \t{:.2f} %'.format(tolerance, accuracy))
return mean_absolute_error, accuracy_at_tolerances
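# A toy evaluation sketch (illustrative, with made-up positions): compare ground-truth
# beat positions (in milliseconds) with positions transferred by the alignment; prints
# the MAE and the accuracy table and returns both.
def _example_evaluate_synchronized_positions():
    ground_truth_ms = np.array([1000.0, 2000.0, 3000.0, 4000.0])
    synchronized_ms = np.array([1010.0, 1985.0, 3040.0, 4005.0])
    return evaluate_synchronized_positions(ground_truth_ms, synchronized_ms)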
def smooth_downsample_feature(f_feature: np.ndarray,
input_feature_rate: float,
win_len_smooth: int = 0,
downsamp_smooth: int = 1) -> Tuple[np.ndarray, float]:
"""Temporal smoothing and downsampling of a feature sequence
Parameters
----------
f_feature : np.ndarray
Input feature sequence, size dxN
input_feature_rate : float
Input feature rate in Hz
win_len_smooth : int
Smoothing window length. For 0, no smoothing is applied.
downsamp_smooth : int
Downsampling factor. For 1, no downsampling is applied.
Returns
-------
f_feature_stat : np.ndarray
Downsampled & smoothed feature.
new_feature_rate : float
New feature rate after downsampling
"""
if win_len_smooth != 0 or downsamp_smooth != 1:
# hack to get the same results as on MATLAB
stat_window = np.hanning(win_len_smooth+2)[1:-1]
stat_window /= np.sum(stat_window)
# upfirdn filters and downsamples each column of f_stat_help
f_feature_stat = upfirdn(h=stat_window, x=f_feature, up=1, down=downsamp_smooth)
seg_num = f_feature.shape[1]
stat_num = int(np.ceil(seg_num / downsamp_smooth))
cut = int(np.floor((win_len_smooth - 1) / (2 * downsamp_smooth)))
f_feature_stat = f_feature_stat[:, cut: stat_num + cut]
else:
f_feature_stat = f_feature
new_feature_rate = input_feature_rate / downsamp_smooth
return f_feature_stat, new_feature_rate
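# A small sketch (illustrative): smooth a 12 x 100 feature sequence given at 50 Hz with a
# 9-frame Hann window and downsample by 5, which yields a 10 Hz feature rate.
def _example_smooth_downsample_feature():
    features = np.random.default_rng(1).random((12, 100))
    smoothed, new_rate = smooth_downsample_feature(features, input_feature_rate=50,
                                                   win_len_smooth=9, downsamp_smooth=5)
    return smoothed.shape, new_rate  # ((12, 20), 10.0)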
@jit(nopython=True)
def normalize_feature(feature: np.ndarray,
norm_ord: int,
threshold: float) -> np.ndarray:
"""Normalizes a feature sequence according to the l^norm_ord norm.
Parameters
----------
feature : np.ndarray
Input feature sequence of size d x N
d: dimensionality of feature vectors
N: number of feature vectors (time in frames)
norm_ord : int
Norm degree
threshold : float
If the norm falls below threshold for a feature vector, then the
normalized feature vector is set to be the normalized unit vector.
Returns
-------
f_normalized : np.ndarray
Normalized feature sequence
"""
# TODO rewrite in vectorized fashion
d, N = feature.shape
f_normalized = np.zeros((d, N))
# normalize the vectors according to the l^norm_ord norm
unit_vec = np.ones(d)
unit_vec = unit_vec / np.linalg.norm(unit_vec, norm_ord)
for k in range(N):
cur_norm = np.linalg.norm(feature[:, k], norm_ord)
if cur_norm < threshold:
f_normalized[:, k] = unit_vec
else:
f_normalized[:, k] = feature[:, k] / cur_norm
return f_normalized
class FourHeads(Synchronizer):
def __init__(
self,
pathway_multiscale: int = 32,
num_pathway_layers: int = 2,
chunk_size: int = 256,
hop_length: int = 256,
encoder_dim: int = 256,
sr: int = 44100,
num_heads: int = 4,
ffn_dim: int = 128,
num_separator_layers: int = 16,
num_representation_layers: int = 4,
depthwise_conv_kernel_size: int = 31,
dropout: float = 0.25,
use_group_norm: bool = False,
convolution_first: bool = False,
labeling=PerformanceLabel(),
wiring='tiktok'
):
super().__init__(labeling, sr=sr, hop_length=hop_length)
self.main = TinyPathway(dilation=1, hop=hop_length, localize=True,
n_layers=num_pathway_layers, chunk_size=chunk_size)
self.attendant = TinyPathway(dilation=pathway_multiscale, hop=hop_length, localize=False,
n_layers=num_pathway_layers, chunk_size=chunk_size)
assert self.main.hop == self.attendant.hop # they should output with the same sample rate
print('hop in samples:', self.main.hop)
self.input_window = self.attendant.input_window
self.encoder_dim = encoder_dim
self.dropout = nn.Dropout(dropout)
# merge two streams into a conformer input
self.stream_merger = nn.Sequential(self.dropout,
nn.Linear(self.main.out_dim + self.attendant.out_dim, self.encoder_dim))
print('main stream window:', self.main.input_window,
', attendant stream window:', self.attendant.input_window,
', conformer input dim:', self.encoder_dim)
center = ((chunk_size - 1) * self.main.hop) # region labeled with pitch track
main_overlap = self.main.input_window - center
main_overlap = [int(np.floor(main_overlap / 2)), int(np.ceil(main_overlap / 2))]
attendant_overlap = self.attendant.input_window - center
attendant_overlap = [int(np.floor(attendant_overlap / 2)), int(np.ceil(attendant_overlap / 2))]
print('main frame overlap:', main_overlap, ', attendant frame overlap:', attendant_overlap)
main_crop_relative = [attendant_overlap[0] - main_overlap[0], main_overlap[1] - attendant_overlap[1]]
print('crop for main pathway', main_crop_relative)
print("Total sequence duration is", self.attendant.input_window, 'samples')
print('Main stream receptive field for one frame is', (self.main.input_window - center), 'samples')
print('Attendant stream receptive field for one frame is', (self.attendant.input_window - center), 'samples')
self.frame_overlap = attendant_overlap
self.main_stream_crop = main_crop_relative
self.max_window_size = self.attendant.input_window
self.chunk_size = chunk_size
self.separator_stream = nn.ModuleList( # source-separation, reinvented
[
ConformerLayer(
input_dim=self.encoder_dim,
ffn_dim=ffn_dim,
num_attention_heads=num_heads,
depthwise_conv_kernel_size=depthwise_conv_kernel_size,
dropout=dropout,
use_group_norm=use_group_norm,
convolution_first=convolution_first,
)
for _ in range(num_separator_layers)
]
)
self.f0_stream = nn.ModuleList(
[
ConformerLayer(
input_dim=self.encoder_dim,
ffn_dim=ffn_dim,
num_attention_heads=num_heads,
depthwise_conv_kernel_size=depthwise_conv_kernel_size,
dropout=dropout,
use_group_norm=use_group_norm,
convolution_first=convolution_first,
)
for _ in range(num_representation_layers)
]
)
self.f0_head = nn.Linear(self.encoder_dim, len(self.labeling.f0_centers_c))
self.note_stream = nn.ModuleList(
[
ConformerLayer(
input_dim=self.encoder_dim,
ffn_dim=ffn_dim,
num_attention_heads=num_heads,
depthwise_conv_kernel_size=depthwise_conv_kernel_size,
dropout=dropout,
use_group_norm=use_group_norm,
convolution_first=convolution_first,
)
for _ in range(num_representation_layers)
]
)
self.note_head = nn.Linear(self.encoder_dim, len(self.labeling.midi_centers))
self.onset_stream = nn.ModuleList(
[
ConformerLayer(
input_dim=self.encoder_dim,
ffn_dim=ffn_dim,
num_attention_heads=num_heads,
depthwise_conv_kernel_size=depthwise_conv_kernel_size,
dropout=dropout,
use_group_norm=use_group_norm,
convolution_first=convolution_first,
)
for _ in range(num_representation_layers)
]
)
self.onset_head = nn.Linear(self.encoder_dim, len(self.labeling.midi_centers))
self.offset_stream = nn.ModuleList(
[
ConformerLayer(
input_dim=self.encoder_dim,
ffn_dim=ffn_dim,
num_attention_heads=num_heads,
depthwise_conv_kernel_size=depthwise_conv_kernel_size,
dropout=dropout,
use_group_norm=use_group_norm,
convolution_first=convolution_first,
)
for _ in range(num_representation_layers)
]
)
self.offset_head = nn.Linear(self.encoder_dim, len(self.labeling.midi_centers))
self.labeling = labeling
self.double_merger = nn.Sequential(self.dropout, nn.Linear(2 * self.encoder_dim, self.encoder_dim))
self.triple_merger = nn.Sequential(self.dropout, nn.Linear(3 * self.encoder_dim, self.encoder_dim))
self.wiring = wiring
print('Total parameter count: ', self.count_parameters())
def count_parameters(self) -> int:
""" Count parameters of encoder """
return sum([p.numel() for p in self.parameters()])
def stream(self, x, representation, key_padding_mask=None):
for i, layer in enumerate(self.__getattr__('{}_stream'.format(representation))):
x = layer(x, key_padding_mask)
return x
def head(self, x, representation):
return self.__getattr__('{}_head'.format(representation))(x)
def forward(self, x, key_padding_mask=None):
# two auditory streams followed by the separator stream to ensure timbre-awareness
x_attendant = self.attendant(x)
x_main = self.main(x[:, self.main_stream_crop[0]:self.main_stream_crop[1]])
x = self.stream_merger(torch_cat((x_attendant, x_main), -1).squeeze(1))
x = self.stream(x, 'separator', key_padding_mask)
f0 = self.stream(x, 'f0', key_padding_mask) # they say this is a low level feature :)
if self.wiring == 'parallel':
note = self.stream(x, 'note', key_padding_mask)
onset = self.stream(x, 'onset', key_padding_mask)
offset = self.stream(x, 'offset', key_padding_mask)
elif self.wiring == 'tiktok':
onset = self.stream(x, 'onset', key_padding_mask)
offset = self.stream(x, 'offset', key_padding_mask)
# f0 is disconnected, note relies on separator, onset, and offset
note = self.stream(self.triple_merger(torch_cat((x, onset, offset), -1)), 'note', key_padding_mask)
elif self.wiring == 'tiktok2':
onset = self.stream(x, 'onset', key_padding_mask)
offset = self.stream(x, 'offset', key_padding_mask)
# note is connected to f0, onset, and offset
note = self.stream(self.triple_merger(torch_cat((f0, onset, offset), -1)), 'note', key_padding_mask)
elif self.wiring == 'spotify':
# note is connected to f0 only
note = self.stream(f0, 'note', key_padding_mask)
            # here onset and offset are higher-level features informed by the separator and note streams
onset = self.stream(self.double_merger(torch_cat((x, note), -1)), 'onset', key_padding_mask)
offset = self.stream(self.double_merger(torch_cat((x, note), -1)), 'offset', key_padding_mask)
else:
# onset and offset are connected to f0 and separator streams
onset = self.stream(self.double_merger(torch_cat((x, f0), -1)), 'onset', key_padding_mask)
offset = self.stream(self.double_merger(torch_cat((x, f0), -1)), 'offset', key_padding_mask)
# note is connected to f0, onset, and offset streams
note = self.stream(self.triple_merger(torch_cat((f0, onset, offset), -1)), 'note', key_padding_mask)
return {'f0': self.head(f0, 'f0'),
'note': self.head(note, 'note'),
'onset': self.head(onset, 'onset'),
'offset': self.head(offset, 'offset')}
class PretrainedModel(FourHeads):
    def __init__(self, model_json: dict, model: str, device):
        labeling = PerformanceLabel(note_min=model_json['note_low'], note_max=model_json['note_high'],
                                    f0_bins_per_semitone=model_json['f0_bins_per_semitone'], f0_tolerance_c=200,
                                    f0_smooth_std_c=model_json['f0_smooth_std_c'],
                                    onset_smooth_std=model_json['onset_smooth_std'])
        super().__init__(pathway_multiscale=model_json['pathway_multiscale'],
                         num_pathway_layers=model_json['num_pathway_layers'], wiring=model_json['wiring'],
                         hop_length=model_json['hop_length'], chunk_size=model_json['chunk_size'],
                         labeling=labeling, sr=model_json['sampling_rate'])
self.load_state_dict(torch_load(model, map_location=device,weights_only=True))
self.eval()
def merge_violin_tracks(self,mid:MidiFile):
new_mid = MidiFile(ticks_per_beat=mid.ticks_per_beat)
new_track = MidiTrack()
new_mid.tracks.append(new_track)
events = []
for track in mid.tracks:
current_time = 0
for msg in track:
current_time += msg.time
events.append((current_time, msg))
events.sort(key=lambda x: x[0])
last_time = 0
for event_time, msg in events:
delta_time = event_time - last_time
new_track.append(msg.copy(time=delta_time))
last_time = event_time
for track in mid.tracks:
for msg in track:
if msg.type == 'set_tempo':
new_track.insert(0, msg)
return new_mid
def transcribe_music(self, audio, batch_size, postprocessing):
self.transcribe(audio, batch_size, postprocessing).write("output.mid")
self.merge_violin_tracks(MidiFile("output.mid")).save("output.mid")
return "output.mid"