# Copyright 2022 The MT3 Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """TensorBoard summaries and utilities.""" from typing import Any, Mapping, Optional, Sequence, Tuple import librosa from mt3 import note_sequences from mt3 import spectrograms import note_seq from note_seq import midi_synth from note_seq import sequences_lib from note_seq.protobuf import music_pb2 import numpy as np import seqio _DEFAULT_AUDIO_SECONDS = 30.0 _DEFAULT_PIANOROLL_FRAMES_PER_SECOND = 15 # TODO(iansimon): pick a SoundFont; for some reason the default is all organ def _extract_example_audio( examples: Sequence[Mapping[str, Any]], sample_rate: float, num_seconds: float, audio_key: str = 'raw_inputs' ) -> np.ndarray: """Extract audio from examples. Args: examples: List of examples containing raw audio. sample_rate: Number of samples per second. num_seconds: Number of seconds of audio to include. audio_key: Dictionary key for the raw audio. Returns: An n-by-num_samples numpy array of samples. """ n = len(examples) num_samples = round(num_seconds * sample_rate) all_samples = np.zeros([n, num_samples]) for i, ex in enumerate(examples): samples = ex[audio_key][:num_samples] all_samples[i, :len(samples)] = samples return all_samples def _example_to_note_sequence( example: Mapping[str, Sequence[float]], ns_feature_name: str, note_onset_feature_name: str, note_offset_feature_name: str, note_frequency_feature_name: str, note_confidence_feature_name: str, num_seconds: float ) -> music_pb2.NoteSequence: """Extract NoteSequence from example.""" if ns_feature_name: ns = example[ns_feature_name] else: onset_times = np.array(example[note_onset_feature_name]) pitches = librosa.hz_to_midi( example[note_frequency_feature_name]).round().astype(int) assert len(onset_times) == len(pitches) if note_offset_feature_name or note_confidence_feature_name: offset_times = ( example[note_offset_feature_name] if note_offset_feature_name else onset_times + note_sequences.DEFAULT_NOTE_DURATION ) assert len(onset_times) == len(offset_times) confidences = (np.array(example[note_confidence_feature_name]) if note_confidence_feature_name else None) velocities = np.ceil( note_seq.MAX_MIDI_VELOCITY * confidences if confidences is not None else note_sequences.DEFAULT_VELOCITY * np.ones_like(onset_times) ).astype(int) assert len(onset_times) == len(velocities) ns = note_sequences.note_arrays_to_note_sequence( onset_times=onset_times, offset_times=offset_times, pitches=pitches, velocities=velocities) else: ns = note_sequences.note_arrays_to_note_sequence( onset_times=onset_times, pitches=pitches) return sequences_lib.trim_note_sequence(ns, 0, num_seconds) def _synthesize_example_notes( examples: Sequence[Mapping[str, Sequence[float]]], ns_feature_name: str, note_onset_feature_name: str, note_offset_feature_name: str, note_frequency_feature_name: str, note_confidence_feature_name: str, sample_rate: float, num_seconds: float, ) -> np.ndarray: """Synthesize example notes to audio. Args: examples: List of example dictionaries, containing either serialized NoteSequence protos or note onset times and pitches. ns_feature_name: Name of serialized NoteSequence feature. note_onset_feature_name: Name of note onset times feature. note_offset_feature_name: Name of note offset times feature. note_frequency_feature_name: Name of note frequencies feature. note_confidence_feature_name: Name of note confidences (velocities) feature. sample_rate: Sample rate at which to synthesize. num_seconds: Number of seconds to synthesize for each example. Returns: An n-by-num_samples numpy array of samples. """ if (ns_feature_name is not None) == (note_onset_feature_name is not None): raise ValueError( 'must specify exactly one of NoteSequence feature and onset feature') n = len(examples) num_samples = round(num_seconds * sample_rate) all_samples = np.zeros([n, num_samples]) for i, ex in enumerate(examples): ns = _example_to_note_sequence( ex, ns_feature_name=ns_feature_name, note_onset_feature_name=note_onset_feature_name, note_offset_feature_name=note_offset_feature_name, note_frequency_feature_name=note_frequency_feature_name, note_confidence_feature_name=note_confidence_feature_name, num_seconds=num_seconds) fluidsynth = midi_synth.fluidsynth samples = fluidsynth(ns, sample_rate=sample_rate) if len(samples) > num_samples: samples = samples[:num_samples] all_samples[i, :len(samples)] = samples return all_samples def _examples_to_pianorolls( targets: Sequence[Mapping[str, Sequence[float]]], predictions: Sequence[Mapping[str, Sequence[float]]], ns_feature_suffix: str, note_onset_feature_suffix: str, note_offset_feature_suffix: str, note_frequency_feature_suffix: str, note_confidence_feature_suffix: str, track_specs: Optional[Sequence[note_sequences.TrackSpec]], num_seconds: float, frames_per_second: float ) -> Tuple[np.ndarray, np.ndarray]: """Generate pianoroll images from example notes. Args: targets: List of target dictionaries, containing either serialized NoteSequence protos or note onset times and pitches. predictions: List of prediction dictionaries, containing either serialized NoteSequence protos or note onset times and pitches. ns_feature_suffix: Suffix of serialized NoteSequence feature. note_onset_feature_suffix: Suffix of note onset times feature. note_offset_feature_suffix: Suffix of note offset times feature. note_frequency_feature_suffix: Suffix of note frequencies feature. note_confidence_feature_suffix: Suffix of note confidences (velocities) feature. track_specs: Optional list of TrackSpec objects to indicate a set of tracks into which each NoteSequence should be split. Tracks will be stacked vertically in the pianorolls num_seconds: Number of seconds to show for each example. frames_per_second: Number of pianoroll frames per second. Returns: onset_pianorolls: An n-by-num_pitches-by-num_frames-by-4 numpy array of pianoroll images showing only onsets. full_pianorolls: An n-by-num_pitches-by-num_frames-by-4 numpy array of pianoroll images. """ if (ns_feature_suffix is not None) == (note_onset_feature_suffix is not None): raise ValueError( 'must specify exactly one of NoteSequence feature and onset feature') def ex_to_ns(example, prefix): return _example_to_note_sequence( example=example, ns_feature_name=(prefix + ns_feature_suffix if ns_feature_suffix else None), note_onset_feature_name=(prefix + note_onset_feature_suffix if note_onset_feature_suffix else None), note_offset_feature_name=(prefix + note_offset_feature_suffix if note_offset_feature_suffix else None), note_frequency_feature_name=( prefix + note_frequency_feature_suffix if note_frequency_feature_suffix else None), note_confidence_feature_name=( prefix + note_confidence_feature_suffix if note_confidence_feature_suffix else None), num_seconds=num_seconds) n = len(targets) num_pitches = note_seq.MAX_MIDI_PITCH - note_seq.MIN_MIDI_PITCH + 1 num_frames = round(num_seconds * frames_per_second) num_tracks = len(track_specs) if track_specs else 1 pianoroll_height = num_tracks * num_pitches + (num_tracks - 1) onset_images = np.zeros([n, pianoroll_height, num_frames, 3]) full_images = np.zeros([n, pianoroll_height, num_frames, 3]) for i, (target, pred) in enumerate(zip(targets, predictions)): target_ns, pred_ns = [ ex_to_ns(ex, prefix) for (ex, prefix) in [(target, 'ref_'), (pred, 'est_')] ] # Show lines at frame boundaries. To ensure that these lines are drawn with # the same downsampling and frame selection logic as the real NoteSequences, # use this hack to draw the lines with a NoteSequence that contains notes # across all pitches at all frame start times. start_times_ns = note_seq.NoteSequence() start_times_ns.CopyFrom(target_ns) del start_times_ns.notes[:] for start_time in pred['start_times']: if start_time < target_ns.total_time: for pitch in range( note_seq.MIN_MIDI_PITCH, note_seq.MAX_MIDI_PITCH + 1): start_times_ns.notes.add( pitch=pitch, velocity=100, start_time=start_time, end_time=start_time + (1 / frames_per_second)) start_time_roll = sequences_lib.sequence_to_pianoroll( start_times_ns, frames_per_second=frames_per_second, min_pitch=note_seq.MIN_MIDI_PITCH, max_pitch=note_seq.MAX_MIDI_PITCH, onset_mode='length_ms') num_start_time_frames = min(len(start_time_roll.onsets), num_frames) if track_specs is not None: target_tracks = [note_sequences.extract_track(target_ns, spec.program, spec.is_drum) for spec in track_specs] pred_tracks = [note_sequences.extract_track(pred_ns, spec.program, spec.is_drum) for spec in track_specs] else: target_tracks = [target_ns] pred_tracks = [pred_ns] for j, (target_track, pred_track) in enumerate(zip(target_tracks[::-1], pred_tracks[::-1])): target_roll = sequences_lib.sequence_to_pianoroll( target_track, frames_per_second=frames_per_second, min_pitch=note_seq.MIN_MIDI_PITCH, max_pitch=note_seq.MAX_MIDI_PITCH, onset_mode='length_ms') pred_roll = sequences_lib.sequence_to_pianoroll( pred_track, frames_per_second=frames_per_second, min_pitch=note_seq.MIN_MIDI_PITCH, max_pitch=note_seq.MAX_MIDI_PITCH, onset_mode='length_ms') num_target_frames = min(len(target_roll.onsets), num_frames) num_pred_frames = min(len(pred_roll.onsets), num_frames) start_offset = j * (num_pitches + 1) end_offset = (j + 1) * (num_pitches + 1) - 1 # Onsets onset_images[ i, start_offset:end_offset, :num_start_time_frames, 0 ] = start_time_roll.onsets[:num_start_time_frames, :].T onset_images[ i, start_offset:end_offset, :num_target_frames, 1 ] = target_roll.onsets[:num_target_frames, :].T onset_images[ i, start_offset:end_offset, :num_pred_frames, 2 ] = pred_roll.onsets[:num_pred_frames, :].T # Full notes full_images[ i, start_offset:end_offset, :num_start_time_frames, 0 ] = start_time_roll.onsets[:num_start_time_frames, :].T full_images[ i, start_offset:end_offset, :num_target_frames, 1 ] = target_roll.active[:num_target_frames, :].T full_images[ i, start_offset:end_offset, :num_pred_frames, 2 ] = pred_roll.active[:num_pred_frames, :].T # Add separator between tracks. if j < num_tracks - 1: onset_images[i, end_offset, :, 0] = 1 full_images[i, end_offset, :, 0] = 1 return onset_images[:, ::-1, :, :], full_images[:, ::-1, :, :] def prettymidi_pianoroll( track_pianorolls: Mapping[str, Sequence[Tuple[np.ndarray, np.ndarray]]], fps: float, num_seconds=_DEFAULT_AUDIO_SECONDS ) -> Mapping[str, seqio.metrics.MetricValue]: """Create summary from given pianorolls.""" max_len = int(num_seconds * fps) summaries = {} for inst_name, all_prs in track_pianorolls.items(): est_prs, ref_prs = zip(*all_prs) bs = len(ref_prs) pianoroll_image_batch = np.zeros(shape=(bs, 128, max_len, 3)) for i in range(bs): ref_pr = ref_prs[i][:, :max_len] est_pr = est_prs[i][:, :max_len] pianoroll_image_batch[i, :, :est_pr.shape[1], 2] = est_pr pianoroll_image_batch[i, :, :ref_pr.shape[1], 1] = ref_pr if not inst_name: inst_name = 'all instruments' summaries[f'{inst_name} pretty_midi pianoroll'] = seqio.metrics.Image( image=pianoroll_image_batch, max_outputs=bs) return summaries def audio_summaries( targets: Sequence[Mapping[str, Sequence[float]]], predictions: Sequence[Mapping[str, Sequence[float]]], spectrogram_config: spectrograms.SpectrogramConfig, num_seconds: float = _DEFAULT_AUDIO_SECONDS ) -> Mapping[str, seqio.metrics.MetricValue]: """Compute audio summaries for a list of examples. Args: targets: List of targets, unused as we pass the input audio tokens via predictions. predictions: List of predictions, including input audio tokens. spectrogram_config: Spectrogram configuration. num_seconds: Number of seconds of audio to include in the summaries. Longer audio will be cropped (from the beginning), shorter audio will be padded with silence (at the end). Returns: A dictionary mapping "audio" to the audio summaries. """ del targets samples = _extract_example_audio( examples=predictions, sample_rate=spectrogram_config.sample_rate, num_seconds=num_seconds) return { 'audio': seqio.metrics.Audio( audiodata=samples[:, :, np.newaxis], sample_rate=spectrogram_config.sample_rate, max_outputs=samples.shape[0]) } def transcription_summaries( targets: Sequence[Mapping[str, Sequence[float]]], predictions: Sequence[Mapping[str, Sequence[float]]], spectrogram_config: spectrograms.SpectrogramConfig, ns_feature_suffix: Optional[str] = None, note_onset_feature_suffix: Optional[str] = None, note_offset_feature_suffix: Optional[str] = None, note_frequency_feature_suffix: Optional[str] = None, note_confidence_feature_suffix: Optional[str] = None, track_specs: Optional[Sequence[note_sequences.TrackSpec]] = None, num_seconds: float = _DEFAULT_AUDIO_SECONDS, pianoroll_frames_per_second: float = _DEFAULT_PIANOROLL_FRAMES_PER_SECOND, ) -> Mapping[str, seqio.metrics.MetricValue]: """Compute note transcription summaries for multiple examples. Args: targets: List of targets containing ground truth. predictions: List of predictions, including raw input audio. spectrogram_config: The spectrogram configuration. ns_feature_suffix: Suffix of serialized NoteSequence feature. note_onset_feature_suffix: Suffix of note onset times feature. note_offset_feature_suffix: Suffix of note offset times feature. note_frequency_feature_suffix: Suffix of note frequencies feature. note_confidence_feature_suffix: Suffix of note confidences (velocities) feature. track_specs: Optional list of TrackSpec objects to indicate a set of tracks into which each NoteSequence should be split. num_seconds: Number of seconds of audio to include in the summaries. Longer audio will be cropped (from the beginning), shorter audio will be padded with silence (at the end). pianoroll_frames_per_second: Temporal resolution of pianoroll images. Returns: A dictionary of input, ground truth, and transcription summaries. """ audio_samples = _extract_example_audio( examples=predictions, sample_rate=spectrogram_config.sample_rate, num_seconds=num_seconds) def synthesize(examples, prefix): return _synthesize_example_notes( examples=examples, ns_feature_name=(prefix + ns_feature_suffix if ns_feature_suffix else None), note_onset_feature_name=(prefix + note_onset_feature_suffix if note_onset_feature_suffix else None), note_offset_feature_name=(prefix + note_offset_feature_suffix if note_offset_feature_suffix else None), note_frequency_feature_name=( prefix + note_frequency_feature_suffix if note_frequency_feature_suffix else None), note_confidence_feature_name=( prefix + note_confidence_feature_suffix if note_confidence_feature_suffix else None), sample_rate=spectrogram_config.sample_rate, num_seconds=num_seconds) synthesized_predictions = synthesize(predictions, 'est_') onset_pianoroll_images, full_pianoroll_images = _examples_to_pianorolls( targets=targets, predictions=predictions, ns_feature_suffix=ns_feature_suffix, note_onset_feature_suffix=note_onset_feature_suffix, note_offset_feature_suffix=note_offset_feature_suffix, note_frequency_feature_suffix=note_frequency_feature_suffix, note_confidence_feature_suffix=note_confidence_feature_suffix, track_specs=track_specs, num_seconds=num_seconds, frames_per_second=pianoroll_frames_per_second) return { 'input_with_transcription': seqio.metrics.Audio( audiodata=np.stack([audio_samples, synthesized_predictions], axis=2), sample_rate=spectrogram_config.sample_rate, max_outputs=audio_samples.shape[0]), 'pianoroll': seqio.metrics.Image( image=full_pianoroll_images, max_outputs=full_pianoroll_images.shape[0]), 'onset_pianoroll': seqio.metrics.Image( image=onset_pianoroll_images, max_outputs=onset_pianoroll_images.shape[0]), }