# Spaces: juancopi81/youtube-music-transcribe (Build error)
# File size: 18,358 Bytes
# b100e1c

# Copyright 2022 The MT3 Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""TensorBoard summaries and utilities."""

from typing import Any, Mapping, Optional, Sequence, Tuple

import librosa

from mt3 import note_sequences
from mt3 import spectrograms

import note_seq
from note_seq import midi_synth
from note_seq import sequences_lib
from note_seq.protobuf import music_pb2

import numpy as np
import seqio


# Default number of seconds covered by each summary (used as the default
# `num_seconds` of the public summary functions in this module).
_DEFAULT_AUDIO_SECONDS = 30.0
# Default number of pianoroll frames per second (used as the default
# `pianoroll_frames_per_second` of `transcription_summaries`).
_DEFAULT_PIANOROLL_FRAMES_PER_SECOND = 15

# TODO(iansimon): pick a SoundFont; for some reason the default is all organ


def _extract_example_audio(
    examples: Sequence[Mapping[str, Any]],
    sample_rate: float,
    num_seconds: float,
    audio_key: str = 'raw_inputs'
) -> np.ndarray:
  """Gather a fixed-length audio clip from every example.

  Each clip is cropped to its first `round(num_seconds * sample_rate)` samples;
  clips shorter than that are left zero-padded at the end.

  Args:
    examples: List of examples containing raw audio.
    sample_rate: Number of samples per second.
    num_seconds: Number of seconds of audio to keep per example.
    audio_key: Dictionary key under which each example stores its raw audio.

  Returns:
    A numpy array with one row per example and one column per kept sample.
  """
  clip_length = round(num_seconds * sample_rate)
  batch = np.zeros([len(examples), clip_length])
  # Iterating over a 2-D array yields row views, so writes land in `batch`.
  for row, example in zip(batch, examples):
    clip = example[audio_key][:clip_length]
    row[:len(clip)] = clip
  return batch


def _example_to_note_sequence(
    example: Mapping[str, Sequence[float]],
    ns_feature_name: str,
    note_onset_feature_name: str,
    note_offset_feature_name: str,
    note_frequency_feature_name: str,
    note_confidence_feature_name: str,
    num_seconds: float
) -> music_pb2.NoteSequence:
  """Build a NoteSequence from an example and trim it to `num_seconds`.

  If `ns_feature_name` is set, the value stored under that key is used directly.
  Otherwise the sequence is assembled from per-note arrays: onset times,
  frequencies in Hz (rounded to the nearest MIDI pitch), and optionally offset
  times and confidences (scaled to MIDI velocities).

  Args:
    example: Example dictionary holding the note features.
    ns_feature_name: Key of the NoteSequence feature, or a falsy value to build
        the sequence from note arrays instead.
    note_onset_feature_name: Key of the note onset times feature.
    note_offset_feature_name: Key of the note offset times feature, or a falsy
        value to use onset + `note_sequences.DEFAULT_NOTE_DURATION`.
    note_frequency_feature_name: Key of the note frequencies feature.
    note_confidence_feature_name: Key of the note confidences feature, or a
        falsy value to use `note_sequences.DEFAULT_VELOCITY` for every note.
    num_seconds: End time (in seconds) at which the sequence is trimmed.

  Returns:
    The NoteSequence restricted to the interval [0, num_seconds].
  """
  if ns_feature_name:
    return sequences_lib.trim_note_sequence(
        example[ns_feature_name], 0, num_seconds)

  onsets = np.array(example[note_onset_feature_name])
  hz = example[note_frequency_feature_name]
  midi_pitches = librosa.hz_to_midi(hz).round().astype(int)
  assert len(onsets) == len(midi_pitches)

  if not (note_offset_feature_name or note_confidence_feature_name):
    # Only onsets and pitches are available.
    sequence = note_sequences.note_arrays_to_note_sequence(
        onset_times=onsets, pitches=midi_pitches)
    return sequences_lib.trim_note_sequence(sequence, 0, num_seconds)

  if note_offset_feature_name:
    offsets = example[note_offset_feature_name]
  else:
    offsets = onsets + note_sequences.DEFAULT_NOTE_DURATION
  assert len(onsets) == len(offsets)

  if note_confidence_feature_name:
    confidences = np.array(example[note_confidence_feature_name])
    raw_velocities = note_seq.MAX_MIDI_VELOCITY * confidences
  else:
    raw_velocities = note_sequences.DEFAULT_VELOCITY * np.ones_like(onsets)
  note_velocities = np.ceil(raw_velocities).astype(int)
  assert len(onsets) == len(note_velocities)

  sequence = note_sequences.note_arrays_to_note_sequence(
      onset_times=onsets, offset_times=offsets,
      pitches=midi_pitches, velocities=note_velocities)
  return sequences_lib.trim_note_sequence(sequence, 0, num_seconds)


def _synthesize_example_notes(
    examples: Sequence[Mapping[str, Sequence[float]]],
    ns_feature_name: str,
    note_onset_feature_name: str,
    note_offset_feature_name: str,
    note_frequency_feature_name: str,
    note_confidence_feature_name: str,
    sample_rate: float,
    num_seconds: float,
) -> np.ndarray:
  """Render the notes of each example to audio with FluidSynth.

  Args:
    examples: List of example dictionaries, containing either a NoteSequence
        feature or note onset times and pitches.
    ns_feature_name: Name of the NoteSequence feature.
    note_onset_feature_name: Name of note onset times feature.
    note_offset_feature_name: Name of note offset times feature.
    note_frequency_feature_name: Name of note frequencies feature.
    note_confidence_feature_name: Name of note confidences (velocities) feature.
    sample_rate: Sample rate at which to synthesize.
    num_seconds: Number of seconds to synthesize for each example.

  Returns:
    An n-by-num_samples numpy array of samples; audio longer than num_samples
    is cropped and shorter audio is zero-padded at the end.

  Raises:
    ValueError: If both or neither of `ns_feature_name` and
        `note_onset_feature_name` are given (i.e. are not None).
  """
  has_ns_feature = ns_feature_name is not None
  has_onset_feature = note_onset_feature_name is not None
  if has_ns_feature == has_onset_feature:
    raise ValueError(
        'must specify exactly one of NoteSequence feature and onset feature')

  clip_length = round(num_seconds * sample_rate)
  rendered = np.zeros([len(examples), clip_length])

  # Each `row` is a view into `rendered`, so assignments fill the batch.
  for row, example in zip(rendered, examples):
    sequence = _example_to_note_sequence(
        example,
        ns_feature_name=ns_feature_name,
        note_onset_feature_name=note_onset_feature_name,
        note_offset_feature_name=note_offset_feature_name,
        note_frequency_feature_name=note_frequency_feature_name,
        note_confidence_feature_name=note_confidence_feature_name,
        num_seconds=num_seconds)
    audio = midi_synth.fluidsynth(sequence, sample_rate=sample_rate)
    audio = audio[:clip_length]
    row[:len(audio)] = audio

  return rendered


def _examples_to_pianorolls(
    targets: Sequence[Mapping[str, Sequence[float]]],
    predictions: Sequence[Mapping[str, Sequence[float]]],
    ns_feature_suffix: str,
    note_onset_feature_suffix: str,
    note_offset_feature_suffix: str,
    note_frequency_feature_suffix: str,
    note_confidence_feature_suffix: str,
    track_specs: Optional[Sequence[note_sequences.TrackSpec]],
    num_seconds: float,
    frames_per_second: float
) -> Tuple[np.ndarray, np.ndarray]:
  """Generate pianoroll images from example notes.

  Every image has three channels: channel 0 marks the start times listed in
  the prediction's 'start_times' entry (plus the separator rows between
  tracks), channel 1 shows the target notes and channel 2 the predicted notes.
  Target features are looked up with the 'ref_' prefix and prediction features
  with the 'est_' prefix.

  Args:
    targets: List of target dictionaries, containing either a NoteSequence
        feature or note onset times and pitches.
    predictions: List of prediction dictionaries, containing either a
        NoteSequence feature or note onset times and pitches. Each prediction
        must also contain a 'start_times' entry.
    ns_feature_suffix: Suffix of the NoteSequence feature.
    note_onset_feature_suffix: Suffix of note onset times feature.
    note_offset_feature_suffix: Suffix of note offset times feature.
    note_frequency_feature_suffix: Suffix of note frequencies feature.
    note_confidence_feature_suffix: Suffix of note confidences (velocities)
        feature.
    track_specs: Optional list of TrackSpec objects to indicate a set of tracks
        into which each NoteSequence should be split. Tracks will be stacked
        vertically in the pianorolls. None or an empty list means the whole
        NoteSequence is drawn as a single track.
    num_seconds: Number of seconds to show for each example.
    frames_per_second: Number of pianoroll frames per second.

  Returns:
    onset_pianorolls: An n-by-height-by-num_frames-by-3 numpy array of
        pianoroll images showing only onsets, where
        height = num_tracks * num_pitches + (num_tracks - 1).
    full_pianorolls: An n-by-height-by-num_frames-by-3 numpy array of
        pianoroll images.

  Raises:
    ValueError: If both or neither of `ns_feature_suffix` and
        `note_onset_feature_suffix` are given (i.e. are not None).
  """
  if (ns_feature_suffix is not None) == (note_onset_feature_suffix is not None):
    raise ValueError(
        'must specify exactly one of NoteSequence feature and onset feature')

  def ex_to_ns(example, prefix):
    """Builds the trimmed NoteSequence of `example` from prefixed features."""
    return _example_to_note_sequence(
        example=example,
        ns_feature_name=(prefix + ns_feature_suffix
                         if ns_feature_suffix else None),
        note_onset_feature_name=(prefix + note_onset_feature_suffix
                                 if note_onset_feature_suffix else None),
        note_offset_feature_name=(prefix + note_offset_feature_suffix
                                  if note_offset_feature_suffix else None),
        note_frequency_feature_name=(
            prefix + note_frequency_feature_suffix
            if note_frequency_feature_suffix else None),
        note_confidence_feature_name=(
            prefix + note_confidence_feature_suffix
            if note_confidence_feature_suffix else None),
        num_seconds=num_seconds)

  def to_pianoroll(ns):
    """Renders `ns` over the full MIDI pitch range at `frames_per_second`."""
    return sequences_lib.sequence_to_pianoroll(
        ns,
        frames_per_second=frames_per_second,
        min_pitch=note_seq.MIN_MIDI_PITCH,
        max_pitch=note_seq.MAX_MIDI_PITCH,
        onset_mode='length_ms')

  n = len(targets)
  num_pitches = note_seq.MAX_MIDI_PITCH - note_seq.MIN_MIDI_PITCH + 1
  num_frames = round(num_seconds * frames_per_second)
  num_tracks = len(track_specs) if track_specs else 1
  # One separator row between each pair of vertically adjacent tracks.
  pianoroll_height = num_tracks * num_pitches + (num_tracks - 1)

  onset_images = np.zeros([n, pianoroll_height, num_frames, 3])
  full_images = np.zeros([n, pianoroll_height, num_frames, 3])

  for i, (target, pred) in enumerate(zip(targets, predictions)):
    target_ns = ex_to_ns(target, 'ref_')
    pred_ns = ex_to_ns(pred, 'est_')

    # Show lines at frame boundaries. To ensure that these lines are drawn with
    # the same downsampling and frame selection logic as the real NoteSequences,
    # use this hack to draw the lines with a NoteSequence that contains notes
    # across all pitches at all frame start times.
    start_times_ns = note_seq.NoteSequence()
    start_times_ns.CopyFrom(target_ns)
    del start_times_ns.notes[:]
    for start_time in pred['start_times']:
      if start_time < target_ns.total_time:
        for pitch in range(
            note_seq.MIN_MIDI_PITCH, note_seq.MAX_MIDI_PITCH + 1):
          start_times_ns.notes.add(
              pitch=pitch,
              velocity=100,
              start_time=start_time,
              end_time=start_time + (1 / frames_per_second))

    start_time_roll = to_pianoroll(start_times_ns)
    num_start_time_frames = min(len(start_time_roll.onsets), num_frames)
    # The same start-time lines are drawn on every track of this example.
    start_time_lines = start_time_roll.onsets[:num_start_time_frames, :].T

    # Use truthiness here so an empty `track_specs` is handled the same way as
    # in the `num_tracks` computation above (one track holding all notes).
    if track_specs:
      target_tracks = [note_sequences.extract_track(target_ns,
                                                    spec.program, spec.is_drum)
                       for spec in track_specs]
      pred_tracks = [note_sequences.extract_track(pred_ns,
                                                  spec.program, spec.is_drum)
                     for spec in track_specs]
    else:
      target_tracks = [target_ns]
      pred_tracks = [pred_ns]

    # Tracks are laid out in reverse order because the row axis is flipped
    # when the images are returned.
    for j, (target_track, pred_track) in enumerate(zip(target_tracks[::-1],
                                                       pred_tracks[::-1])):
      target_roll = to_pianoroll(target_track)
      pred_roll = to_pianoroll(pred_track)

      num_target_frames = min(len(target_roll.onsets), num_frames)
      num_pred_frames = min(len(pred_roll.onsets), num_frames)

      # Rows [start_offset, end_offset) hold this track's num_pitches rows;
      # row end_offset is the separator row.
      start_offset = j * (num_pitches + 1)
      end_offset = (j + 1) * (num_pitches + 1) - 1

      # Onsets
      onset_images[
          i, start_offset:end_offset, :num_start_time_frames, 0
      ] = start_time_lines
      onset_images[
          i, start_offset:end_offset, :num_target_frames, 1
      ] = target_roll.onsets[:num_target_frames, :].T
      onset_images[
          i, start_offset:end_offset, :num_pred_frames, 2
      ] = pred_roll.onsets[:num_pred_frames, :].T

      # Full notes
      full_images[
          i, start_offset:end_offset, :num_start_time_frames, 0
      ] = start_time_lines
      full_images[
          i, start_offset:end_offset, :num_target_frames, 1
      ] = target_roll.active[:num_target_frames, :].T
      full_images[
          i, start_offset:end_offset, :num_pred_frames, 2
      ] = pred_roll.active[:num_pred_frames, :].T

      # Add separator between tracks.
      if j < num_tracks - 1:
        onset_images[i, end_offset, :, 0] = 1
        full_images[i, end_offset, :, 0] = 1

  # Reverse the row axis so that the highest row index comes first.
  return onset_images[:, ::-1, :, :], full_images[:, ::-1, :, :]


def prettymidi_pianoroll(
    track_pianorolls: Mapping[str, Sequence[Tuple[np.ndarray, np.ndarray]]],
    fps: float,
    num_seconds=_DEFAULT_AUDIO_SECONDS
) -> Mapping[str, seqio.metrics.MetricValue]:
  """Build one image summary per instrument from paired pianorolls.

  Args:
    track_pianorolls: Maps an instrument name to a sequence of
        (estimated, reference) pianoroll pairs. A falsy name is reported as
        'all instruments'.
    fps: Pianoroll frames per second.
    num_seconds: Number of seconds to show; pianorolls are cropped to
        `int(num_seconds * fps)` frames along their second axis.

  Returns:
    A dictionary mapping '<instrument> pretty_midi pianoroll' to an image
    summary in which channel 1 holds the reference pianorolls and channel 2
    the estimated ones.
  """
  frame_limit = int(num_seconds * fps)
  summaries = {}
  for inst_name, pr_pairs in track_pianorolls.items():
    est_prs, ref_prs = zip(*pr_pairs)
    batch_size = len(ref_prs)
    images = np.zeros(shape=(batch_size, 128, frame_limit, 3))

    # `image` is a view into `images`, so the writes below fill the batch.
    for image, est_full, ref_full in zip(images, est_prs, ref_prs):
      ref_clip = ref_full[:, :frame_limit]
      est_clip = est_full[:, :frame_limit]
      image[:, :est_clip.shape[1], 2] = est_clip
      image[:, :ref_clip.shape[1], 1] = ref_clip

    label = inst_name if inst_name else 'all instruments'
    summaries[f'{label} pretty_midi pianoroll'] = seqio.metrics.Image(
        image=images, max_outputs=batch_size)

  return summaries


def audio_summaries(
    targets: Sequence[Mapping[str, Sequence[float]]],
    predictions: Sequence[Mapping[str, Sequence[float]]],
    spectrogram_config: spectrograms.SpectrogramConfig,
    num_seconds: float = _DEFAULT_AUDIO_SECONDS
) -> Mapping[str, seqio.metrics.MetricValue]:
  """Build an audio summary of the raw input audio of each example.

  Args:
    targets: List of targets; ignored, because the input audio is carried by
        the predictions.
    predictions: List of predictions, including the raw input audio.
    spectrogram_config: Spectrogram configuration; supplies the sample rate.
    num_seconds: Number of seconds of audio to include in the summaries.
        Longer audio will be cropped (from the beginning), shorter audio will be
        padded with silence (at the end).

  Returns:
    A dictionary mapping "audio" to the audio summaries.
  """
  del targets  # Unused.
  sample_rate = spectrogram_config.sample_rate
  clips = _extract_example_audio(
      examples=predictions,
      sample_rate=sample_rate,
      num_seconds=num_seconds)
  # Append a trailing axis of size one to the [n, num_samples] clip array.
  summary = seqio.metrics.Audio(
      audiodata=clips[..., np.newaxis],
      sample_rate=sample_rate,
      max_outputs=clips.shape[0])
  return {'audio': summary}


def transcription_summaries(
    targets: Sequence[Mapping[str, Sequence[float]]],
    predictions: Sequence[Mapping[str, Sequence[float]]],
    spectrogram_config: spectrograms.SpectrogramConfig,
    ns_feature_suffix: Optional[str] = None,
    note_onset_feature_suffix: Optional[str] = None,
    note_offset_feature_suffix: Optional[str] = None,
    note_frequency_feature_suffix: Optional[str] = None,
    note_confidence_feature_suffix: Optional[str] = None,
    track_specs: Optional[Sequence[note_sequences.TrackSpec]] = None,
    num_seconds: float = _DEFAULT_AUDIO_SECONDS,
    pianoroll_frames_per_second: float = _DEFAULT_PIANOROLL_FRAMES_PER_SECOND,
) -> Mapping[str, seqio.metrics.MetricValue]:
  """Compute note transcription summaries for multiple examples.

  The audio summary pairs the raw input audio with the synthesized predicted
  notes (features prefixed with 'est_'); the targets are only used for the
  pianoroll images.

  Args:
    targets: List of targets containing ground truth.
    predictions: List of predictions, including raw input audio.
    spectrogram_config: The spectrogram configuration; supplies the sample
        rate.
    ns_feature_suffix: Suffix of the NoteSequence feature.
    note_onset_feature_suffix: Suffix of note onset times feature.
    note_offset_feature_suffix: Suffix of note offset times feature.
    note_frequency_feature_suffix: Suffix of note frequencies feature.
    note_confidence_feature_suffix: Suffix of note confidences (velocities)
        feature.
    track_specs: Optional list of TrackSpec objects to indicate a set of tracks
        into which each NoteSequence should be split.
    num_seconds: Number of seconds of audio to include in the summaries.
        Longer audio will be cropped (from the beginning), shorter audio will be
        padded with silence (at the end).
    pianoroll_frames_per_second: Temporal resolution of pianoroll images.

  Returns:
    A dictionary with the keys 'input_with_transcription' (audio), 'pianoroll'
    and 'onset_pianoroll' (images).
  """
  sample_rate = spectrogram_config.sample_rate

  input_audio = _extract_example_audio(
      examples=predictions,
      sample_rate=sample_rate,
      num_seconds=num_seconds)

  def est_feature_name(suffix):
    """Returns the 'est_'-prefixed feature name, or None for a falsy suffix."""
    return 'est_' + suffix if suffix else None

  transcription_audio = _synthesize_example_notes(
      examples=predictions,
      ns_feature_name=est_feature_name(ns_feature_suffix),
      note_onset_feature_name=est_feature_name(note_onset_feature_suffix),
      note_offset_feature_name=est_feature_name(note_offset_feature_suffix),
      note_frequency_feature_name=est_feature_name(
          note_frequency_feature_suffix),
      note_confidence_feature_name=est_feature_name(
          note_confidence_feature_suffix),
      sample_rate=sample_rate,
      num_seconds=num_seconds)

  onset_images, full_images = _examples_to_pianorolls(
      targets=targets,
      predictions=predictions,
      ns_feature_suffix=ns_feature_suffix,
      note_onset_feature_suffix=note_onset_feature_suffix,
      note_offset_feature_suffix=note_offset_feature_suffix,
      note_frequency_feature_suffix=note_frequency_feature_suffix,
      note_confidence_feature_suffix=note_confidence_feature_suffix,
      track_specs=track_specs,
      num_seconds=num_seconds,
      frames_per_second=pianoroll_frames_per_second)

  # Stack input and synthesized transcription along a new last axis.
  paired_audio = np.stack([input_audio, transcription_audio], axis=2)

  summaries = {}
  summaries['input_with_transcription'] = seqio.metrics.Audio(
      audiodata=paired_audio,
      sample_rate=sample_rate,
      max_outputs=input_audio.shape[0])
  summaries['pianoroll'] = seqio.metrics.Image(
      image=full_images,
      max_outputs=full_images.shape[0])
  summaries['onset_pianoroll'] = seqio.metrics.Image(
      image=onset_images,
      max_outputs=onset_images.shape[0])
  return summaries