# Machine learning and data handling
import tensorflow as tf
import numpy as np
import pandas as pd

# Audio
import pretty_midi

# Displaying
from IPython import display

# Add this file's directory to sys.path so that the VAE class
# can be imported from the local model module
import sys
from pathlib import Path

directory = Path(__file__).resolve().parent
sys.path.insert(0, str(directory))

from model import VAE

# Extras
import collections


_CAP = 3501 # Cap on the number of notes per song
_SAMPLING_RATE = 16000 # Sampling rate (Hz) used to synthesize the audio waveform
_INSTRUMENT_NAME = "Acoustic Grand Piano" # MIDI instrument used
_SCALING_FACTORS = pd.Series(
    {"pitch": 64.024558, "step": 0.101410, "duration": 0.199386}
) # Factors used to normalize song maps
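
# For illustration: a raw note with pitch=64, step=0.1 s, duration=0.2 s is
# normalized to roughly (64/64.0246, 0.1/0.1014, 0.2/0.1994) ≈ (1.0, 0.99, 1.0)
# on the way into the model, and multiplied back by these factors on the way out.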

def midi_to_notes(midi_file: str) -> pd.DataFrame:
  """
  Convert a MIDI file to a "song map" (a dataframe where each note is broken
  into its components). The song must have at least 3501 notes.

  Parameters
  ----------
  midi_file : str
      Path to the MIDI file.

  Returns
  -------
  song_map : pd.DataFrame
      3xN matrix where each column is a note, composed of pitch, step and duration.
  """
  pm = pretty_midi.PrettyMIDI(midi_file)
  instrument = pm.instruments[0]
  notes = collections.defaultdict(list)

  # Sort the notes by start time
  sorted_notes = sorted(instrument.notes, key=lambda note: note.start)

  if len(sorted_notes) < _CAP:
      raise ValueError(f"Song must have at least {_CAP} notes.")
    
  prev_start = sorted_notes[0].start

  # Separate each individual note in pitch, step and duration
  for note in sorted_notes:
    start = note.start
    end = note.end
    notes['pitch'].append(note.pitch)
    notes['step'].append(start - prev_start)
    notes['duration'].append(end - start)
    prev_start = start


  # Put notes in a dataframe
  notes_df = pd.DataFrame({name: np.array(value) for name, value in notes.items()})
  notes_df = notes_df[:_CAP] # Cap the song to match the model's architecture
  song_map = (notes_df / _SCALING_FACTORS).T # Scale and get transpose
  return song_map
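
# Usage sketch (hypothetical path; the file must contain at least _CAP notes):
#
#   song_map = midi_to_notes("songs/example.mid")
#   song_map.shape  # (3, 3501): rows are pitch, step and duration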


def display_audio(pm: pretty_midi.PrettyMIDI, seconds: int=-1) -> display.Audio:
  """
  Display a song in PrettyMIDI format as a display.Audio object.
  This function is especially useful in a Jupyter notebook.

  Parameters
  ----------
  pm : pretty_midi.PrettyMIDI
      PrettyMIDI object containing a song.
  seconds : int
      Number of seconds of the song to display.
      Default ``-1``, for which the full length is taken.

  Returns
  -------
  display_obj : display.Audio
      Song as an object allowing for display.
  """
  waveform = pm.fluidsynth(fs=_SAMPLING_RATE)
  # Take a sample of the generated waveform to mitigate kernel resets
  if seconds == -1: 
      waveform_short = waveform[:]
  else:
      waveform_short = waveform[:seconds*_SAMPLING_RATE]
  
  display_obj = display.Audio(waveform_short, rate=_SAMPLING_RATE)
    
  return display_obj
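
# Notebook usage sketch (hypothetical path; note that pm.fluidsynth requires
# the fluidsynth synthesizer and a soundfont to be installed):
#
#   pm = pretty_midi.PrettyMIDI("songs/example.mid")
#   display_audio(pm, seconds=30)  # render only the first 30 seconds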
    

def map_to_wav(song_map: pd.DataFrame, out_file: str, velocity: int=50) -> pretty_midi.PrettyMIDI:
  """
  Convert a "song map" back to MIDI (the reverse of midi_to_notes),
  generating a PrettyMIDI object in the process and (optionally) saving it.

  Parameters
  ----------
  song_map : pd.DataFrame
      3xN matrix where each column is a note, composed of pitch, step and duration.
  out_file : str
      Path or file to write the .mid file to. If None, no saving is done.
  velocity : int
      Note loudness, i.e., the hardness a piano key is struck with.
      Default ``50``.

  Returns
  -------
  pm : pretty_midi.PrettyMIDI
      PrettyMIDI object containing the song's representation.
  """

  # Get song map as dataframe
  contracted_map = tf.squeeze(song_map)
  song_map_T = contracted_map.numpy().T
  notes = pd.DataFrame(song_map_T, columns=["pitch", "step", "duration"]).mul(_SCALING_FACTORS, axis=1)
  notes["pitch"] = notes["pitch"].astype('int32').clip(1, 127)

  # Instantiate PrettyMIDI object and append notes
  pm = pretty_midi.PrettyMIDI()
  instrument = pretty_midi.Instrument(
      program=pretty_midi.instrument_name_to_program(
          _INSTRUMENT_NAME))

  prev_start = 0
  for _, note in notes.iterrows():
    # The VAE might generate notes with negative step or duration,
    # so we skip these anomalies
    if (note['step'] < 0 or note['duration'] < 0):
        continue

    start = float(prev_start + note['step'])
    end = float(start + note['duration'])
    midi_note = pretty_midi.Note(
        velocity=velocity,
        pitch=int(note['pitch']),
        start=start,
        end=end,
    )
    instrument.notes.append(midi_note)
    prev_start = start

  pm.instruments.append(instrument)

  # If a path was specified, save as midi file
  if out_file:
      pm.write(out_file)
  return pm
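
# Round-trip sketch (hypothetical paths): encode a MIDI file to a song map and
# decode it straight back, which should approximately reproduce the song:
#
#   song_map = midi_to_notes("songs/example.mid")
#   pm = map_to_wav(tf.constant(song_map.values), "songs/roundtrip.mid")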

def generate_and_display(model: VAE, 
                         out_file: str=None, 
                         z_sample: tf.Tensor=None, 
                         velocity: int=50, 
                         seconds: int=-1) -> display.Audio:
  """
  Generate a song, (optionally) save it and display it.

  Parameters
  ----------
  model : VAE
      Instance of VAE to generate the song with.
  out_file : str
      Path or file to write .mid file to.
      Default ``None``, for which no saving is done.
  z_sample : tf.Tensor 
      Song encoding used to generate a song.
      Default ``None``, for which an unconditioned piece is generated.
  velocity : int
      Note loudness, i. e. the hardness a piano key is struck with.
      Default ``50``.
  seconds : int
      Number of seconds of the song to display.
      Default ``-1``, for which the full length is taken.

  Returns
  -------
  display_obj : display.Audio
      Song as an object allowing for display.
  """
    
  song_map = model.generate(z_sample)
  pm = map_to_wav(song_map, out_file, velocity)
  display_obj = display_audio(pm, seconds)

  return display_obj
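
# End-to-end sketch, assuming `model` is a trained VAE instance restored from
# a checkpoint created elsewhere in this project:
#
#   audio = generate_and_display(model, out_file="generated.mid", seconds=30)
#   audio  # in a notebook, the last expression renders an audio player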