sweetcocoa committed on
Commit
88490a8
1 Parent(s): e28a4f0

initial test

README.md CHANGED
@@ -1,12 +1,10 @@
  ---
- title: Pop2piano
- emoji: 🚀
- colorFrom: purple
- colorTo: yellow
+ title: Pop2Piano Demo
+ emoji: 🎹
+ colorFrom: black
+ colorTo: white
  sdk: streamlit
  sdk_version: 1.10.0
  app_file: app.py
- pinned: false
+ pinned: true
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,64 @@
+ import streamlit as st
+ import os
+ from transformer_wrapper import TransformerWrapper
+ from omegaconf import OmegaConf
+
+
+ @st.cache(show_spinner=False)
+ def get_file_content_as_string(path):
+     return open(path, "r", encoding="utf-8").read()
+
+
+ @st.cache(show_spinner=True)
+ def model_load():
+     config = OmegaConf.load("config.yaml")
+     wrapper = TransformerWrapper(config)
+     wrapper = wrapper.load_from_checkpoint(
+         "https://huggingface.co/sweetcocoa/pop2piano/raw/main/model-1999-val_0.67311615.ckpt",
+         config=config,
+     ).cuda()
+     model_id = "dpipqxiy"
+     wrapper.eval()
+     return wrapper, model_id, config
+
+
+ def main():
+     wrapper, model_id, config = model_load()
+     composers = list(config.composer_to_feature_token.keys())
+     dest_dir = "ytsamples"
+     composer = st.selectbox(label="Arranger", options=composers)
+     file_up = st.file_uploader("Upload an audio", type=["mp3", "wav"])
+
+     if st.button("convert"):
+         if file_up is not None:
+             bytes_data = file_up.getvalue()
+             target_file = f"{dest_dir}/{file_up.name}"
+             with open(target_file, "wb") as f:
+                 f.write(bytes_data)
+
+             with st.spinner("Wait for it..."):
+                 midi, arranger, mix_path, midi_path = wrapper.generate(
+                     audio_path=target_file,
+                     composer=composer,
+                     model=model_id,
+                     ignore_duplicate=True,
+                     show_plot=False,
+                     save_midi=True,
+                     save_mix=True,
+                 )
+
+             with open(midi_path, "rb") as midi_f:
+                 file_down = st.download_button(
+                     "Download midi",
+                     data=midi_f,
+                     file_name=os.path.basename(midi_path),
+                 )
+             with open(mix_path, "rb") as audio_f:
+                 st.audio(audio_f.read(), format="audio/wav")
+
+
+ if __name__ == "__main__":
+     main()
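
For reference, here is a minimal sketch of driving the same model from a plain Python script instead of Streamlit, mirroring what `app.py` above does. It is not part of the commit; the audio path and composer key are placeholders, and it assumes a CUDA device plus the dependencies from `requirements.txt` (essentia, librosa, etc.).

```python
# Sketch only: load the wrapper the same way app.py does and convert one file.
from omegaconf import OmegaConf
from transformer_wrapper import TransformerWrapper

config = OmegaConf.load("config.yaml")
wrapper = TransformerWrapper(config)
wrapper = wrapper.load_from_checkpoint(
    "https://huggingface.co/sweetcocoa/pop2piano/raw/main/model-1999-val_0.67311615.ckpt",
    config=config,
).cuda()
wrapper.eval()

# generate() returns the PrettyMIDI object, the composer actually used,
# and the paths where the stereo mix / midi were written.
pm, composer, mix_path, midi_path = wrapper.generate(
    audio_path="some_song.mp3",   # placeholder input file
    composer="composer1",         # any key of config.composer_to_feature_token
    save_midi=True,
    save_mix=True,
)
print(midi_path, mix_path)
```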
config.yaml ADDED
@@ -0,0 +1,61 @@
+ project: pop2piano
+ dataset:
+   target_length: 256
+   input_length: 1024
+   n_bars: 2
+   sample_rate: 22050
+   use_mel: true
+   mel_is_conditioned: true
+ composer_to_feature_token:
+   composer1: 2052
+   composer2: 2053
+   composer3: 2054
+   composer4: 2055
+   composer5: 2056
+   composer6: 2057
+   composer7: 2058
+   composer8: 2059
+   composer9: 2060
+   composer10: 2061
+   composer11: 2062
+   composer12: 2063
+   composer13: 2064
+   composer14: 2065
+   composer15: 2066
+   composer16: 2067
+   composer17: 2068
+   composer18: 2069
+   composer19: 2070
+   composer20: 2071
+   composer21: 2072
+ t5:
+   feed_forward_proj: gated-gelu
+   tie_word_embeddings: false
+   tie_encoder_decoder: false
+   vocab_size: 2400
+   n_positions: 1024
+   relative_attention_num_buckets: 32
+ tokenizer:
+   vocab_size:
+     special: 4
+     note: 128
+     velocity: 2
+     time: 100
+ training:
+   seed: 3407
+   resume: false
+   offline: false
+   num_gpu: 1
+   max_epochs: 5000
+   accumulate_grad_batches: 1
+   check_val_every_n_epoch: 20
+   find_lr: false
+   optimizer: adafactor
+   version: none
+   lr: 0.001
+   lr_min: 1.0e-06
+   lr_scheduler: false
+   lr_decay: 0.99
+   batch_size: 32
+   num_workers: 32
+   gradient_clip_val: 3.0
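
As a side note, the `tokenizer.vocab_size` block above fixes the token-id layout that `midi_tokenizer.py` (added below) works with: ids 0-3 are special tokens (PAD=0, EOS=1, TIE=2), 4-131 are note pitches, 132-133 are velocity off/on, and 134-233 are time shifts, while the `composer_to_feature_token` values (2052-2072) sit above that range inside the T5 vocab of 2400. A small worked example, not part of the commit:

```python
# Sketch: how config.yaml's vocab sizes map to token ids.
from omegaconf import OmegaConf
from midi_tokenizer import MidiTokenizer, TOKEN_NOTE, TOKEN_TIME

config = OmegaConf.load("config.yaml")
tokenizer = MidiTokenizer(config.tokenizer)

print(tokenizer.tokenize_note(60, TOKEN_NOTE))      # 4 + 60 = 64
print(tokenizer.tokenize_note(5, TOKEN_TIME))       # 4 + 128 + 2 + 5 = 139
print(tokenizer.detokenize(64, time_idx_offset=0))  # [TOKEN_NOTE, 60]
```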
layer/__init__.py ADDED
File without changes
layer/input.py ADDED
@@ -0,0 +1,46 @@
+ import torch
+ import torch.nn as nn
+ import torchaudio
+
+
+ class LogMelSpectrogram(nn.Module):
+     def __init__(self) -> None:
+         super().__init__()
+         self.melspectrogram = torchaudio.transforms.MelSpectrogram(
+             sample_rate=22050,
+             n_fft=4096,
+             hop_length=1024,
+             f_min=10.0,
+             n_mels=512,
+         )
+
+     def forward(self, x):
+         # x : audio (batch, sample)
+         # X : melspec (batch, freq, frame)
+         with torch.no_grad():
+             with torch.cuda.amp.autocast(enabled=False):
+                 X = self.melspectrogram(x)
+                 X = X.clamp(min=1e-6).log()
+
+         return X
+
+
+ class ConcatEmbeddingToMel(nn.Module):
+     def __init__(self, embedding_offset, n_vocab, n_dim) -> None:
+         super().__init__()
+         self.embedding = nn.Embedding(num_embeddings=n_vocab, embedding_dim=n_dim)
+         self.embedding_offset = embedding_offset
+
+     def forward(self, feature, index_value):
+         """
+         index_value : (batch, )
+         feature : (batch, time, feature_dim)
+         """
+         index_shifted = index_value - self.embedding_offset
+
+         # (batch, 1, feature_dim)
+         composer_embedding = self.embedding(index_shifted).unsqueeze(1)
+         # (batch, 1 + time, feature_dim)
+         inputs_embeds = torch.cat([composer_embedding, feature], dim=1)
+         return inputs_embeds
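
A quick shape sketch for these two layers (dummy tensors only, not part of the commit; the offset and vocab size follow `config.yaml` above, where composer ids start at 2052):

```python
import torch
from layer.input import LogMelSpectrogram, ConcatEmbeddingToMel

spec = LogMelSpectrogram()
audio = torch.randn(2, 22050 * 4)       # (batch, samples) at 22.05 kHz
mel = spec(audio)                       # (batch, n_mels=512, frames)
feature = mel.transpose(-1, -2)         # (batch, frames, 512), as TransformerWrapper feeds it

# embedding_offset=2052 maps composer ids 2052..2072 to embedding rows 0..20
conditioner = ConcatEmbeddingToMel(embedding_offset=2052, n_vocab=21, n_dim=512)
composer_ids = torch.tensor([2052, 2060])
inputs_embeds = conditioner(feature, composer_ids)
print(mel.shape, inputs_embeds.shape)   # (2, 512, F) and (2, 1 + F, 512)
```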
midi_tokenizer.py ADDED
@@ -0,0 +1,430 @@
1
+ import numpy as np
2
+ from numba import jit
3
+ import pretty_midi
4
+ import scipy.interpolate as interp
5
+
6
+ TOKEN_SPECIAL: int = 0
7
+ TOKEN_NOTE: int = 1
8
+ TOKEN_VELOCITY: int = 2
9
+ TOKEN_TIME: int = 3
10
+
11
+ DEFAULT_VELOCITY: int = 77
12
+
13
+ TIE: int = 2
14
+ EOS: int = 1
15
+ PAD: int = 0
16
+
17
+
18
+ def extrapolate_beat_times(beat_times, n_extend=1):
19
+ beat_times_function = interp.interp1d(
20
+ np.arange(beat_times.size),
21
+ beat_times,
22
+ bounds_error=False,
23
+ fill_value="extrapolate",
24
+ )
25
+
26
+ ext_beats = beat_times_function(
27
+ np.linspace(0, beat_times.size + n_extend - 1, beat_times.size + n_extend)
28
+ )
29
+
30
+ return ext_beats
31
+
32
+
33
+ @jit(nopython=True, cache=True)
34
+ def fast_tokenize(idx, token_type, n_special, n_note, n_velocity):
35
+ if token_type == TOKEN_TIME:
36
+ return n_special + n_note + n_velocity + idx
37
+ elif token_type == TOKEN_VELOCITY:
38
+ return n_special + n_note + idx
39
+ elif token_type == TOKEN_NOTE:
40
+ return n_special + idx
41
+ elif token_type == TOKEN_SPECIAL:
42
+ return idx
43
+ else:
44
+ return -1
45
+
46
+
47
+ @jit(nopython=True, cache=True)
48
+ def fast_detokenize(idx, n_special, n_note, n_velocity, time_idx_offset):
49
+ if idx >= n_special + n_note + n_velocity:
50
+ return (TOKEN_TIME, (idx - (n_special + n_note + n_velocity)) + time_idx_offset)
51
+ elif idx >= n_special + n_note:
52
+ return TOKEN_VELOCITY, idx - (n_special + n_note)
53
+ elif idx >= n_special:
54
+ return TOKEN_NOTE, idx - n_special
55
+ else:
56
+ return TOKEN_SPECIAL, idx
57
+
58
+
59
+ class MidiTokenizer:
60
+ def __init__(self, config) -> None:
61
+ self.config = config
62
+
63
+ def tokenize_note(self, idx, token_type):
64
+ rt = fast_tokenize(
65
+ idx,
66
+ token_type,
67
+ self.config.vocab_size.special,
68
+ self.config.vocab_size.note,
69
+ self.config.vocab_size.velocity,
70
+ )
71
+ if rt == -1:
72
+ raise ValueError(f"type {token_type} is not a predefined token type.")
73
+ else:
74
+ return rt
75
+
76
+ def notes_to_tokens(self, notes):
77
+ """
78
+ notes : (onset idx, offset idx, pitch, velocity)
79
+ """
80
+ max_time_idx = notes[:, :2].max()
81
+
82
+ times = [[] for i in range((max_time_idx + 1))]
83
+ for onset, offset, pitch, velocity in notes:
84
+ times[onset].append([pitch, velocity])
85
+ times[offset].append([pitch, 0])
86
+
87
+ tokens = []
88
+ current_velocity = 0
89
+ for i, time in enumerate(times):
90
+ if len(time) == 0:
91
+ continue
92
+ tokens.append(self.tokenize_note(i, TOKEN_TIME))
93
+ for pitch, velocity in time:
94
+ velocity = int(velocity > 0)
95
+ if current_velocity != velocity:
96
+ current_velocity = velocity
97
+ tokens.append(self.tokenize_note(velocity, TOKEN_VELOCITY))
98
+ tokens.append(self.tokenize_note(pitch, TOKEN_NOTE))
99
+
100
+ return np.array(tokens, dtype=int)
101
+
102
+ def detokenize(self, token, time_idx_offset):
103
+ type, value = fast_detokenize(
104
+ token,
105
+ n_special=self.config.vocab_size.special,
106
+ n_note=self.config.vocab_size.note,
107
+ n_velocity=self.config.vocab_size.velocity,
108
+ time_idx_offset=time_idx_offset,
109
+ )
110
+ if type != TOKEN_TIME:
111
+ value = int(value)
112
+ return [type, value]
113
+
114
+ def to_string(self, tokens, time_idx_offset=0):
115
+ nums = [
116
+ self.detokenize(token, time_idx_offset=time_idx_offset) for token in tokens
117
+ ]
118
+ strings = []
119
+ for i in range(len(nums)):
120
+ type = nums[i][0]
121
+ value = nums[i][1]
122
+
123
+ if type == TOKEN_TIME:
124
+ type = "time"
125
+ elif type == TOKEN_SPECIAL:
126
+ if value == EOS:
127
+ value = "EOS"
128
+ elif value == PAD:
129
+ value = "PAD"
130
+ elif value == TIE:
131
+ value = "TIE"
132
+ else:
133
+ value = "Unknown Special"
134
+ elif type == TOKEN_NOTE:
135
+ type = "note"
136
+ elif type == TOKEN_VELOCITY:
137
+ type = "velocity"
138
+ strings.append((type, value))
139
+ return strings
140
+
141
+ def split_notes(self, notes, beatsteps, time_from, time_to):
142
+ """
143
+ Assumptions
144
+ - notes are sorted by onset time
145
+ - beatsteps are sorted by time
146
+ """
147
+ start_idx = np.searchsorted(beatsteps, time_from)
148
+ start_note = np.searchsorted(notes[:, 0], start_idx)
149
+
150
+ end_idx = np.searchsorted(beatsteps, time_to)
151
+ end_note = np.searchsorted(notes[:, 0], end_idx)
152
+ splited_notes = notes[start_note:end_note]
153
+
154
+ return splited_notes, (start_idx, end_idx, start_note, end_note)
155
+
156
+ def notes_to_relative_tokens(
157
+ self, notes, offset_idx, add_eos=False, add_composer=False, composer_value=None
158
+ ):
159
+ """
160
+ notes : (onset idx, offset idx, pitch, velocity)
161
+ """
162
+
163
+ def _add_eos(tokens):
164
+ tokens = np.concatenate((tokens, np.array([EOS], dtype=tokens.dtype)))
165
+ return tokens
166
+
167
+ def _add_composer(tokens, composer_value):
168
+ tokens = np.concatenate(
169
+ (np.array([composer_value], dtype=tokens.dtype), tokens)
170
+ )
171
+ return tokens
172
+
173
+ if len(notes) == 0:
174
+ tokens = np.array([], dtype=int)
175
+ if add_eos:
176
+ tokens = _add_eos(tokens)
177
+ if add_composer:
178
+ tokens = _add_composer(tokens, composer_value=composer_value)
179
+ return tokens
180
+
181
+ max_time_idx = notes[:, :2].max()
182
+
183
+ # times[time_idx] = [[pitch, .. ], [pitch, 0], ..]
184
+ times = [[] for i in range((max_time_idx + 1 - offset_idx))]
185
+ for abs_onset, abs_offset, pitch, velocity in notes:
186
+ rel_onset = abs_onset - offset_idx
187
+ rel_offset = abs_offset - offset_idx
188
+ times[rel_onset].append([pitch, velocity])
189
+ times[rel_offset].append([pitch, 0])
190
+
191
+ # from here on, everything is relative to time 0 (the offset)
192
+ tokens = []
193
+ current_velocity = 0
194
+ current_time_idx = 0
195
+
196
+ for rel_idx, time in enumerate(times):
197
+ if len(time) == 0:
198
+ continue
199
+ time_idx_shift = rel_idx - current_time_idx
200
+ current_time_idx = rel_idx
201
+
202
+ tokens.append(self.tokenize_note(time_idx_shift, TOKEN_TIME))
203
+ for pitch, velocity in time:
204
+ velocity = int(velocity > 0)
205
+ if current_velocity != velocity:
206
+ current_velocity = velocity
207
+ tokens.append(self.tokenize_note(velocity, TOKEN_VELOCITY))
208
+ tokens.append(self.tokenize_note(pitch, TOKEN_NOTE))
209
+
210
+ tokens = np.array(tokens, dtype=int)
211
+ if add_eos:
212
+ tokens = _add_eos(tokens)
213
+ if add_composer:
214
+ tokens = _add_composer(tokens, composer_value=composer_value)
215
+ return tokens
216
+
217
+ def relative_batch_tokens_to_midi(
218
+ self,
219
+ tokens,
220
+ beatstep,
221
+ beat_offset_idx=None,
222
+ bars_per_batch=None,
223
+ cutoff_time_idx=None,
224
+ ):
225
+ """
226
+ tokens : (batch, sequence)
227
+ beatstep : (times, )
228
+ """
229
+ beat_offset_idx = 0 if beat_offset_idx is None else beat_offset_idx
230
+ notes = None
231
+ bars_per_batch = 2 if bars_per_batch is None else bars_per_batch
232
+
233
+ N = len(tokens)
234
+ for n in range(N):
235
+ _tokens = tokens[n]
236
+ _start_idx = beat_offset_idx + n * bars_per_batch * 4
237
+ _cutoff_time_idx = cutoff_time_idx + _start_idx
238
+ _notes = self.relative_tokens_to_notes(
239
+ _tokens,
240
+ start_idx=_start_idx,
241
+ cutoff_time_idx=_cutoff_time_idx,
242
+ )
243
+ # print(_notes, "\n-------")
244
+ if len(_notes) == 0:
245
+ pass
246
+ # print("_notes zero")
247
+ elif notes is None:
248
+ notes = _notes
249
+ else:
250
+ notes = np.concatenate((notes, _notes), axis=0)
251
+
252
+ if notes is None:
253
+ notes = []
254
+ midi = self.notes_to_midi(notes, beatstep, offset_sec=beatstep[beat_offset_idx])
255
+ return midi, notes
256
+
257
+ def relative_tokens_to_notes(self, tokens, start_idx, cutoff_time_idx=None):
258
+ # TODO remove legacy
259
+ # if the first decoded token is a composer (arranger) token, drop it
260
+ if tokens[0] >= sum(self.config.vocab_size.values()):
261
+ tokens = tokens[1:]
262
+
263
+ words = [self.detokenize(token, time_idx_offset=0) for token in tokens]
264
+
265
+ if hasattr(start_idx, "item"):
266
+ """
267
+ if numpy or torch tensor
268
+ """
269
+ start_idx = start_idx.item()
270
+
271
+ current_idx = start_idx
272
+ current_velocity = 0
273
+ note_onsets_ready = [None for i in range(self.config.vocab_size.note + 1)]
274
+ notes = []
275
+ for type, number in words:
276
+ if type == TOKEN_SPECIAL:
277
+ if number == EOS:
278
+ break
279
+ elif type == TOKEN_TIME:
280
+ current_idx += number
281
+ if cutoff_time_idx is not None:
282
+ current_idx = min(current_idx, cutoff_time_idx)
283
+
284
+ elif type == TOKEN_VELOCITY:
285
+ current_velocity = number
286
+ elif type == TOKEN_NOTE:
287
+ pitch = number
288
+ if current_velocity == 0:
289
+ # note_offset
290
+ if note_onsets_ready[pitch] is None:
291
+ # offset without onset
292
+ pass
293
+ else:
294
+ onset_idx = note_onsets_ready[pitch]
295
+ if onset_idx >= current_idx:
296
+ # No time shift after previous note_on
297
+ pass
298
+ else:
299
+ offset_idx = current_idx
300
+ notes.append(
301
+ [onset_idx, offset_idx, pitch, DEFAULT_VELOCITY]
302
+ )
303
+ note_onsets_ready[pitch] = None
304
+ else:
305
+ # note_on
306
+ if note_onsets_ready[pitch] is None:
307
+ note_onsets_ready[pitch] = current_idx
308
+ else:
309
+ # note-on already exists
310
+ onset_idx = note_onsets_ready[pitch]
311
+ if onset_idx >= current_idx:
312
+ # No time shift after previous note_on
313
+ pass
314
+ else:
315
+ offset_idx = current_idx
316
+ notes.append(
317
+ [onset_idx, offset_idx, pitch, DEFAULT_VELOCITY]
318
+ )
319
+ note_onsets_ready[pitch] = current_idx
320
+ else:
321
+ raise ValueError
322
+
323
+ for pitch, note_on in enumerate(note_onsets_ready):
324
+ # force offset if no offset for each pitch
325
+ if note_on is not None:
326
+ if cutoff_time_idx is None:
327
+ cutoff = note_on + 1
328
+ else:
329
+ cutoff = max(cutoff_time_idx, note_on + 1)
330
+
331
+ offset_idx = max(current_idx, cutoff)
332
+ notes.append([note_on, offset_idx, pitch, DEFAULT_VELOCITY])
333
+
334
+ if len(notes) == 0:
335
+ return []
336
+ else:
337
+ notes = np.array(notes)
338
+ note_order = notes[:, 0] * 128 + notes[:, 1]
339
+ notes = notes[note_order.argsort()]
340
+ return notes
341
+
342
+ def notes_to_midi(self, notes, beatstep, offset_sec=None):
343
+ new_pm = pretty_midi.PrettyMIDI(resolution=384, initial_tempo=120.0)
344
+ new_inst = pretty_midi.Instrument(program=0)
345
+ new_notes = []
346
+ if offset_sec is None:
347
+ offset_sec = 0.0
348
+
349
+ for onset_idx, offset_idx, pitch, velocity in notes:
350
+ new_note = pretty_midi.Note(
351
+ velocity=velocity,
352
+ pitch=pitch,
353
+ start=beatstep[onset_idx] - offset_sec,
354
+ end=beatstep[offset_idx] - offset_sec,
355
+ )
356
+ new_notes.append(new_note)
357
+ new_inst.notes = new_notes
358
+ new_pm.instruments.append(new_inst)
359
+ new_pm.remove_invalid_notes()
360
+ return new_pm
361
+
362
+
363
+ @jit(nopython=True, cache=False)
364
+ def fast_notes_to_relative_tokens(
365
+ notes, offset_idx, max_time_idx, n_special, n_note, n_velocity
366
+ ):
367
+ """
368
+ notes : (onset idx, offset idx, pitch, velocity)
369
+ """
370
+
371
+ times_p = [np.array([], dtype=int) for i in range((max_time_idx + 1 - offset_idx))]
372
+ times_v = [np.array([], dtype=int) for i in range((max_time_idx + 1 - offset_idx))]
373
+
374
+ for abs_onset, abs_offset, pitch, velocity in notes:
375
+ rel_onset = abs_onset - offset_idx
376
+ rel_offset = abs_offset - offset_idx
377
+ times_p[rel_onset] = np.append(times_p[rel_onset], pitch)
378
+ times_v[rel_onset] = np.append(times_v[rel_onset], velocity)
379
+ times_p[rel_offset] = np.append(times_p[rel_offset], pitch)
380
+ times_v[rel_offset] = np.append(times_v[rel_offset], velocity)
381
+
382
+ # from here on, everything is relative to time 0 (the offset)
383
+ tokens = []
384
+ current_velocity = np.array([0])
385
+ current_time_idx = np.array([0])
386
+
387
+ # the range may be 0, so iterate by index
388
+ for i in range(len(times_p)):
389
+ rel_idx = i
390
+ notes_at_time = times_p[i]
391
+ if len(notes_at_time) == 0:
392
+ continue
393
+
394
+ time_idx_shift = rel_idx - current_time_idx[0]
395
+ current_time_idx[0] = rel_idx
396
+
397
+ token = fast_tokenize(
398
+ time_idx_shift,
399
+ TOKEN_TIME,
400
+ n_special=n_special,
401
+ n_note=n_note,
402
+ n_velocity=n_velocity,
403
+ )
404
+ tokens.append(token)
405
+
406
+ for j in range(len(notes_at_time)):
407
+ pitch = notes_at_time[j]
408
+ velocity = times_v[i][j]
409
+ # for pitch, velocity in time:
410
+ velocity = int(velocity > 0)
411
+ if current_velocity[0] != velocity:
412
+ current_velocity[0] = velocity
413
+ token = fast_tokenize(
414
+ velocity,
415
+ TOKEN_VELOCITY,
416
+ n_special=n_special,
417
+ n_note=n_note,
418
+ n_velocity=n_velocity,
419
+ )
420
+ tokens.append(token)
421
+ token = fast_tokenize(
422
+ pitch,
423
+ TOKEN_NOTE,
424
+ n_special=n_special,
425
+ n_note=n_note,
426
+ n_velocity=n_velocity,
427
+ )
428
+ tokens.append(token)
429
+
430
+ return np.array(tokens)
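
A hedged round-trip sketch for `MidiTokenizer` (not part of the commit; the note values are made up and the vocab sizes come from `config.yaml`):

```python
import numpy as np
from omegaconf import OmegaConf
from midi_tokenizer import MidiTokenizer

config = OmegaConf.load("config.yaml")
tok = MidiTokenizer(config.tokenizer)

# notes: (onset beat-index, offset beat-index, pitch, velocity)
notes = np.array([[0, 2, 60, 80], [2, 4, 64, 80]])
tokens = tok.notes_to_relative_tokens(notes, offset_idx=0, add_eos=True)
print(tok.to_string(tokens))
# -> [('time', 0), ('velocity', 1), ('note', 60), ('time', 2), ('velocity', 0), ('note', 60), ...]

recovered = tok.relative_tokens_to_notes(tokens, start_idx=0)
print(recovered)  # onsets/offsets/pitches come back; velocities become DEFAULT_VELOCITY (77)
```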
packages.txt ADDED
@@ -0,0 +1 @@
+ fluidsynth
preprocess/README.md ADDED
@@ -0,0 +1,36 @@
+ # Preprocess Scripts
+ ---
+ - Note : the order of these scripts is IMPORTANT.
+ - The preprocessing steps themselves are easy, but setting up the environment is not. Please bear with it.
+ - If you encounter any problems, please do not hesitate to email me or open an issue on the github.
+
+ 1. Transcribe piano wavs to midi
+ - You should transcribe {piano_cover_file.wav} -> {piano_cover_file.mid}
+ - I recommend using the original code from this repo : [High-resolution Piano Transcription with Pedals by Regressing Onsets and Offsets Times](https://github.com/qiuqiangkong/piano_transcription_inference)
+
+ - Alternatively, you can also use my docker script.
+ ```bash
+ docker run -it --gpus all --rm -v /DIRECTORY_THAT_CONTAINS_PIANO_WAV/:/input -v /DIRECTORY_THAT_MIDI_OUTPUT/:/output jonghochoi/piano_transcribe:bytedance1
+ ```
+ - If you are using an RTX 30XX GPU or newer, this script may not work properly, because its pytorch version is too old (1.4).
+ - In that case, upgrade the pytorch version inside the docker image.
+
+ 2. Estimate the pop song's beats
+ ```bash
+ python bpm_quantize.py DATA_DIR
+ ```
+
+ 3. Synchronize the midi
+ ```bash
+ python pop_align.py DATA_DIR
+ ```
+
+ 4. Get the separated vocal track
+ ```bash
+ python split_spleeter.py DATA_DIR
+ ```
+
+ 5. Calculate melody chroma accuracy
+ ```bash
+ python melody_accuracy.py DATA_DIR
+ ```
preprocess/beat_quantizer.py ADDED
@@ -0,0 +1,111 @@
1
+ import copy
2
+ import librosa
3
+ import essentia
4
+ import essentia.standard
5
+ import numpy as np
6
+ import scipy.interpolate as interp
7
+ import note_seq
8
+
9
+ SAMPLERATE = 44100
10
+
11
+
12
+ def nearest_onset_offset_digitize(on, off, bins):
13
+ intermediate = (bins[1:] + bins[:-1]) / 2
14
+ on_idx = np.digitize(on, intermediate)
15
+ off_idx = np.digitize(off, intermediate)
16
+ off_idx[on_idx == off_idx] += 1
17
+ # off_idx = np.clip(off_idx, a_min=0, a_max=len(bins) - 1)
18
+ return on_idx, off_idx
19
+
20
+
21
+ def apply_sustain_pedal(pm):
22
+ ns = note_seq.midi_to_note_sequence(pm)
23
+ susns = note_seq.apply_sustain_control_changes(ns)
24
+ suspm = note_seq.note_sequence_to_pretty_midi(susns)
25
+ return suspm
26
+
27
+
28
+ def interpolate_beat_times(beat_times, steps_per_beat, extend=False):
29
+ beat_times_function = interp.interp1d(
30
+ np.arange(beat_times.size),
31
+ beat_times,
32
+ bounds_error=False,
33
+ fill_value="extrapolate",
34
+ )
35
+ if extend:
36
+ beat_steps_8th = beat_times_function(
37
+ np.linspace(0, beat_times.size, beat_times.size * steps_per_beat + 1)
38
+ )
39
+ else:
40
+ beat_steps_8th = beat_times_function(
41
+ np.linspace(0, beat_times.size - 1, beat_times.size * steps_per_beat - 1)
42
+ )
43
+ return beat_steps_8th
44
+
45
+
46
+ def midi_quantize_by_beats(
47
+ sample, beat_times, steps_per_beat, ignore_sustain_pedal=False
48
+ ):
49
+ ns = note_seq.midi_file_to_note_sequence(sample.midi)
50
+ if ignore_sustain_pedal:
51
+ susns = ns
52
+ else:
53
+ susns = note_seq.apply_sustain_control_changes(ns)
54
+
55
+ qns = copy.deepcopy(susns)
56
+
57
+ notes = np.array([[n.start_time, n.end_time] for n in susns.notes])
58
+ note_attributes = np.array([[n.pitch, n.velocity] for n in susns.notes])
59
+
60
+ note_ons = np.array(notes[:, 0])
61
+ note_offs = np.array(notes[:, 1])
62
+
63
+ beat_steps_8th = interpolate_beat_times(beat_times, steps_per_beat, extend=False)
64
+
65
+ on_idx, off_idx = nearest_onset_offset_digitize(note_ons, note_offs, beat_steps_8th)
66
+
67
+ beat_steps_8th = interpolate_beat_times(beat_times, steps_per_beat, extend=True)
68
+
69
+ discrete_notes = np.concatenate(
70
+ (np.stack((on_idx, off_idx), axis=1), note_attributes), axis=1
71
+ )
72
+
73
+ def delete_duplicate_notes(dnotes):
74
+ note_order = dnotes[:, 0] * 128 + dnotes[:, 2]
75
+ dnotes = dnotes[note_order.argsort()]
76
+ indices = []
77
+ for i in range(1, len(dnotes)):
78
+ if dnotes[i, 0] == dnotes[i - 1, 0] and dnotes[i, 2] == dnotes[i - 1, 2]:
79
+ indices.append(i)
80
+ dnotes = np.delete(dnotes, indices, axis=0)
81
+ note_order = dnotes[:, 0] * 128 + dnotes[:, 1]
82
+ dnotes = dnotes[note_order.argsort()]
83
+ return dnotes
84
+
85
+ discrete_notes = delete_duplicate_notes(discrete_notes)
86
+
87
+ digitized_note_ons, digitized_note_offs = (
88
+ beat_steps_8th[on_idx],
89
+ beat_steps_8th[off_idx],
90
+ )
91
+
92
+ for i, note in enumerate(qns.notes):
93
+ note.start_time = digitized_note_ons[i]
94
+ note.end_time = digitized_note_offs[i]
95
+
96
+ return qns, discrete_notes, beat_steps_8th
97
+
98
+
99
+ def extract_rhythm(song, y=None):
100
+ if y is None:
101
+ y, sr = librosa.load(song, sr=SAMPLERATE)
102
+
103
+ essentia_tracker = essentia.standard.RhythmExtractor2013(method="multifeature")
104
+ (
105
+ bpm,
106
+ beat_times,
107
+ confidence,
108
+ estimates,
109
+ essentia_beat_intervals,
110
+ ) = essentia_tracker(y)
111
+ return bpm, beat_times, confidence, estimates, essentia_beat_intervals
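
For context, these two helpers are chained the same way inside `transformer_wrapper.generate` (added later in this commit). A minimal usage sketch, assuming essentia and librosa are installed and `song.wav` is a placeholder path:

```python
import numpy as np
from preprocess.beat_quantizer import extract_rhythm, interpolate_beat_times

bpm, beat_times, confidence, estimates, intervals = extract_rhythm("song.wav")
beat_times = np.array(beat_times)

# steps_per_beat=2 gives an 8th-note grid; extend=True extrapolates one extra
# beat past the last detected beat so the final bar can still be sliced.
beatsteps = interpolate_beat_times(beat_times, steps_per_beat=2, extend=True)
print(bpm, beatsteps[:8])
```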
preprocess/bpm_quantize.py ADDED
@@ -0,0 +1,98 @@
1
+ import glob
2
+ import sys
3
+ import os
4
+
5
+
6
+ import librosa
7
+ import soundfile as sf
8
+ import numpy as np
9
+
10
+ import note_seq
11
+ from omegaconf import OmegaConf
12
+ from beat_quantizer import extract_rhythm, midi_quantize_by_beats
13
+
14
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
15
+ from midiaudiopair import MidiAudioPair
16
+ from utils.dsp import get_stereo
17
+
18
+
19
+ def estimate(meta_file, ignore_sustain_pedal):
20
+ sample = MidiAudioPair(meta_file)
21
+
22
+ if (
23
+ sample.error_code == MidiAudioPair.NO_PIANO
24
+ or sample.error_code == MidiAudioPair.NO_SONG_DIR
25
+ or sample.error_code == MidiAudioPair.NO_SONG
26
+ ):
27
+ return
28
+
29
+ bpm, beat_times, confidence, estimates, essentia_beat_intervals = extract_rhythm(sample.song)
30
+ beat_times = np.array(beat_times)
31
+ essentia_beat_intervals = np.array(essentia_beat_intervals)
32
+
33
+ qns, discrete_notes, beat_steps_8th = midi_quantize_by_beats(
34
+ sample, beat_times, 2, ignore_sustain_pedal=ignore_sustain_pedal
35
+ )
36
+
37
+ qpm = note_seq.note_sequence_to_pretty_midi(qns)
38
+ qpm.instruments[0].control_changes = []
39
+ qpm.write(sample.qmidi)
40
+ y, sr = librosa.load(sample.song, sr=None)
41
+ qpm_y = qpm.fluidsynth(sr)
42
+ qmix = get_stereo(y, qpm_y, 0.4)
43
+ sf.write(file=sample.qmix, data=qmix.T, samplerate=sr, format="flac")
44
+
45
+ meta = OmegaConf.load(meta_file)
46
+ meta.tempo = OmegaConf.create()
47
+ meta.tempo.bpm = bpm
48
+ meta.tempo.confidence = confidence
49
+ OmegaConf.save(meta, meta_file)
50
+
51
+ np.save(sample.notes, discrete_notes)
52
+ np.save(sample.beatstep, beat_steps_8th)
53
+ np.save(sample.beattime, beat_times)
54
+ np.save(sample.beatinterval, essentia_beat_intervals)
55
+
56
+
57
+ def main(meta_files, ignore_sustain_pedal):
58
+ from tqdm import tqdm
59
+ import multiprocessing
60
+ from joblib import Parallel, delayed
61
+
62
+ def files():
63
+ pbar = tqdm(meta_files)
64
+ for meta_file in pbar:
65
+ pbar.set_description(meta_file)
66
+ yield meta_file
67
+
68
+ Parallel(n_jobs=multiprocessing.cpu_count() // 2)(
69
+ delayed(estimate)(meta_file, ignore_sustain_pedal) for meta_file in files()
70
+ )
71
+
72
+
73
+ if __name__ == "__main__":
74
+ import argparse
75
+
76
+ parser = argparse.ArgumentParser(description="bpm estimate using essentia")
77
+
78
+ parser.add_argument(
79
+ "data_dir",
80
+ type=str,
81
+ default=None,
82
+ help="""directory contains {id}/{pop_filename.wav}
83
+ """,
84
+ )
85
+
86
+ parser.add_argument(
87
+ "--ignore_sustain_pedal",
88
+ default=False,
89
+ action="store_true",
90
+ help="whether to ignore the sustain pedal when quantizing",
91
+ )
92
+
93
+ args = parser.parse_args()
94
+
95
+ meta_files = sorted(glob.glob(args.data_dir + "/*.yaml"))
96
+ print("meta ", len(meta_files))
97
+
98
+ main(meta_files, args.ignore_sustain_pedal)
preprocess/melody_accuracy.py ADDED
@@ -0,0 +1,81 @@
1
+ import glob
2
+ import sys
3
+ import os
4
+
5
+ import librosa
6
+ import pretty_midi
7
+
8
+ from omegaconf import OmegaConf
9
+
10
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
11
+ from midiaudiopair import MidiAudioPair
12
+ from evaluate import midi_melody_accuracy as ma
13
+
14
+
15
+ def estimate(meta_file):
16
+
17
+ import warnings
18
+
19
+ warnings.filterwarnings(action="ignore")
20
+
21
+ sample = MidiAudioPair(meta_file)
22
+
23
+ if (
24
+ sample.error_code == MidiAudioPair.NO_PIANO
25
+ or sample.error_code == MidiAudioPair.NO_SONG_DIR
26
+ or sample.error_code == MidiAudioPair.NO_SONG
27
+ ):
28
+ return
29
+
30
+ if "vocals" in sample.invalids:
31
+ print("no vocal:", meta_file)
32
+ return
33
+
34
+ midi = pretty_midi.PrettyMIDI(sample.qmidi)
35
+ vocals, sr = librosa.load(sample.vocals, sr=44100)
36
+
37
+ chroma_accuracy, pitch_accuracy = ma.evaluate_melody(
38
+ midi, vocals, sr=sr, hop_length=1024
39
+ )
40
+ meta = OmegaConf.load(meta_file)
41
+ meta.eval = OmegaConf.create()
42
+ meta.eval.melody_chroma_accuracy = chroma_accuracy.item()
43
+ meta.eval.melody_pitch_accuracy = pitch_accuracy.item()
44
+ OmegaConf.save(meta, meta_file)
45
+
46
+
47
+ def main(meta_files):
48
+ from tqdm import tqdm
49
+ import multiprocessing
50
+ from joblib import Parallel, delayed
51
+
52
+ def files():
53
+ pbar = tqdm(meta_files)
54
+ for meta_file in pbar:
55
+ pbar.set_description(meta_file)
56
+ yield meta_file
57
+
58
+ Parallel(n_jobs=multiprocessing.cpu_count() // 2)(
59
+ delayed(estimate)(meta_file) for meta_file in files()
60
+ )
61
+
62
+
63
+ if __name__ == "__main__":
64
+ import argparse
65
+
66
+ parser = argparse.ArgumentParser(description="calculate melody chroma accuracy")
67
+
68
+ parser.add_argument(
69
+ "data_dir",
70
+ type=str,
71
+ default=None,
72
+ help="""directory contains {id}/{pop_filename.wav}
73
+ """,
74
+ )
75
+
76
+ args = parser.parse_args()
77
+
78
+ meta_files = sorted(glob.glob(args.data_dir + "/**/*.yaml", recursive=True))
79
+ print("meta ", len(meta_files))
80
+
81
+ main(meta_files)
preprocess/pop_align.py ADDED
@@ -0,0 +1,331 @@
1
+ import librosa
2
+ import soundfile as sf
3
+ import glob
4
+ import os
5
+ import copy
6
+ import sys
7
+
8
+ import numpy as np
9
+ import pyrubberband as pyrb
10
+ import pretty_midi
11
+ from omegaconf import OmegaConf
12
+ from tqdm.auto import tqdm
13
+
14
+ from synctoolbox.dtw.mrmsdtw import sync_via_mrmsdtw
15
+ from synctoolbox.dtw.utils import (
16
+ compute_optimal_chroma_shift,
17
+ shift_chroma_vectors,
18
+ make_path_strictly_monotonic,
19
+ )
20
+ from synctoolbox.feature.chroma import (
21
+ pitch_to_chroma,
22
+ quantize_chroma,
23
+ quantized_chroma_to_CENS,
24
+ )
25
+ from synctoolbox.feature.dlnco import pitch_onset_features_to_DLNCO
26
+ from synctoolbox.feature.pitch import audio_to_pitch_features
27
+ from synctoolbox.feature.pitch_onset import audio_to_pitch_onset_features
28
+ from synctoolbox.feature.utils import estimate_tuning
29
+
30
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
31
+ print(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
32
+ from utils.dsp import normalize, get_stereo
33
+ from midiaudiopair import MidiAudioPair
34
+
35
+ Fs = 22050
36
+ feature_rate = 50
37
+ step_weights = np.array([1.5, 1.5, 2.0])
38
+ threshold_rec = 10 ** 6
39
+
40
+
41
+ def save_delayed_song(
42
+ sample,
43
+ dry_run,
44
+ ):
45
+ import warnings
46
+
47
+ warnings.filterwarnings(action="ignore")
48
+
49
+ song_audio, _ = librosa.load(sample.original_song, Fs)
50
+ midi_pm = pretty_midi.PrettyMIDI(sample.original_midi)
51
+
52
+ if np.power(song_audio, 2).sum() < 1: # low energy: invalid file
53
+ print("invalid audio :", sample.original_song)
54
+ sample.delete_files_myself()
55
+ return
56
+
57
+ rd = get_aligned_results(midi_pm=midi_pm, song_audio=song_audio)
58
+
59
+ mix_song = rd["mix_song"]
60
+ song_pitch_shifted = rd["song_pitch_shifted"]
61
+ midi_warped_pm = rd["midi_warped_pm"]
62
+ pitch_shift_for_song_audio = rd["pitch_shift_for_song_audio"]
63
+ tuning_offset_song = rd["tuning_offset_song"]
64
+ tuning_offset_piano = rd["tuning_offset_piano"]
65
+
66
+ try:
67
+ if dry_run:
68
+ print("write audio files: ", sample.song)
69
+ else:
70
+ sf.write(
71
+ file=sample.song,
72
+ data=song_pitch_shifted,
73
+ samplerate=Fs,
74
+ format="wav",
75
+ )
76
+ except:
77
+ print("Fail : ", sample.song)
78
+
79
+ try:
80
+ if dry_run:
81
+ print("write warped midi :", sample.midi)
82
+ else:
83
+ midi_warped_pm.write(sample.midi)
84
+
85
+ except:
86
+ midi_warped_pm._tick_scales = midi_pm._tick_scales
87
+ try:
88
+ if dry_run:
89
+ print("write warped midi2 :", sample.midi)
90
+ else:
91
+ midi_warped_pm.write(sample.midi)
92
+
93
+ except:
94
+ print("ad-hoc failed midi : ", sample.midi)
95
+ print("ad-hoc midi : ", sample.midi)
96
+
97
+ sample.yaml.song.pitch_shift = pitch_shift_for_song_audio.item()
98
+ sample.yaml.song.tuning_offset = tuning_offset_song.item()
99
+ sample.yaml.piano.tuning_offset = tuning_offset_piano.item()
100
+ OmegaConf.save(sample.yaml, sample.yaml_path)
101
+
102
+
103
+ def get_aligned_results(midi_pm, song_audio):
104
+ piano_audio = midi_pm.fluidsynth(Fs)
105
+
106
+ song_audio = normalize(song_audio)
107
+
108
+ # The reason for estimating tuning ::
109
+ # https://www.audiolabs-erlangen.de/resources/MIR/FMP/C3/C3S1_TranspositionTuning.html
110
+ tuning_offset_1 = estimate_tuning(song_audio, Fs)
111
+ tuning_offset_2 = estimate_tuning(piano_audio, Fs)
112
+
113
+ # DLNCO features (Sebastian Ewert, Meinard Müller, and Peter Grosche: High Resolution Audio Synchronization Using Chroma Onset Features, In Proceedings of IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP): 1869–1872, 2009.):
114
+ # helpful to increase synchronization accuracy, especially for music with clear onsets.
115
+
116
+ # Quantized and smoothed chroma : CENS features
117
+ # Because, MrMsDTW Requires CENS.
118
+ f_chroma_quantized_1, f_DLNCO_1 = get_features_from_audio(
119
+ song_audio, tuning_offset_1
120
+ )
121
+ f_chroma_quantized_2, f_DLNCO_2 = get_features_from_audio(
122
+ piano_audio, tuning_offset_2
123
+ )
124
+
125
+ # Shift chroma vectors :
126
+ # Otherwise, different keys of two audio leads to degradation of alignment.
127
+ opt_chroma_shift = compute_optimal_chroma_shift(
128
+ quantized_chroma_to_CENS(f_chroma_quantized_1, 201, 50, feature_rate)[0],
129
+ quantized_chroma_to_CENS(f_chroma_quantized_2, 201, 50, feature_rate)[0],
130
+ )
131
+ f_chroma_quantized_2 = shift_chroma_vectors(f_chroma_quantized_2, opt_chroma_shift)
132
+ f_DLNCO_2 = shift_chroma_vectors(f_DLNCO_2, opt_chroma_shift)
133
+
134
+ wp = sync_via_mrmsdtw(
135
+ f_chroma1=f_chroma_quantized_1,
136
+ f_onset1=f_DLNCO_1,
137
+ f_chroma2=f_chroma_quantized_2,
138
+ f_onset2=f_DLNCO_2,
139
+ input_feature_rate=feature_rate,
140
+ step_weights=step_weights,
141
+ threshold_rec=threshold_rec,
142
+ verbose=False,
143
+ )
144
+
145
+ wp = make_path_strictly_monotonic(wp)
146
+ pitch_shift_for_song_audio = -opt_chroma_shift % 12
147
+ if pitch_shift_for_song_audio > 6:
148
+ pitch_shift_for_song_audio -= 12
149
+
150
+ if pitch_shift_for_song_audio != 0:
151
+ song_audio_shifted = pyrb.pitch_shift(
152
+ song_audio, Fs, pitch_shift_for_song_audio
153
+ )
154
+ else:
155
+ song_audio_shifted = song_audio
156
+
157
+ time_map_second = wp / feature_rate
158
+ midi_pm_warped = copy.deepcopy(midi_pm)
159
+
160
+ midi_pm_warped = simple_adjust_times(
161
+ midi_pm_warped, time_map_second[1], time_map_second[0]
162
+ )
163
+ piano_audio_warped = midi_pm_warped.fluidsynth(Fs)
164
+
165
+ song_audio_shifted = normalize(song_audio_shifted)
166
+ stereo_sonification_piano = get_stereo(song_audio_shifted, piano_audio_warped)
167
+
168
+ rd = dict(
169
+ mix_song=stereo_sonification_piano,
170
+ song_pitch_shifted=song_audio_shifted,
171
+ midi_warped_pm=midi_pm_warped,
172
+ pitch_shift_for_song_audio=pitch_shift_for_song_audio,
173
+ tuning_offset_song=tuning_offset_1,
174
+ tuning_offset_piano=tuning_offset_2,
175
+ )
176
+ return rd
177
+
178
+
179
+ def simple_adjust_times(pm, original_times, new_times):
180
+ """
181
+ most of these codes are from original pretty_midi
182
+ https://github.com/craffel/pretty-midi/blob/main/pretty_midi/pretty_midi.py
183
+ """
184
+ for instrument in pm.instruments:
185
+ instrument.notes = [
186
+ copy.deepcopy(note)
187
+ for note in instrument.notes
188
+ if note.start >= original_times[0] and note.end <= original_times[-1]
189
+ ]
190
+ # Get array of note-on locations and correct them
191
+ note_ons = np.array(
192
+ [note.start for instrument in pm.instruments for note in instrument.notes]
193
+ )
194
+ adjusted_note_ons = np.interp(note_ons, original_times, new_times)
195
+ # Same for note-offs
196
+ note_offs = np.array(
197
+ [note.end for instrument in pm.instruments for note in instrument.notes]
198
+ )
199
+ adjusted_note_offs = np.interp(note_offs, original_times, new_times)
200
+ # Correct notes
201
+ for n, note in enumerate(
202
+ [note for instrument in pm.instruments for note in instrument.notes]
203
+ ):
204
+ note.start = (adjusted_note_ons[n] > 0) * adjusted_note_ons[n]
205
+ note.end = (adjusted_note_offs[n] > 0) * adjusted_note_offs[n]
206
+ # After performing alignment, some notes may have an end time which is
207
+ # on or before the start time. Remove these!
208
+ pm.remove_invalid_notes()
209
+
210
+ def adjust_events(event_getter):
211
+ """This function calls event_getter with each instrument as the
212
+ sole argument and adjusts the events which are returned."""
213
+ # Sort the events by time
214
+ for instrument in pm.instruments:
215
+ event_getter(instrument).sort(key=lambda e: e.time)
216
+ # Correct the events by interpolating
217
+ event_times = np.array(
218
+ [
219
+ event.time
220
+ for instrument in pm.instruments
221
+ for event in event_getter(instrument)
222
+ ]
223
+ )
224
+ adjusted_event_times = np.interp(event_times, original_times, new_times)
225
+ for n, event in enumerate(
226
+ [
227
+ event
228
+ for instrument in pm.instruments
229
+ for event in event_getter(instrument)
230
+ ]
231
+ ):
232
+ event.time = adjusted_event_times[n]
233
+ for instrument in pm.instruments:
234
+ # We want to keep only the final event which has time ==
235
+ # new_times[0]
236
+ valid_events = [
237
+ event
238
+ for event in event_getter(instrument)
239
+ if event.time == new_times[0]
240
+ ]
241
+ if valid_events:
242
+ valid_events = valid_events[-1:]
243
+ # Otherwise only keep events within the new set of times
244
+ valid_events.extend(
245
+ event
246
+ for event in event_getter(instrument)
247
+ if event.time > new_times[0] and event.time < new_times[-1]
248
+ )
249
+ event_getter(instrument)[:] = valid_events
250
+
251
+ # Correct pitch bends and control changes
252
+ adjust_events(lambda i: i.pitch_bends)
253
+ adjust_events(lambda i: i.control_changes)
254
+
255
+ return pm
256
+
257
+
258
+ def get_features_from_audio(audio, tuning_offset, visualize=False):
259
+ f_pitch = audio_to_pitch_features(
260
+ f_audio=audio,
261
+ Fs=Fs,
262
+ tuning_offset=tuning_offset,
263
+ feature_rate=feature_rate,
264
+ verbose=visualize,
265
+ )
266
+ f_chroma = pitch_to_chroma(f_pitch=f_pitch)
267
+ f_chroma_quantized = quantize_chroma(f_chroma=f_chroma)
268
+
269
+ f_pitch_onset = audio_to_pitch_onset_features(
270
+ f_audio=audio, Fs=Fs, tuning_offset=tuning_offset, verbose=visualize
271
+ )
272
+ f_DLNCO = pitch_onset_features_to_DLNCO(
273
+ f_peaks=f_pitch_onset,
274
+ feature_rate=feature_rate,
275
+ feature_sequence_length=f_chroma_quantized.shape[1],
276
+ visualize=visualize,
277
+ )
278
+ return f_chroma_quantized, f_DLNCO
279
+
280
+
281
+ def main(samples, dry_run):
282
+ import multiprocessing
283
+ from joblib import Parallel, delayed
284
+
285
+ Parallel(n_jobs=multiprocessing.cpu_count() // 2)(
286
+ delayed(save_delayed_song)(sample=sample, dry_run=dry_run)
287
+ for sample in tqdm(samples)
288
+ )
289
+
290
+
291
+ if __name__ == "__main__":
292
+
293
+ import argparse
294
+
295
+ parser = argparse.ArgumentParser(description="align piano covers to pop songs")
296
+
297
+ parser.add_argument(
298
+ "data_dir",
299
+ type=str,
300
+ default=None,
301
+ help="""directory contains {id}/{song_filename.wav}
302
+ """,
303
+ )
304
+ parser.add_argument(
305
+ "--dry_run", default=False, action="store_true", help="whether dry_run"
306
+ )
307
+
308
+ args = parser.parse_args()
309
+
310
+ def getfiles():
311
+ meta_files = sorted(glob.glob(args.data_dir + "/*.yaml"))
312
+ print("meta ", len(meta_files))
313
+
314
+ samples = list()
315
+ for meta_file in tqdm(meta_files):
316
+ m = MidiAudioPair(meta_file, auto_remove_no_song=True)
317
+ if m.error_code != MidiAudioPair.NO_SONG:
318
+ aux_txt = os.path.join(
319
+ m.audio_dir,
320
+ m.yaml.piano.ytid,
321
+ f"{m.yaml.piano.title[:50]}___{m.yaml.song.title[:50]}.txt",
322
+ )
323
+ with open(aux_txt, "w") as f:
324
+ f.write(".")
325
+ samples.append(m)
326
+
327
+ print(f"files available {len(samples)}")
328
+ return samples
329
+
330
+ samples = getfiles()
331
+ main(samples=samples, dry_run=args.dry_run)
preprocess/split_spleeter.py ADDED
@@ -0,0 +1,72 @@
1
+ import glob
2
+ import os
3
+ import random
4
+ import sys
5
+
6
+ from tqdm.auto import tqdm
7
+
8
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
9
+ from midiaudiopair import MidiAudioPair
10
+
11
+
12
+ def split_spleeter(meta_files):
13
+ # Use audio loader explicitly for loading audio waveform :
14
+ from spleeter.audio.adapter import AudioAdapter
15
+ from spleeter.separator import Separator
16
+ import spleeter
17
+
18
+ sample_rate = 44100
19
+ audio_loader = AudioAdapter.default()
20
+
21
+ # Using embedded configuration.
22
+ separator = Separator("spleeter:2stems")
23
+
24
+ for meta_file in tqdm(meta_files):
25
+ sample = MidiAudioPair(meta_file)
26
+ if sample.error_code == MidiAudioPair.NO_SONG:
27
+ continue
28
+ if os.path.exists(sample.vocals):
29
+ continue
30
+
31
+ waveform, _ = audio_loader.load(sample.song, sample_rate=sample_rate)
32
+
33
+ # Perform the separation :
34
+ prediction = separator.separate(waveform)
35
+
36
+ audio_loader.save(
37
+ path=sample.vocals,
38
+ data=prediction["vocals"][:, 0:1],
39
+ codec=spleeter.audio.Codec.MP3,
40
+ sample_rate=sample_rate,
41
+ )
42
+
43
+
44
+ if __name__ == "__main__":
45
+ import argparse
46
+
47
+ parser = argparse.ArgumentParser(description="separate vocal tracks using spleeter")
48
+
49
+ parser.add_argument(
50
+ "data_dir",
51
+ type=str,
52
+ default=None,
53
+ help="""directory contains {id}/{pop_filename.wav}
54
+ """,
55
+ )
56
+
57
+ parser.add_argument(
58
+ "--random_order",
59
+ default=False,
60
+ action="store_true",
61
+ help="Random order process (to run multiple process)",
62
+ )
63
+
64
+ args = parser.parse_args()
65
+
66
+ meta_files = sorted(glob.glob(args.data_dir + "/*.yaml"))
67
+ if args.random_order:
68
+ random.shuffle(meta_files)
69
+
70
+ print("meta ", len(meta_files))
71
+
72
+ split_spleeter(meta_files)
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ pretty-midi==0.2.9
+ omegaconf==2.1.1
+ transformers==4.16.1
+ pytorch-lightning
+ essentia==2.1b6.dev609
+ note-seq==0.0.3
+ pyFluidSynth==1.3.0
+ torch
transformer_wrapper.py ADDED
@@ -0,0 +1,342 @@
1
+ import os
2
+ import random
3
+
4
+ import numpy as np
5
+ import librosa
6
+ import torch
7
+
8
+ import pytorch_lightning as pl
9
+ import soundfile as sf
10
+ from torch.nn.utils.rnn import pad_sequence
11
+ from transformers import T5Config, T5ForConditionalGeneration
12
+
13
+ from midi_tokenizer import MidiTokenizer, extrapolate_beat_times
14
+ from layer.input import LogMelSpectrogram, ConcatEmbeddingToMel
15
+ from preprocess.beat_quantizer import extract_rhythm, interpolate_beat_times
16
+ from utils.dsp import get_stereo
17
+
18
+
19
+ DEFAULT_COMPOSERS = {"various composer": 2052}
20
+
21
+
22
+ class TransformerWrapper(pl.LightningModule):
23
+ def __init__(self, config):
24
+ super().__init__()
25
+ self.config = config
26
+
27
+ self.tokenizer = MidiTokenizer(config.tokenizer)
28
+ self.t5config = T5Config.from_pretrained("t5-small")
29
+
30
+ for k, v in config.t5.items():
31
+ self.t5config.__setattr__(k, v)
32
+
33
+ self.transformer = T5ForConditionalGeneration(self.t5config)
34
+ self.use_mel = self.config.dataset.use_mel
35
+ self.mel_is_conditioned = self.config.dataset.mel_is_conditioned
36
+ self.composer_to_feature_token = config.composer_to_feature_token
37
+
38
+ if self.use_mel and not self.mel_is_conditioned:
39
+ self.composer_to_feature_token = DEFAULT_COMPOSERS
40
+
41
+ if self.use_mel:
42
+ self.spectrogram = LogMelSpectrogram()
43
+ if self.mel_is_conditioned:
44
+ n_dim = 512
45
+ composer_n_vocab = len(self.composer_to_feature_token)
46
+ embedding_offset = min(self.composer_to_feature_token.values())
47
+ self.mel_conditioner = ConcatEmbeddingToMel(
48
+ embedding_offset=embedding_offset,
49
+ n_vocab=composer_n_vocab,
50
+ n_dim=n_dim,
51
+ )
52
+ else:
53
+ self.spectrogram = None
54
+
55
+ self.lr = config.training.lr
56
+
57
+ def forward(self, input_ids, labels):
58
+ """
59
+ Deprecated.
60
+ """
61
+ rt = self.transformer(input_ids=input_ids, labels=labels)
62
+ return rt
63
+
64
+ @torch.no_grad()
65
+ def single_inference(
66
+ self,
67
+ feature_tokens=None,
68
+ audio=None,
69
+ beatstep=None,
70
+ max_length=256,
71
+ max_batch_size=64,
72
+ n_bars=None,
73
+ composer_value=None,
74
+ ):
75
+ """
76
+ generate a long audio sequence
77
+
78
+ feature_tokens or audio : shape (time, )
79
+
80
+ beatstep : shape (time, )
81
+ - the beatstep values that the input_ids correspond to
+ (offset removed, i.e. beatstep[0] == 0)
+ - beatstep[-1] : the time at which the input_ids end
+ (i.e. beatstep[-1] == len(y)//sr)
85
+ """
86
+
87
+ assert feature_tokens is not None or audio is not None
88
+ assert beatstep is not None
89
+
90
+ if feature_tokens is not None:
91
+ assert len(feature_tokens.shape) == 1
92
+
93
+ if audio is not None:
94
+ assert len(audio.shape) == 1
95
+
96
+ config = self.config
97
+ PAD = self.t5config.pad_token_id
98
+ n_bars = config.dataset.n_bars if n_bars is None else n_bars
99
+
100
+ if beatstep[0] > 0.01:
101
+ print(
102
+ f"inference warning : beatstep[0] is not 0 ({beatstep[0]}). all beatsteps will be shifted."
103
+ )
104
+ beatstep = beatstep - beatstep[0]
105
+
106
+ if self.use_mel:
107
+ input_ids = None
108
+ inputs_embeds, ext_beatstep = self.prepare_inference_mel(
109
+ audio,
110
+ beatstep,
111
+ n_bars=n_bars,
112
+ padding_value=PAD,
113
+ composer_value=composer_value,
114
+ )
115
+ batch_size = inputs_embeds.shape[0]
116
+ else:
117
+ raise NotImplementedError
118
+
119
+ # Considering GPU capacity, some sequence would not be generated at once.
120
+ relative_tokens = list()
121
+ for i in range(0, batch_size, max_batch_size):
122
+ start = i
123
+ end = min(batch_size, i + max_batch_size)
124
+
125
+ if input_ids is None:
126
+ _input_ids = None
127
+ _inputs_embeds = inputs_embeds[start:end]
128
+ else:
129
+ _input_ids = input_ids[start:end]
130
+ _inputs_embeds = None
131
+
132
+ _relative_tokens = self.transformer.generate(
133
+ input_ids=_input_ids,
134
+ inputs_embeds=_inputs_embeds,
135
+ max_length=max_length,
136
+ )
137
+ _relative_tokens = _relative_tokens.cpu().numpy()
138
+ relative_tokens.append(_relative_tokens)
139
+
140
+ max_length = max([rt.shape[-1] for rt in relative_tokens])
141
+ for i in range(len(relative_tokens)):
142
+ relative_tokens[i] = np.pad(
143
+ relative_tokens[i],
144
+ [(0, 0), (0, max_length - relative_tokens[i].shape[-1])],
145
+ constant_values=PAD,
146
+ )
147
+ relative_tokens = np.concatenate(relative_tokens)
148
+
149
+ pm, notes = self.tokenizer.relative_batch_tokens_to_midi(
150
+ relative_tokens,
151
+ beatstep=ext_beatstep,
152
+ bars_per_batch=n_bars,
153
+ cutoff_time_idx=(n_bars + 1) * 4,
154
+ )
155
+
156
+ return relative_tokens, notes, pm
157
+
158
+ def prepare_inference_mel(
159
+ self, audio, beatstep, n_bars, padding_value, composer_value=None
160
+ ):
161
+ n_steps = n_bars * 4
162
+ n_target_step = len(beatstep)
163
+ sample_rate = self.config.dataset.sample_rate
164
+ ext_beatstep = extrapolate_beat_times(beatstep, (n_bars + 1) * 4 + 1)
165
+
166
+ def split_audio(audio):
167
+ # Split audio corresponding beat intervals.
168
+ # Each audio's lengths are different.
169
+ # Because each corresponding beat interval times are different.
170
+ batch = []
171
+
172
+ for i in range(0, n_target_step, n_steps):
173
+
174
+ start_idx = i
175
+ end_idx = min(i + n_steps, n_target_step)
176
+
177
+ start_sample = int(ext_beatstep[start_idx] * sample_rate)
178
+ end_sample = int(ext_beatstep[end_idx] * sample_rate)
179
+ feature = audio[start_sample:end_sample]
180
+ batch.append(feature)
181
+ return batch
182
+
183
+ def pad_and_stack_batch(batch):
184
+ batch = pad_sequence(batch, batch_first=True, padding_value=padding_value)
185
+ return batch
186
+
187
+ batch = split_audio(audio)
188
+ batch = pad_and_stack_batch(batch)
189
+
190
+ inputs_embeds = self.spectrogram(batch).transpose(-1, -2)
191
+ if self.mel_is_conditioned:
192
+ composer_value = torch.tensor(composer_value).to(self.device)
193
+ composer_value = composer_value.repeat(inputs_embeds.shape[0])
194
+ inputs_embeds = self.mel_conditioner(inputs_embeds, composer_value)
195
+ return inputs_embeds, ext_beatstep
196
+
197
+ @torch.no_grad()
198
+ def generate(
199
+ self,
200
+ audio_path=None,
201
+ composer=None,
202
+ model="generated",
203
+ steps_per_beat=2,
204
+ stereo_amp=0.5,
205
+ n_bars=2,
206
+ ignore_duplicate=True,
207
+ show_plot=False,
208
+ save_midi=False,
209
+ save_mix=False,
210
+ midi_path=None,
211
+ mix_path=None,
212
+ click_amp=0.2,
213
+ add_click=False,
214
+ max_batch_size=None,
215
+ beatsteps=None,
216
+ mix_sample_rate=None,
217
+ audio_y=None,
218
+ audio_sr=None,
219
+ ):
220
+ config = self.config
221
+ device = self.device
222
+
223
+ if audio_path is not None:
224
+ extension = os.path.splitext(audio_path)[1]
225
+ mix_path = (
226
+ audio_path.replace(extension, f".{model}.{composer}.wav")
227
+ if mix_path is None
228
+ else mix_path
229
+ )
230
+ midi_path = (
231
+ audio_path.replace(extension, f".{model}.{composer}.mid")
232
+ if midi_path is None
233
+ else midi_path
234
+ )
235
+
236
+ max_batch_size = 64 // n_bars if max_batch_size is None else max_batch_size
237
+ composer_to_feature_token = self.composer_to_feature_token
238
+
239
+ if composer is None:
240
+ composer = random.sample(list(composer_to_feature_token.keys()), 1)[0]
241
+
242
+ composer_value = composer_to_feature_token[composer]
243
+ mix_sample_rate = (
244
+ config.dataset.sample_rate if mix_sample_rate is None else mix_sample_rate
245
+ )
246
+
247
+ if not ignore_duplicate:
248
+ if os.path.exists(midi_path):
249
+ return
250
+
251
+ ESSENTIA_SAMPLERATE = 44100
252
+
253
+ if beatsteps is None:
254
+ y, sr = librosa.load(audio_path, sr=ESSENTIA_SAMPLERATE)
255
+ (
256
+ bpm,
257
+ beat_times,
258
+ confidence,
259
+ estimates,
260
+ essentia_beat_intervals,
261
+ ) = extract_rhythm(audio_path, y=y)
262
+ beat_times = np.array(beat_times)
263
+ beatsteps = interpolate_beat_times(beat_times, steps_per_beat, extend=True)
264
+ else:
265
+ y = None
266
+
267
+ if self.use_mel:
268
+ if audio_y is None and config.dataset.sample_rate != ESSENTIA_SAMPLERATE:
269
+ if y is not None:
270
+ y = librosa.core.resample(
271
+ y,
272
+ orig_sr=ESSENTIA_SAMPLERATE,
273
+ target_sr=config.dataset.sample_rate,
274
+ )
275
+ sr = config.dataset.sample_rate
276
+ else:
277
+ y, sr = librosa.load(audio_path, sr=config.dataset.sample_rate)
278
+ elif audio_y is not None:
279
+ if audio_sr != config.dataset.sample_rate:
280
+ audio_y = librosa.core.resample(
281
+ audio_y, orig_sr=audio_sr, target_sr=config.dataset.sample_rate
282
+ )
283
+ audio_sr = config.dataset.sample_rate
284
+ y = audio_y
285
+ sr = audio_sr
286
+
287
+ start_sample = int(beatsteps[0] * sr)
288
+ end_sample = int(beatsteps[-1] * sr)
289
+ _audio = torch.from_numpy(y)[start_sample:end_sample].to(device)
290
+ fzs = None
291
+ else:
292
+ raise NotImplementedError
293
+
294
+ relative_tokens, notes, pm = self.single_inference(
295
+ feature_tokens=fzs,
296
+ audio=_audio,
297
+ beatstep=beatsteps - beatsteps[0],
298
+ max_length=config.dataset.target_length
299
+ * max(1, (n_bars // config.dataset.n_bars)),
300
+ max_batch_size=max_batch_size,
301
+ n_bars=n_bars,
302
+ composer_value=composer_value,
303
+ )
304
+
305
+ for n in pm.instruments[0].notes:
306
+ n.start += beatsteps[0]
307
+ n.end += beatsteps[0]
308
+
309
+ if show_plot or save_mix:
310
+ if mix_sample_rate != sr:
311
+ y = librosa.core.resample(y, orig_sr=sr, target_sr=mix_sample_rate)
312
+ sr = mix_sample_rate
313
+ if add_click:
314
+ clicks = (
315
+ librosa.clicks(times=beatsteps, sr=sr, length=len(y)) * click_amp
316
+ )
317
+ y = y + clicks
318
+ pm_y = pm.fluidsynth(sr)
319
+ stereo = get_stereo(y, pm_y, pop_scale=stereo_amp)
320
+
321
+ if show_plot:
322
+ import IPython.display as ipd
323
+ from IPython.display import display
324
+ import note_seq
325
+
326
+ display("Stereo MIX", ipd.Audio(stereo, rate=sr))
327
+ display("Rendered MIDI", ipd.Audio(pm_y, rate=sr))
328
+ display("Original Song", ipd.Audio(y, rate=sr))
329
+ display(note_seq.plot_sequence(note_seq.midi_to_note_sequence(pm)))
330
+
331
+ if save_mix:
332
+ sf.write(
333
+ file=mix_path,
334
+ data=stereo.T,
335
+ samplerate=sr,
336
+ format="wav",
337
+ )
338
+
339
+ if save_midi:
340
+ pm.write(midi_path)
341
+
342
+ return pm, composer, mix_path, midi_path
utils/__init__.py ADDED
File without changes
utils/dsp.py ADDED
@@ -0,0 +1,63 @@
+ import numpy as np
+ from scipy.interpolate import interp1d
+
+
+ def normalize(audio, min_y=-1.0, max_y=1.0, eps=1e-8):
+     assert len(audio.shape) == 1
+     max_y -= eps
+     min_y += eps
+     amax = audio.max()
+     amin = audio.min()
+     audio = (max_y - min_y) * (audio - amin) / (amax - amin) + min_y
+     return audio
+
+
+ def get_stereo(pop_y, midi_y, pop_scale=0.99):
+     if len(pop_y) > len(midi_y):
+         midi_y = np.pad(midi_y, (0, len(pop_y) - len(midi_y)))
+     elif len(pop_y) < len(midi_y):
+         pop_y = np.pad(pop_y, (0, -len(pop_y) + len(midi_y)))
+     stereo = np.stack((midi_y, pop_y * pop_scale))
+     return stereo
+
+
+ def generate_variable_f0_sine_wave(f0, len_y, sr):
+     """
+     integrate instant frequencies to get a pure-tone sine wave
+     """
+     x_sample = np.arange(len(f0))
+     intp = interp1d(x_sample, f0, kind="linear")
+     f0_audiorate = intp(np.linspace(0, len(f0) - 1, len_y))
+     pitch_wave = np.sin((np.nan_to_num(f0_audiorate) / sr * 2 * np.pi).cumsum())
+     return pitch_wave
+
+
+ def fluidsynth_without_normalize(self, fs=44100, sf2_path=None):
+     """Synthesize using fluidsynth, without signal normalization.
+     Parameters
+     ----------
+     fs : int
+         Sampling rate to synthesize at.
+     sf2_path : str
+         Path to a .sf2 file.
+         Default ``None``, which uses the TimGM6mb.sf2 file included with
+         ``pretty_midi``.
+     Returns
+     -------
+     synthesized : np.ndarray
+         Waveform of the MIDI data, synthesized at ``fs``.
+     """
+     # If there are no instruments, or all instruments have no notes, return
+     # an empty array
+     if len(self.instruments) == 0 or all(len(i.notes) == 0 for i in self.instruments):
+         return np.array([])
+     # Get synthesized waveform for each instrument
+     waveforms = [i.fluidsynth(fs=fs, sf2_path=sf2_path) for i in self.instruments]
+     # Allocate output waveform, with #sample = max length of all waveforms
+     synthesized = np.zeros(np.max([w.shape[0] for w in waveforms]))
+     # Sum all waveforms in
+     for waveform in waveforms:
+         synthesized[: waveform.shape[0]] += waveform
+     # Normalize
+     # synthesized /= np.abs(synthesized).max()
+     return synthesized
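
Finally, a tiny usage sketch for `normalize` and `get_stereo` (dummy signals, not part of the commit):

```python
import numpy as np
from utils.dsp import get_stereo, normalize

sr = 22050
pop_y = normalize(np.random.randn(sr * 3))   # 3 s of "pop" audio, scaled to [-1, 1]
midi_y = np.random.randn(sr * 2)             # 2 s of rendered piano

stereo = get_stereo(pop_y, midi_y, pop_scale=0.5)
print(stereo.shape)  # (2, sr * 3): channel 0 = piano (zero-padded), channel 1 = scaled pop
# soundfile expects (frames, channels), hence the stereo.T writes used elsewhere in this commit
```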