Commit 1a572e4
Parent(s): 705ff7f
codebase_change (#6)
- codebase switched to HF transformers (26cb74172642e5881ada1f6b423c37f43e16c1e0)
- removed the last 2 examples (46b58a01fee498aabd4d3498364679fd1c7e0ad9)
Co-authored-by: Susnato Dhar <susnato@users.noreply.huggingface.co>
- README.md +9 -7
- app.py +134 -66
- config.yaml +0 -61
- examples/BornThisWay.mp3 +0 -0
- examples/Sk8erBoi.mp3 +0 -0
- examples/custom_song.mp3 +0 -0
- layer/__init__.py +0 -0
- layer/input.py +0 -46
- midi_tokenizer.py +0 -430
- packages.txt +2 -1
- preprocess/README.md +0 -36
- preprocess/beat_quantizer.py +0 -111
- preprocess/bpm_quantize.py +0 -98
- preprocess/melody_accuracy.py +0 -81
- preprocess/pop_align.py +0 -331
- preprocess/split_spleeter.py +0 -72
- requirements.txt +9 -8
- transformer_wrapper.py +0 -330
- utils/__init__.py +0 -0
- utils/dsp.py +0 -63
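
The commit message above says the codebase switched to the HF transformers port of Pop2Piano. A minimal sketch of that inference path, condensed from the new app.py in the diff below (the checkpoint name and processor/model calls are exactly as they appear there; `"audio.mp3"` is a hypothetical input file):

```python
# Minimal sketch of the Pop2Piano inference path this commit switches to.
# Checkpoint and API calls are taken from the new app.py below; the input file is a placeholder.
import librosa
import torch
from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano").to(device)
processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano")

# a fixed 44100 Hz load avoids the silent quality issue the app's comment warns about with sr=None
waveform, sr = librosa.load("audio.mp3", sr=44100)
inputs = processor(audio=waveform, sampling_rate=sr, return_tensors="pt").to(device)
output = model.generate(input_features=inputs["input_features"], composer="composer1")
midi = processor.batch_decode(
    token_ids=output.to("cpu"), feature_extractor_output=inputs.to("cpu")
)["pretty_midi_objects"][0]
midi.write("cover.mid")  # pretty_midi object written to disk
```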
README.md
CHANGED
@@ -1,10 +1,12 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Pop2piano Dev
+emoji: 🏢
+colorFrom: pink
+colorTo: green
 sdk: gradio
-sdk_version: 3.
+sdk_version: 3.42.0
 app_file: app.py
-pinned:
----
+pinned: false
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
CHANGED
@@ -1,54 +1,83 @@
+import os
 import torch
+import librosa
+import binascii
+import warnings
+import midi2audio # to convert midi to wav
+import numpy as np
+import pytube as pt # to download the youtube videos as audios
 import gradio as gr
-import
-from
- [47 removed lines (old 5-51) were not rendered in the diff view]
+import soundfile as sf # to make the stereo mix
+from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor
+
+
+yt_video_dir = "./yt_dir"
+outputs_dir = "./midi_wav_outputs"
+os.makedirs(outputs_dir, exist_ok=True)
+os.makedirs(yt_video_dir, exist_ok=True)
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano").to(device)
+processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano")
+composers = model.generation_config.composer_to_feature_token.keys()
+
+
+def get_audio_from_yt_video(yt_link):
+    try:
+        yt = pt.YouTube(yt_link)
+        t = yt.streams.filter(only_audio=True)
+        filename = os.path.join(yt_video_dir, binascii.hexlify(os.urandom(8)).decode() + ".mp4")
+        t[0].download(filename=filename)
+    except:
+        warnings.warn(f"Video Not Found at {yt_link}")
+        filename = None
+
+    return filename, filename
+
+def inference(file_uploaded, composer):
+    # to save the native sampling rate of the file, sr=None is used, but this can cause some silent errors where the
+    # generated output will not be upto the desired quality. If that happens please consider switching sr to 44100 Hz.
+    waveform, sr = librosa.load(file_uploaded, sr=None)
+
+    inputs = processor(audio=waveform, sampling_rate=sr, return_tensors="pt").to(device)
+    model_output = model.generate(input_features=inputs["input_features"], composer=composer)
+    tokenizer_output = processor.batch_decode(token_ids=model_output.to("cpu"), feature_extractor_output=inputs.to("cpu"))["pretty_midi_objects"]
+
+    return prepare_output_file(tokenizer_output, sr)
+
+def prepare_output_file(tokenizer_output, sr):
+    # Add some random values so that no two file names are same
+    output_file_name = "output_" + binascii.hexlify(os.urandom(8)).decode()
+    midi_output = os.path.join(outputs_dir, output_file_name + ".mid")
+
+    # write the .mid file
+    tokenizer_output[0].write(midi_output)
+
+    # convert .mid file to .wav using `midi2audio`
+    wav_output = midi_output.replace(".mid", ".wav")
+    midi2audio.FluidSynth().midi_to_audio(midi_output, wav_output)
+
+    return wav_output, wav_output, midi_output
+
+def get_stereo(pop_path, midi, pop_scale=0.5):
+    pop_y, sr = librosa.load(pop_path, sr=None)
+    midi_y, _ = librosa.load(midi.name, sr=None)
+
+    if len(pop_y) > len(midi_y):
+        midi_y = np.pad(midi_y, (0, len(pop_y) - len(midi_y)))
+    elif len(pop_y) < len(midi_y):
+        pop_y = np.pad(pop_y, (0, -len(pop_y) + len(midi_y)))
+    stereo = np.stack((midi_y, pop_y * pop_scale))
+
+    stereo_mix_path = pop_path.replace("output", "output_stereo_mix")
+    sf.write(file=stereo_mix_path, data=stereo.T, samplerate=sr, format="wav",)
+
+    return stereo_mix_path, stereo_mix_path
+
+
+# Thanks a lot to "https://huggingface.co/Taithrah" for this theme.
+# taken from https://huggingface.co/spaces/NoCrypt/miku
+block = gr.Blocks(theme="Taithrah/Minimal")
 
 with block:
     gr.HTML(
@@ -67,38 +96,77 @@ with block:
            </h1>
        </div>
        <p style="margin-bottom: 10px; font-size: 94%">
-          A demo for Pop2Piano:Pop Audio-based Piano Cover Generation
+          A demo for Pop2Piano:Pop Audio-based Piano Cover Generation.<br>
+          Please select the composer(Arranger) and upload the pop audio or enter the YouTube link and then click Generate.
        </p>
      </div>
        """
    )
    with gr.Group():
-        with gr.
-        with gr.
- [8 removed lines (old 78-85) were not rendered in the diff view]
+        with gr.Row(equal_height=True):
+            with gr.Column():
+                file_uploaded = gr.Audio(label="Upload an audio", type="filepath")
+            with gr.Column():
+                with gr.Row():
+                    yt_link = gr.Textbox(label="Enter YouTube Link of the Video", autofocus=True, lines=3)
+                    yt_btn = gr.Button("Download Audio from YouTube Link", size="lg")
+
+                yt_audio_path = gr.Audio(label="Audio Extracted from the YouTube Video", interactive=False)
+                yt_btn.click(get_audio_from_yt_video, inputs=[yt_link], outputs=[yt_audio_path, file_uploaded])
+
+    with gr.Group():
+        with gr.Column():
+            composer = gr.Dropdown(label="Arranger", choices=composers, value="composer1")
+            generate_btn = gr.Button("Generate")
+
+        with gr.Row().style(mobile_collapse=False, equal_height=True):
+            wav_output2 = gr.File(label="Download the Generated MIDI (.wav)")
+            wav_output1 = gr.Audio(label="Listen to the Generated MIDI")
+            midi_output = gr.File(label="Download the Generated MIDI (.mid)")
+        generate_btn.click(inference,
+                           inputs=[file_uploaded, composer],
+                           outputs=[wav_output1, wav_output2, midi_output])
+
+    with gr.Group():
+        gr.HTML(
+            """
+            <div> <h3> <center> Get the Stereo Mix from the Pop Music and Generated MIDI </h3> </div>
+            """
+        )
+        pop_scale = gr.Slider(0, 1, value=0.5, label="Choose the ratio between Pop and MIDI", info="1.0 = Only Pop, 0.0=Only MIDI", interactive=True),
+        stereo_btn = gr.Button("Get Stereo Mix")
+        with gr.Row():
+            stereo_mix1 = gr.Audio(label="Listen to the Stereo Mix")
+            stereo_mix2 = gr.File(label="Download the Stereo Mix")
+
+        stereo_btn.click(get_stereo, inputs=[file_uploaded, wav_output2, pop_scale[0]], outputs=[stereo_mix1, stereo_mix2])
+
+    with gr.Group():
        gr.Examples([
-            ["./examples/
-            ["./examples/Sk8erBoi.mp3", "composer2"]
+            ["./examples/custom_song.mp3", "composer1"],
            ],
            fn=inference,
-            inputs=[
-            outputs=[
+            inputs=[file_uploaded, composer],
+            outputs=[wav_output1, wav_output2, midi_output],
            cache_examples=True
        )
        gr.HTML(
            """
            <div class="footer">
-                <
+                <center>The design for this Space is taken from <a href="https://huggingface.co/spaces/NoCrypt/miku"> NoCrypt/miku </a>
+            </div>
+            """
+        )
+
+        gr.HTML(
+            """
+            <div class="footer">
+                <center><p><a href="http://sweetcocoa.github.io/pop2piano_samples" style="text-decoration: underline;" target="_blank">Project Page</a>
+                <center><a href="https://huggingface.co/docs/transformers/main/model_doc/pop2piano" style="text-decoration: underline;" target="_blank">HuggingFace Model Docs</a>
+                <center><a href="https://github.com/sweetcocoa/pop2piano" style="text-decoration: underline;" target="_blank">Github</a>
            </p>
            </div>
            """
        )
 
-block.launch(debug=
+block.launch(debug=False)
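
A standalone sketch of the post-processing steps in the new app.py above: render the generated MIDI to audio with FluidSynth and mix it with the original pop track. File names here are placeholders; the padding/stacking logic mirrors `get_stereo` in the diff.

```python
# Hedged sketch of prepare_output_file + get_stereo from the diff above; paths are hypothetical.
import librosa
import midi2audio
import numpy as np
import soundfile as sf

# render the generated .mid to .wav (requires the fluidsynth system package from packages.txt)
midi2audio.FluidSynth().midi_to_audio("cover.mid", "cover.wav")

pop_y, sr = librosa.load("audio.mp3", sr=None)
midi_y, _ = librosa.load("cover.wav", sr=None)  # note: sr=None keeps each file's native rate

# pad the shorter signal so both channels have the same length, then attenuate the pop track
if len(pop_y) > len(midi_y):
    midi_y = np.pad(midi_y, (0, len(pop_y) - len(midi_y)))
elif len(pop_y) < len(midi_y):
    pop_y = np.pad(pop_y, (0, len(midi_y) - len(pop_y)))
stereo = np.stack((midi_y, 0.5 * pop_y))  # piano on one channel, weighted pop on the other

sf.write("stereo_mix.wav", stereo.T, samplerate=sr, format="WAV")
```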
config.yaml
DELETED
@@ -1,61 +0,0 @@
-project: pop2piano
-dataset:
-  target_length: 256
-  input_length: 1024
-  n_bars: 2
-  sample_rate: 22050
-  use_mel: true
-  mel_is_conditioned: true
-  composer_to_feature_token:
-    composer1: 2052
-    composer2: 2053
-    composer3: 2054
-    composer4: 2055
-    composer5: 2056
-    composer6: 2057
-    composer7: 2058
-    composer8: 2059
-    composer9: 2060
-    composer10: 2061
-    composer11: 2062
-    composer12: 2063
-    composer13: 2064
-    composer14: 2065
-    composer15: 2066
-    composer16: 2067
-    composer17: 2068
-    composer18: 2069
-    composer19: 2070
-    composer20: 2071
-    composer21: 2072
-t5:
-  feed_forward_proj: gated-gelu
-  tie_word_embeddings: false
-  tie_encoder_decoder: false
-  vocab_size: 2400
-  n_positions: 1024
-  relative_attention_num_buckets: 32
-tokenizer:
-  vocab_size:
-    special: 4
-    note: 128
-    velocity: 2
-    time: 100
-training:
-  seed: 3407
-  resume: false
-  offline: false
-  num_gpu: 1
-  max_epochs: 5000
-  accumulate_grad_batches: 1
-  check_val_every_n_epoch: 20
-  find_lr: false
-  optimizer: adafactor
-  version: none
-  lr: 0.001
-  lr_min: 1.0e-06
-  lr_scheduler: false
-  lr_decay: 0.99
-  batch_size: 32
-  num_workers: 32
-  gradient_clip_val: 3.0
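
The deleted config.yaml carried the composer-to-token mapping; in the HF transformers port that mapping ships with the model's generation config, which is how the new app.py builds its "Arranger" dropdown. A short sketch (checkpoint name as in the diff):

```python
# The composer mapping deleted above (composer1: 2052 ... composer21: 2072) now comes
# from the checkpoint's generation config instead of a local config.yaml.
from transformers import Pop2PianoForConditionalGeneration

model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano")
print(model.generation_config.composer_to_feature_token)  # dict of composer name -> feature token id
```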
examples/BornThisWay.mp3
DELETED
Binary file (482 kB)

examples/Sk8erBoi.mp3
DELETED
Binary file (673 kB)

examples/custom_song.mp3
ADDED
Binary file (247 kB)

layer/__init__.py
DELETED
File without changes
layer/input.py
DELETED
@@ -1,46 +0,0 @@
-import torch
-import torch.nn as nn
-import torchaudio
-
-
-class LogMelSpectrogram(nn.Module):
-    def __init__(self) -> None:
-        super().__init__()
-        self.melspectrogram = torchaudio.transforms.MelSpectrogram(
-            sample_rate=22050,
-            n_fft=4096,
-            hop_length=1024,
-            f_min=10.0,
-            n_mels=512,
-        )
-
-    def forward(self, x):
-        # x : audio(batch, sample)
-        # X : melspec (batch, freq, frame)
-        with torch.no_grad():
-            with torch.cuda.amp.autocast(enabled=False):
-                X = self.melspectrogram(x)
-                X = X.clamp(min=1e-6).log()
-
-        return X
-
-
-class ConcatEmbeddingToMel(nn.Module):
-    def __init__(self, embedding_offset, n_vocab, n_dim) -> None:
-        super().__init__()
-        self.embedding = nn.Embedding(num_embeddings=n_vocab, embedding_dim=n_dim)
-        self.embedding_offset = embedding_offset
-
-    def forward(self, feature, index_value):
-        """
-        index_value : (batch, )
-        feature : (batch, time, feature_dim)
-        """
-        index_shifted = index_value - self.embedding_offset
-
-        # (batch, 1, feature_dim)
-        composer_embedding = self.embedding(index_shifted).unsqueeze(1)
-        # print(composer_embedding.shape, feature.shape)
-        # (batch, 1 + time, feature_dim)
-        inputs_embeds = torch.cat([composer_embedding, feature], dim=1)
-        return inputs_embeds
midi_tokenizer.py
DELETED
@@ -1,430 +0,0 @@
-import numpy as np
-from numba import jit
-import pretty_midi
-import scipy.interpolate as interp
-
-TOKEN_SPECIAL: int = 0
-TOKEN_NOTE: int = 1
-TOKEN_VELOCITY: int = 2
-TOKEN_TIME: int = 3
-
-DEFAULT_VELOCITY: int = 77
-
-TIE: int = 2
-EOS: int = 1
-PAD: int = 0
-
-
-def extrapolate_beat_times(beat_times, n_extend=1):
-    beat_times_function = interp.interp1d(
-        np.arange(beat_times.size),
-        beat_times,
-        bounds_error=False,
-        fill_value="extrapolate",
-    )
-
-    ext_beats = beat_times_function(
-        np.linspace(0, beat_times.size + n_extend - 1, beat_times.size + n_extend)
-    )
-
-    return ext_beats
-
-
-@jit(nopython=True, cache=True)
-def fast_tokenize(idx, token_type, n_special, n_note, n_velocity):
-    if token_type == TOKEN_TIME:
-        return n_special + n_note + n_velocity + idx
-    elif token_type == TOKEN_VELOCITY:
-        return n_special + n_note + idx
-    elif token_type == TOKEN_NOTE:
-        return n_special + idx
-    elif token_type == TOKEN_SPECIAL:
-        return idx
-    else:
-        return -1
-
-
-@jit(nopython=True, cache=True)
-def fast_detokenize(idx, n_special, n_note, n_velocity, time_idx_offset):
-    if idx >= n_special + n_note + n_velocity:
-        return (TOKEN_TIME, (idx - (n_special + n_note + n_velocity)) + time_idx_offset)
-    elif idx >= n_special + n_note:
-        return TOKEN_VELOCITY, idx - (n_special + n_note)
-    elif idx >= n_special:
-        return TOKEN_NOTE, idx - n_special
-    else:
-        return TOKEN_SPECIAL, idx
-
-
-class MidiTokenizer:
-    def __init__(self, config) -> None:
-        self.config = config
-
-    def tokenize_note(self, idx, token_type):
-        rt = fast_tokenize(
-            idx,
-            token_type,
-            self.config.vocab_size.special,
-            self.config.vocab_size.note,
-            self.config.vocab_size.velocity,
-        )
-        if rt == -1:
-            raise ValueError(f"type {type} is not a predefined token type.")
-        else:
-            return rt
-
-    def notes_to_tokens(self, notes):
-        """
-        notes : (onset idx, offset idx, pitch, velocity)
-        """
-        max_time_idx = notes[:, :2].max()
-
-        times = [[] for i in range((max_time_idx + 1))]
-        for onset, offset, pitch, velocity in notes:
-            times[onset].append([pitch, velocity])
-            times[offset].append([pitch, 0])
-
-        tokens = []
-        current_velocity = 0
-        for i, time in enumerate(times):
-            if len(time) == 0:
-                continue
-            tokens.append(self.tokenize_note(i, TOKEN_TIME))
-            for pitch, velocity in time:
-                velocity = int(velocity > 0)
-                if current_velocity != velocity:
-                    current_velocity = velocity
-                    tokens.append(self.tokenize_note(velocity, TOKEN_VELOCITY))
-                tokens.append(self.tokenize_note(pitch, TOKEN_NOTE))
-
-        return np.array(tokens, dtype=int)
-
-    def detokenize(self, token, time_idx_offset):
-        type, value = fast_detokenize(
-            token,
-            n_special=self.config.vocab_size.special,
-            n_note=self.config.vocab_size.note,
-            n_velocity=self.config.vocab_size.velocity,
-            time_idx_offset=time_idx_offset,
-        )
-        if type != TOKEN_TIME:
-            value = int(value)
-        return [type, value]
-
-    def to_string(self, tokens, time_idx_offset=0):
-        nums = [
-            self.detokenize(token, time_idx_offset=time_idx_offset) for token in tokens
-        ]
-        strings = []
-        for i in range(len(nums)):
-            type = nums[i][0]
-            value = nums[i][1]
-
-            if type == TOKEN_TIME:
-                type = "time"
-            elif type == TOKEN_SPECIAL:
-                if value == EOS:
-                    value = "EOS"
-                elif value == PAD:
-                    value = "PAD"
-                elif value == TIE:
-                    value = "TIE"
-                else:
-                    value = "Unknown Special"
-            elif type == TOKEN_NOTE:
-                type = "note"
-            elif type == TOKEN_VELOCITY:
-                type = "velocity"
-            strings.append((type, value))
-        return strings
-
-    def split_notes(self, notes, beatsteps, time_from, time_to):
-        """
-        Assumptions
-        - notes are sorted by onset time
-        - beatsteps are sorted by time
-        """
-        start_idx = np.searchsorted(beatsteps, time_from)
-        start_note = np.searchsorted(notes[:, 0], start_idx)
-
-        end_idx = np.searchsorted(beatsteps, time_to)
-        end_note = np.searchsorted(notes[:, 0], end_idx)
-        splited_notes = notes[start_note:end_note]
-
-        return splited_notes, (start_idx, end_idx, start_note, end_note)
-
-    def notes_to_relative_tokens(
-        self, notes, offset_idx, add_eos=False, add_composer=False, composer_value=None
-    ):
-        """
-        notes : (onset idx, offset idx, pitch, velocity)
-        """
-
-        def _add_eos(tokens):
-            tokens = np.concatenate((tokens, np.array([EOS], dtype=tokens.dtype)))
-            return tokens
-
-        def _add_composer(tokens, composer_value):
-            tokens = np.concatenate(
-                (np.array([composer_value], dtype=tokens.dtype), tokens)
-            )
-            return tokens
-
-        if len(notes) == 0:
-            tokens = np.array([], dtype=int)
-            if add_eos:
-                tokens = _add_eos(tokens)
-            if add_composer:
-                tokens = _add_composer(tokens, composer_value=composer_value)
-            return tokens
-
-        max_time_idx = notes[:, :2].max()
-
-        # times[time_idx] = [[pitch, .. ], [pitch, 0], ..]
-        times = [[] for i in range((max_time_idx + 1 - offset_idx))]
-        for abs_onset, abs_offset, pitch, velocity in notes:
-            rel_onset = abs_onset - offset_idx
-            rel_offset = abs_offset - offset_idx
-            times[rel_onset].append([pitch, velocity])
-            times[rel_offset].append([pitch, 0])
-
-        # from here on, everything is relative to time 0 (the offset)
-        tokens = []
-        current_velocity = 0
-        current_time_idx = 0
-
-        for rel_idx, time in enumerate(times):
-            if len(time) == 0:
-                continue
-            time_idx_shift = rel_idx - current_time_idx
-            current_time_idx = rel_idx
-
-            tokens.append(self.tokenize_note(time_idx_shift, TOKEN_TIME))
-            for pitch, velocity in time:
-                velocity = int(velocity > 0)
-                if current_velocity != velocity:
-                    current_velocity = velocity
-                    tokens.append(self.tokenize_note(velocity, TOKEN_VELOCITY))
-                tokens.append(self.tokenize_note(pitch, TOKEN_NOTE))
-
-        tokens = np.array(tokens, dtype=int)
-        if add_eos:
-            tokens = _add_eos(tokens)
-        if add_composer:
-            tokens = _add_composer(tokens, composer_value=composer_value)
-        return tokens
-
-    def relative_batch_tokens_to_midi(
-        self,
-        tokens,
-        beatstep,
-        beat_offset_idx=None,
-        bars_per_batch=None,
-        cutoff_time_idx=None,
-    ):
-        """
-        tokens : (batch, sequence)
-        beatstep : (times, )
-        """
-        beat_offset_idx = 0 if beat_offset_idx is None else beat_offset_idx
-        notes = None
-        bars_per_batch = 2 if bars_per_batch is None else bars_per_batch
-
-        N = len(tokens)
-        for n in range(N):
-            _tokens = tokens[n]
-            _start_idx = beat_offset_idx + n * bars_per_batch * 4
-            _cutoff_time_idx = cutoff_time_idx + _start_idx
-            _notes = self.relative_tokens_to_notes(
-                _tokens,
-                start_idx=_start_idx,
-                cutoff_time_idx=_cutoff_time_idx,
-            )
-            # print(_notes, "\n-------")
-            if len(_notes) == 0:
-                pass
-                # print("_notes zero")
-            elif notes is None:
-                notes = _notes
-            else:
-                notes = np.concatenate((notes, _notes), axis=0)
-
-        if notes is None:
-            notes = []
-        midi = self.notes_to_midi(notes, beatstep, offset_sec=beatstep[beat_offset_idx])
-        return midi, notes
-
-    def relative_tokens_to_notes(self, tokens, start_idx, cutoff_time_idx=None):
-        # TODO remove legacy
-        # if the first decoded token is the composer (arranger) token
-        if tokens[0] >= sum(self.config.vocab_size.values()):
-            tokens = tokens[1:]
-
-        words = [self.detokenize(token, time_idx_offset=0) for token in tokens]
-
-        if hasattr(start_idx, "item"):
-            """
-            if numpy or torch tensor
-            """
-            start_idx = start_idx.item()
-
-        current_idx = start_idx
-        current_velocity = 0
-        note_onsets_ready = [None for i in range(self.config.vocab_size.note + 1)]
-        notes = []
-        for type, number in words:
-            if type == TOKEN_SPECIAL:
-                if number == EOS:
-                    break
-            elif type == TOKEN_TIME:
-                current_idx += number
-                if cutoff_time_idx is not None:
-                    current_idx = min(current_idx, cutoff_time_idx)
-
-            elif type == TOKEN_VELOCITY:
-                current_velocity = number
-            elif type == TOKEN_NOTE:
-                pitch = number
-                if current_velocity == 0:
-                    # note_offset
-                    if note_onsets_ready[pitch] is None:
-                        # offset without onset
-                        pass
-                    else:
-                        onset_idx = note_onsets_ready[pitch]
-                        if onset_idx >= current_idx:
-                            # No time shift after previous note_on
-                            pass
-                        else:
-                            offset_idx = current_idx
-                            notes.append(
-                                [onset_idx, offset_idx, pitch, DEFAULT_VELOCITY]
-                            )
-                            note_onsets_ready[pitch] = None
-                else:
-                    # note_on
-                    if note_onsets_ready[pitch] is None:
-                        note_onsets_ready[pitch] = current_idx
-                    else:
-                        # note-on already exists
-                        onset_idx = note_onsets_ready[pitch]
-                        if onset_idx >= current_idx:
-                            # No time shift after previous note_on
-                            pass
-                        else:
-                            offset_idx = current_idx
-                            notes.append(
-                                [onset_idx, offset_idx, pitch, DEFAULT_VELOCITY]
-                            )
-                            note_onsets_ready[pitch] = current_idx
-            else:
-                raise ValueError
-
-        for pitch, note_on in enumerate(note_onsets_ready):
-            # force offset if no offset for each pitch
-            if note_on is not None:
-                if cutoff_time_idx is None:
-                    cutoff = note_on + 1
-                else:
-                    cutoff = max(cutoff_time_idx, note_on + 1)
-
-                offset_idx = max(current_idx, cutoff)
-                notes.append([note_on, offset_idx, pitch, DEFAULT_VELOCITY])
-
-        if len(notes) == 0:
-            return []
-        else:
-            notes = np.array(notes)
-            note_order = notes[:, 0] * 128 + notes[:, 1]
-            notes = notes[note_order.argsort()]
-            return notes
-
-    def notes_to_midi(self, notes, beatstep, offset_sec=None):
-        new_pm = pretty_midi.PrettyMIDI(resolution=384, initial_tempo=120.0)
-        new_inst = pretty_midi.Instrument(program=0)
-        new_notes = []
-        if offset_sec is None:
-            offset_sec = 0.0
-
-        for onset_idx, offset_idx, pitch, velocity in notes:
-            new_note = pretty_midi.Note(
-                velocity=velocity,
-                pitch=pitch,
-                start=beatstep[onset_idx] - offset_sec,
-                end=beatstep[offset_idx] - offset_sec,
-            )
-            new_notes.append(new_note)
-        new_inst.notes = new_notes
-        new_pm.instruments.append(new_inst)
-        new_pm.remove_invalid_notes()
-        return new_pm
-
-
-@jit(nopython=True, cache=False)
-def fast_notes_to_relative_tokens(
-    notes, offset_idx, max_time_idx, n_special, n_note, n_velocity
-):
-    """
-    notes : (onset idx, offset idx, pitch, velocity)
-    """
-
-    times_p = [np.array([], dtype=int) for i in range((max_time_idx + 1 - offset_idx))]
-    times_v = [np.array([], dtype=int) for i in range((max_time_idx + 1 - offset_idx))]
-
-    for abs_onset, abs_offset, pitch, velocity in notes:
-        rel_onset = abs_onset - offset_idx
-        rel_offset = abs_offset - offset_idx
-        times_p[rel_onset] = np.append(times_p[rel_onset], pitch)
-        times_v[rel_onset] = np.append(times_v[rel_onset], velocity)
-        times_p[rel_offset] = np.append(times_p[rel_offset], pitch)
-        times_v[rel_offset] = np.append(times_v[rel_offset], velocity)
-
-    # from here on, everything is relative to time 0 (the offset)
-    tokens = []
-    current_velocity = np.array([0])
-    current_time_idx = np.array([0])
-
-    # the range may be 0, so iterate by index
-    for i in range(len(times_p)):
-        rel_idx = i
-        notes_at_time = times_p[i]
-        if len(notes_at_time) == 0:
-            continue
-
-        time_idx_shift = rel_idx - current_time_idx[0]
-        current_time_idx[0] = rel_idx
-
-        token = fast_tokenize(
-            time_idx_shift,
-            TOKEN_TIME,
-            n_special=n_special,
-            n_note=n_note,
-            n_velocity=n_velocity,
-        )
-        tokens.append(token)
-
-        for j in range(len(notes_at_time)):
-            pitch = times_p[j]
-            velocity = times_v[j]
-            # for pitch, velocity in time:
-            velocity = int(velocity > 0)
-            if current_velocity[0] != velocity:
-                current_velocity[0] = velocity
-                token = fast_tokenize(
-                    velocity,
-                    TOKEN_VELOCITY,
-                    n_special=n_special,
-                    n_note=n_note,
-                    n_velocity=n_velocity,
-                )
-                tokens.append(token)
-            token = fast_tokenize(
-                pitch,
-                TOKEN_NOTE,
-                n_special=n_special,
-                n_note=n_note,
-                n_velocity=n_velocity,
-            )
-            tokens.append(token)
-
-    return np.array(tokens)
packages.txt
CHANGED
@@ -1 +1,2 @@
-fluidsynth
+fluidsynth
+ffmpeg
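
These are system packages the Space installs via apt: fluidsynth backs the MIDI-to-WAV rendering that midi2audio shells out to, and ffmpeg is used for decoding the mp3/mp4 audio handled by librosa and pytube. An illustrative sanity check (not part of the Space's code):

```python
# Illustrative check that the system packages listed above are on PATH for the Python stack.
import shutil

for tool in ("fluidsynth", "ffmpeg"):
    print(tool, "->", shutil.which(tool) or "not found")
```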
preprocess/README.md
DELETED
@@ -1,36 +0,0 @@
-# Preprocess Scripts
----
-- Note : the order of these scripts is IMPORTANT.
-- the preprocessing step is easy. but environment setting is not. please understand.
-- If you encounter any problems, please do not hesitate to email me or open an issue to the github.
-
-1. Transcribe piano wavs to midi
-    - You should transcribe {piano_cover_file.wav} -> {piano_cover_file.mid}
-    - I recommend you to use original codes from this repo : [High-resolution Piano Transcription with Pedals by Regressing Onsets and Offsets Times](https://github.com/qiuqiangkong/piano_transcription_inference)
-
-    - Instead, you can also you my docker script.
-    ```bash
-    docker run -it --gpus all --rm -v /DIRECTORY_THAT_CONTAINS_PIANO_WAV/:/input -v /DIRECTORY_THAT_MIDI_OUTPUT/:/output jonghochoi/piano_transcribe:bytedance1
-    ```
-    - If you are using GPU RTX 30XX or higher, this script may not work properly. It's because the version of pytorch is too low(1.4).
-    - then upgrade the version of pytorch in the docker..
-
-2. Estimate Pop's beats
-```bash
-python bpm_quantize.py DATA_DIR
-```
-
-3. synchronize midi
-```bash
-python pop_align.py DATA_DIR
-```
-
-4. get separated vocal track
-```bash
-python split_spleeter.py DATA_DIR
-```
-
-5. caculate melody chroma accuracy
-```bash
-python melody_accuracy.py DATA_DIR
-```
preprocess/beat_quantizer.py
DELETED
@@ -1,111 +0,0 @@
-import copy
-import librosa
-import essentia
-import essentia.standard
-import numpy as np
-import scipy.interpolate as interp
-import note_seq
-
-SAMPLERATE = 44100
-
-
-def nearest_onset_offset_digitize(on, off, bins):
-    intermediate = (bins[1:] + bins[:-1]) / 2
-    on_idx = np.digitize(on, intermediate)
-    off_idx = np.digitize(off, intermediate)
-    off_idx[on_idx == off_idx] += 1
-    # off_idx = np.clip(off_idx, a_min=0, a_max=len(bins) - 1)
-    return on_idx, off_idx
-
-
-def apply_sustain_pedal(pm):
-    ns = note_seq.midi_to_note_sequence(pm)
-    susns = note_seq.apply_sustain_control_changes(ns)
-    suspm = note_seq.note_sequence_to_pretty_midi(susns)
-    return suspm
-
-
-def interpolate_beat_times(beat_times, steps_per_beat, extend=False):
-    beat_times_function = interp.interp1d(
-        np.arange(beat_times.size),
-        beat_times,
-        bounds_error=False,
-        fill_value="extrapolate",
-    )
-    if extend:
-        beat_steps_8th = beat_times_function(
-            np.linspace(0, beat_times.size, beat_times.size * steps_per_beat + 1)
-        )
-    else:
-        beat_steps_8th = beat_times_function(
-            np.linspace(0, beat_times.size - 1, beat_times.size * steps_per_beat - 1)
-        )
-    return beat_steps_8th
-
-
-def midi_quantize_by_beats(
-    sample, beat_times, steps_per_beat, ignore_sustain_pedal=False
-):
-    ns = note_seq.midi_file_to_note_sequence(sample.midi)
-    if ignore_sustain_pedal:
-        susns = ns
-    else:
-        susns = note_seq.apply_sustain_control_changes(ns)
-
-    qns = copy.deepcopy(susns)
-
-    notes = np.array([[n.start_time, n.end_time] for n in susns.notes])
-    note_attributes = np.array([[n.pitch, n.velocity] for n in susns.notes])
-
-    note_ons = np.array(notes[:, 0])
-    note_offs = np.array(notes[:, 1])
-
-    beat_steps_8th = interpolate_beat_times(beat_times, steps_per_beat, extend=False)
-
-    on_idx, off_idx = nearest_onset_offset_digitize(note_ons, note_offs, beat_steps_8th)
-
-    beat_steps_8th = interpolate_beat_times(beat_times, steps_per_beat, extend=True)
-
-    discrete_notes = np.concatenate(
-        (np.stack((on_idx, off_idx), axis=1), note_attributes), axis=1
-    )
-
-    def delete_duplicate_notes(dnotes):
-        note_order = dnotes[:, 0] * 128 + dnotes[:, 2]
-        dnotes = dnotes[note_order.argsort()]
-        indices = []
-        for i in range(1, len(dnotes)):
-            if dnotes[i, 0] == dnotes[i - 1, 0] and dnotes[i, 2] == dnotes[i - 1, 2]:
-                indices.append(i)
-        dnotes = np.delete(dnotes, indices, axis=0)
-        note_order = dnotes[:, 0] * 128 + dnotes[:, 1]
-        dnotes = dnotes[note_order.argsort()]
-        return dnotes
-
-    discrete_notes = delete_duplicate_notes(discrete_notes)
-
-    digitized_note_ons, digitized_note_offs = (
-        beat_steps_8th[on_idx],
-        beat_steps_8th[off_idx],
-    )
-
-    for i, note in enumerate(qns.notes):
-        note.start_time = digitized_note_ons[i]
-        note.end_time = digitized_note_offs[i]
-
-    return qns, discrete_notes, beat_steps_8th
-
-
-def extract_rhythm(song, y=None):
-    if y is None:
-        y, sr = librosa.load(song, sr=SAMPLERATE)
-
-    essentia_tracker = essentia.standard.RhythmExtractor2013(method="multifeature")
-    (
-        bpm,
-        beat_times,
-        confidence,
-        estimates,
-        essentia_beat_intervals,
-    ) = essentia_tracker(y)
-    return bpm, beat_times, confidence, estimates, essentia_beat_intervals
preprocess/bpm_quantize.py
DELETED
@@ -1,98 +0,0 @@
-import glob
-import sys
-import os
-
-
-import librosa
-import soundfile as sf
-import numpy as np
-
-import note_seq
-from omegaconf import OmegaConf
-from beat_quantizer import extract_rhythm, midi_quantize_by_beats
-
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from midiaudiopair import MidiAudioPair
-from utils.dsp import get_stereo
-
-
-def estimate(meta_file, ignore_sustain_pedal):
-    sample = MidiAudioPair(meta_file)
-
-    if (
-        sample.error_code == MidiAudioPair.NO_PIANO
-        or sample.error_code == MidiAudioPair.NO_SONG_DIR
-        or sample.error_code == MidiAudioPair.NO_SONG
-    ):
-        return
-
-    bpm, beat_times, confidence, estimates, essentia_beat_intervals = extract_rhythm(sample.song)
-    beat_times = np.array(beat_times)
-    essentia_beat_intervals = np.array(essentia_beat_intervals)
-
-    qns, discrete_notes, beat_steps_8th = midi_quantize_by_beats(
-        sample, beat_times, 2, ignore_sustain_pedal=ignore_sustain_pedal
-    )
-
-    qpm = note_seq.note_sequence_to_pretty_midi(qns)
-    qpm.instruments[0].control_changes = []
-    qpm.write(sample.qmidi)
-    y, sr = librosa.load(sample.song, sr=None)
-    qpm_y = qpm.fluidsynth(sr)
-    qmix = get_stereo(y, qpm_y, 0.4)
-    sf.write(file=sample.qmix, data=qmix.T, samplerate=sr, format="flac")
-
-    meta = OmegaConf.load(meta_file)
-    meta.tempo = OmegaConf.create()
-    meta.tempo.bpm = bpm
-    meta.tempo.confidence = confidence
-    OmegaConf.save(meta, meta_file)
-
-    np.save(sample.notes, discrete_notes)
-    np.save(sample.beatstep, beat_steps_8th)
-    np.save(sample.beattime, beat_times)
-    np.save(sample.beatinterval, essentia_beat_intervals)
-
-
-def main(meta_files, ignore_sustain_pedal):
-    from tqdm import tqdm
-    import multiprocessing
-    from joblib import Parallel, delayed
-
-    def files():
-        pbar = tqdm(meta_files)
-        for meta_file in pbar:
-            pbar.set_description(meta_file)
-            yield meta_file
-
-    Parallel(n_jobs=multiprocessing.cpu_count() // 2)(
-        delayed(estimate)(meta_file, ignore_sustain_pedal) for meta_file in files()
-    )
-
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser(description="bpm estimate using essentia")
-
-    parser.add_argument(
-        "data_dir",
-        type=str,
-        default=None,
-        help="""directory contains {id}/{pop_filename.wav}
-        """,
-    )
-
-    parser.add_argument(
-        "--ignore_sustain_pedal",
-        default=False,
-        action="store_true",
-        help="whether dry_run",
-    )
-
-    args = parser.parse_args()
-
-    meta_files = sorted(glob.glob(args.data_dir + "/*.yaml"))
-    print("meta ", len(meta_files))
-
-    main(meta_files, args.ignore_sustain_pedal)
preprocess/melody_accuracy.py
DELETED
@@ -1,81 +0,0 @@
-import glob
-import sys
-import os
-
-import librosa
-import pretty_midi
-
-from omegaconf import OmegaConf
-
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from midiaudiopair import MidiAudioPair
-from evaluate import midi_melody_accuracy as ma
-
-
-def estimate(meta_file):
-
-    import warnings
-
-    warnings.filterwarnings(action="ignore")
-
-    sample = MidiAudioPair(meta_file)
-
-    if (
-        sample.error_code == MidiAudioPair.NO_PIANO
-        or sample.error_code == MidiAudioPair.NO_SONG_DIR
-        or sample.error_code == MidiAudioPair.NO_SONG
-    ):
-        return
-
-    if "vocals" in sample.invalids:
-        print("no vocal:", meta_file)
-        return
-
-    midi = pretty_midi.PrettyMIDI(sample.qmidi)
-    vocals, sr = librosa.load(sample.vocals, sr=44100)
-
-    chroma_accuracy, pitch_accuracy = ma.evaluate_melody(
-        midi, vocals, sr=sr, hop_length=1024
-    )
-    meta = OmegaConf.load(meta_file)
-    meta.eval = OmegaConf.create()
-    meta.eval.melody_chroma_accuracy = chroma_accuracy.item()
-    meta.eval.melody_pitch_accuracy = pitch_accuracy.item()
-    OmegaConf.save(meta, meta_file)
-
-
-def main(meta_files):
-    from tqdm import tqdm
-    import multiprocessing
-    from joblib import Parallel, delayed
-
-    def files():
-        pbar = tqdm(meta_files)
-        for meta_file in pbar:
-            pbar.set_description(meta_file)
-            yield meta_file
-
-    Parallel(n_jobs=multiprocessing.cpu_count() // 2)(
-        delayed(estimate)(meta_file) for meta_file in files()
-    )
-
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser(description="bpm estimate using essentia")
-
-    parser.add_argument(
-        "data_dir",
-        type=str,
-        default=None,
-        help="""directory contains {id}/{pop_filename.wav}
-        """,
-    )
-
-    args = parser.parse_args()
-
-    meta_files = sorted(glob.glob(args.data_dir + "/**/*.yaml", recursive=True))
-    print("meta ", len(meta_files))
-
-    main(meta_files)
preprocess/pop_align.py
DELETED
@@ -1,331 +0,0 @@
-import librosa
-import soundfile as sf
-import glob
-import os
-import copy
-import sys
-
-import numpy as np
-import pyrubberband as pyrb
-import pretty_midi
-from omegaconf import OmegaConf
-from tqdm.auto import tqdm
-
-from synctoolbox.dtw.mrmsdtw import sync_via_mrmsdtw
-from synctoolbox.dtw.utils import (
-    compute_optimal_chroma_shift,
-    shift_chroma_vectors,
-    make_path_strictly_monotonic,
-)
-from synctoolbox.feature.chroma import (
-    pitch_to_chroma,
-    quantize_chroma,
-    quantized_chroma_to_CENS,
-)
-from synctoolbox.feature.dlnco import pitch_onset_features_to_DLNCO
-from synctoolbox.feature.pitch import audio_to_pitch_features
-from synctoolbox.feature.pitch_onset import audio_to_pitch_onset_features
-from synctoolbox.feature.utils import estimate_tuning
-
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-print(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from utils.dsp import normalize, get_stereo
-from midiaudiopair import MidiAudioPair
-
-Fs = 22050
-feature_rate = 50
-step_weights = np.array([1.5, 1.5, 2.0])
-threshold_rec = 10 ** 6
-
-
-def save_delayed_song(
-    sample,
-    dry_run,
-):
-    import warnings
-
-    warnings.filterwarnings(action="ignore")
-
-    song_audio, _ = librosa.load(sample.original_song, Fs)
-    midi_pm = pretty_midi.PrettyMIDI(sample.original_midi)
-
-    if np.power(song_audio, 2).sum() < 1:  # low energy: invalid file
-        print("invalid audio :", sample.original_song)
-        sample.delete_files_myself()
-        return
-
-    rd = get_aligned_results(midi_pm=midi_pm, song_audio=song_audio)
-
-    mix_song = rd["mix_song"]
-    song_pitch_shifted = rd["song_pitch_shifted"]
-    midi_warped_pm = rd["midi_warped_pm"]
-    pitch_shift_for_song_audio = rd["pitch_shift_for_song_audio"]
-    tuning_offset_song = rd["tuning_offset_song"]
-    tuning_offset_piano = rd["tuning_offset_piano"]
-
-    try:
-        if dry_run:
-            print("write audio files: ", sample.song)
-        else:
-            sf.write(
-                file=sample.song,
-                data=song_pitch_shifted,
-                samplerate=Fs,
-                format="wav",
-            )
-    except:
-        print("Fail : ", sample.song)
-
-    try:
-        if dry_run:
-            print("write warped midi :", sample.midi)
-        else:
-            midi_warped_pm.write(sample.midi)
-
-    except:
-        midi_warped_pm._tick_scales = midi_pm._tick_scales
-        try:
-            if dry_run:
-                print("write warped midi2 :", sample.midi)
-            else:
-                midi_warped_pm.write(sample.midi)
-
-        except:
-            print("ad-hoc failed midi : ", sample.midi)
-        print("ad-hoc midi : ", sample.midi)
-
-    sample.yaml.song.pitch_shift = pitch_shift_for_song_audio.item()
-    sample.yaml.song.tuning_offset = tuning_offset_song.item()
-    sample.yaml.piano.tuning_offset = tuning_offset_piano.item()
-    OmegaConf.save(sample.yaml, sample.yaml_path)
-
-
-def get_aligned_results(midi_pm, song_audio):
-    piano_audio = midi_pm.fluidsynth(Fs)
-
-    song_audio = normalize(song_audio)
-
-    # The reason for estimating tuning ::
-    # https://www.audiolabs-erlangen.de/resources/MIR/FMP/C3/C3S1_TranspositionTuning.html
-    tuning_offset_1 = estimate_tuning(song_audio, Fs)
-    tuning_offset_2 = estimate_tuning(piano_audio, Fs)
-
-    # DLNCO features (Sebastian Ewert, Meinard Müller, and Peter Grosche: High Resolution Audio Synchronization Using Chroma Onset Features, In Proceedings of IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP): 1869–1872, 2009.):
-    # helpful to increase synchronization accuracy, especially for music with clear onsets.
-
-    # Quantized and smoothed chroma : CENS features
-    # Because, MrMsDTW Requires CENS.
-    f_chroma_quantized_1, f_DLNCO_1 = get_features_from_audio(
-        song_audio, tuning_offset_1
-    )
-    f_chroma_quantized_2, f_DLNCO_2 = get_features_from_audio(
-        piano_audio, tuning_offset_2
-    )
-
-    # Shift chroma vectors :
-    # Otherwise, different keys of two audio leads to degradation of alignment.
-    opt_chroma_shift = compute_optimal_chroma_shift(
-        quantized_chroma_to_CENS(f_chroma_quantized_1, 201, 50, feature_rate)[0],
-        quantized_chroma_to_CENS(f_chroma_quantized_2, 201, 50, feature_rate)[0],
-    )
-    f_chroma_quantized_2 = shift_chroma_vectors(f_chroma_quantized_2, opt_chroma_shift)
-    f_DLNCO_2 = shift_chroma_vectors(f_DLNCO_2, opt_chroma_shift)
-
-    wp = sync_via_mrmsdtw(
-        f_chroma1=f_chroma_quantized_1,
-        f_onset1=f_DLNCO_1,
-        f_chroma2=f_chroma_quantized_2,
-        f_onset2=f_DLNCO_2,
-        input_feature_rate=feature_rate,
-        step_weights=step_weights,
-        threshold_rec=threshold_rec,
-        verbose=False,
-    )
-
-    wp = make_path_strictly_monotonic(wp)
-    pitch_shift_for_song_audio = -opt_chroma_shift % 12
-    if pitch_shift_for_song_audio > 6:
-        pitch_shift_for_song_audio -= 12
-
-    if pitch_shift_for_song_audio != 0:
-        song_audio_shifted = pyrb.pitch_shift(
-            song_audio, Fs, pitch_shift_for_song_audio
-        )
-    else:
-        song_audio_shifted = song_audio
-
-    time_map_second = wp / feature_rate
-    midi_pm_warped = copy.deepcopy(midi_pm)
-
-    midi_pm_warped = simple_adjust_times(
-        midi_pm_warped, time_map_second[1], time_map_second[0]
-    )
-    piano_audio_warped = midi_pm_warped.fluidsynth(Fs)
-
-    song_audio_shifted = normalize(song_audio_shifted)
-    stereo_sonification_piano = get_stereo(song_audio_shifted, piano_audio_warped)
-
-    rd = dict(
-        mix_song=stereo_sonification_piano,
-        song_pitch_shifted=song_audio_shifted,
-        midi_warped_pm=midi_pm_warped,
-        pitch_shift_for_song_audio=pitch_shift_for_song_audio,
-        tuning_offset_song=tuning_offset_1,
-        tuning_offset_piano=tuning_offset_2,
-    )
-    return rd
-
-
-def simple_adjust_times(pm, original_times, new_times):
-    """
-    most of these codes are from original pretty_midi
-    https://github.com/craffel/pretty-midi/blob/main/pretty_midi/pretty_midi.py
-    """
-    for instrument in pm.instruments:
-        instrument.notes = [
-            copy.deepcopy(note)
-            for note in instrument.notes
-            if note.start >= original_times[0] and note.end <= original_times[-1]
-        ]
-    # Get array of note-on locations and correct them
-    note_ons = np.array(
-        [note.start for instrument in pm.instruments for note in instrument.notes]
-    )
-    adjusted_note_ons = np.interp(note_ons, original_times, new_times)
-    # Same for note-offs
-    note_offs = np.array(
-        [note.end for instrument in pm.instruments for note in instrument.notes]
-    )
-    adjusted_note_offs = np.interp(note_offs, original_times, new_times)
-    # Correct notes
-    for n, note in enumerate(
-        [note for instrument in pm.instruments for note in instrument.notes]
-    ):
-        note.start = (adjusted_note_ons[n] > 0) * adjusted_note_ons[n]
-        note.end = (adjusted_note_offs[n] > 0) * adjusted_note_offs[n]
-    # After performing alignment, some notes may have an end time which is
-    # on or before the start time. Remove these!
-    pm.remove_invalid_notes()
-
-    def adjust_events(event_getter):
-        """This function calls event_getter with each instrument as the
-        sole argument and adjusts the events which are returned."""
-        # Sort the events by time
-        for instrument in pm.instruments:
-            event_getter(instrument).sort(key=lambda e: e.time)
-        # Correct the events by interpolating
-        event_times = np.array(
-            [
-                event.time
-                for instrument in pm.instruments
-                for event in event_getter(instrument)
-            ]
-        )
-        adjusted_event_times = np.interp(event_times, original_times, new_times)
-        for n, event in enumerate(
-            [
-                event
-                for instrument in pm.instruments
-                for event in event_getter(instrument)
-            ]
-        ):
-            event.time = adjusted_event_times[n]
-        for instrument in pm.instruments:
-            # We want to keep only the final event which has time ==
-            # new_times[0]
-            valid_events = [
-                event
-                for event in event_getter(instrument)
-                if event.time == new_times[0]
-            ]
-            if valid_events:
-                valid_events = valid_events[-1:]
-            # Otherwise only keep events within the new set of times
-            valid_events.extend(
-                event
-                for event in event_getter(instrument)
-                if event.time > new_times[0] and event.time < new_times[-1]
-            )
-            event_getter(instrument)[:] = valid_events
-
-    # Correct pitch bends and control changes
-    adjust_events(lambda i: i.pitch_bends)
-    adjust_events(lambda i: i.control_changes)
-
-    return pm
-
-
-def get_features_from_audio(audio, tuning_offset, visualize=False):
-    f_pitch = audio_to_pitch_features(
-        f_audio=audio,
-        Fs=Fs,
-        tuning_offset=tuning_offset,
-        feature_rate=feature_rate,
-        verbose=visualize,
-    )
-    f_chroma = pitch_to_chroma(f_pitch=f_pitch)
-    f_chroma_quantized = quantize_chroma(f_chroma=f_chroma)
-
-    f_pitch_onset = audio_to_pitch_onset_features(
-        f_audio=audio, Fs=Fs, tuning_offset=tuning_offset, verbose=visualize
-    )
-    f_DLNCO = pitch_onset_features_to_DLNCO(
-        f_peaks=f_pitch_onset,
-        feature_rate=feature_rate,
-        feature_sequence_length=f_chroma_quantized.shape[1],
-        visualize=visualize,
-    )
-    return f_chroma_quantized, f_DLNCO
-
-
-def main(samples, dry_run):
-    import multiprocessing
-    from joblib import Parallel, delayed
-
-    Parallel(n_jobs=multiprocessing.cpu_count() // 2)(
-        delayed(save_delayed_song)(sample=sample, dry_run=dry_run)
-        for sample in tqdm(samples)
-    )
-
-
-if __name__ == "__main__":
-
-    import argparse
-
-    parser = argparse.ArgumentParser(description="piano cover downloader")
-
-    parser.add_argument(
-        "data_dir",
-        type=str,
-        default=None,
-        help="""directory contains {id}/{song_filename.wav}
-        """,
-    )
-    parser.add_argument(
-        "--dry_run", default=False, action="store_true", help="whether dry_run"
-    )
-
-    args = parser.parse_args()
-
-    def getfiles():
-        meta_files = sorted(glob.glob(args.data_dir + "/*.yaml"))
-        print("meta ", len(meta_files))
-
-        samples = list()
-        for meta_file in tqdm(meta_files):
-            m = MidiAudioPair(meta_file, auto_remove_no_song=True)
-            if m.error_code != MidiAudioPair.NO_SONG:
-                aux_txt = os.path.join(
-                    m.audio_dir,
-                    m.yaml.piano.ytid,
-                    f"{m.yaml.piano.title[:50]}___{m.yaml.song.title[:50]}.txt",
-                )
-                with open(aux_txt, "w") as f:
-                    f.write(".")
-                samples.append(m)
-
-        print(f"files available {len(samples)}")
-        return samples
-
-    samples = getfiles()
-    main(samples=samples, dry_run=args.dry_run)
preprocess/split_spleeter.py
DELETED
@@ -1,72 +0,0 @@
-import glob
-import os
-import random
-import sys
-
-from tqdm.auto import tqdm
-
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from midiaudiopair import MidiAudioPair
-
-
-def split_spleeter(meta_files):
-    # Use audio loader explicitly for loading audio waveform :
-    from spleeter.audio.adapter import AudioAdapter
-    from spleeter.separator import Separator
-    import spleeter
-
-    sample_rate = 44100
-    audio_loader = AudioAdapter.default()
-
-    # Using embedded configuration.
-    separator = Separator("spleeter:2stems")
-
-    for meta_file in tqdm(meta_files):
-        sample = MidiAudioPair(meta_file)
-        if sample.error_code == MidiAudioPair.NO_SONG:
-            continue
-        if os.path.exists(sample.vocals):
-            continue
-
-        waveform, _ = audio_loader.load(sample.song, sample_rate=sample_rate)
-
-        # Perform the separation :
-        prediction = separator.separate(waveform)
-
-        audio_loader.save(
-            path=sample.vocals,
-            data=prediction["vocals"][:, 0:1],
-            codec=spleeter.audio.Codec.MP3,
-            sample_rate=sample_rate,
-        )
-
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser(description="bpm estimate using essentia")
-
-    parser.add_argument(
-        "data_dir",
-        type=str,
-        default=None,
-        help="""directory contains {id}/{pop_filename.wav}
-        """,
-    )
-
-    parser.add_argument(
-        "--random_order",
-        default=False,
-        action="store_true",
-        help="Random order process (to run multiple process)",
-    )
-
-    args = parser.parse_args()
-
-    meta_files = sorted(glob.glob(args.data_dir + "/*.yaml"))
-    if args.random_order:
-        random.shuffle(meta_files)
-
-    print("meta ", len(meta_files))
-
-    split_spleeter(meta_files)

requirements.txt
CHANGED
@@ -1,10 +1,11 @@
+torch
+librosa
 pretty-midi==0.2.9
-
-transformers==4.16.1
-pytorch-lightning==1.8.3
-essentia==2.1b6.dev1034
-note-seq==0.0.5
+essentia==2.1b6.dev1034
 pyFluidSynth==1.3.0
-
-
-
+git+https://github.com/huggingface/transformers
+midi2audio
+pytube
+gradio
+resampy
+soundfile

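Note on the dependency change: the training-side packages (pytorch-lightning, note-seq) are dropped and transformers is now installed from its main branch, presumably to pick up the Pop2Piano classes imported in app.py, while midi2audio, pytube, gradio, resampy and soundfile cover audio download, rendering and the demo UI. A minimal sketch of how the new midi2audio dependency is typically used; the soundfont path below is an assumption, not taken from this commit:

import os
from midi2audio import FluidSynth

# Hypothetical soundfont location; packages.txt is expected to provide
# fluidsynth plus a General MIDI soundfont on the Space.
SOUNDFONT = "/usr/share/sounds/sf2/FluidR3_GM.sf2"

# Render a generated MIDI file to wav so the Gradio UI can play it back.
synth = FluidSynth(sound_font=SOUNDFONT) if os.path.exists(SOUNDFONT) else FluidSynth()
synth.midi_to_audio("output.mid", "output.wav")
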
transformer_wrapper.py
DELETED
@@ -1,330 +0,0 @@
-import os
-import random
-
-import numpy as np
-import librosa
-import torch
-
-import pytorch_lightning as pl
-import soundfile as sf
-from torch.nn.utils.rnn import pad_sequence
-from transformers import T5Config, T5ForConditionalGeneration
-
-from midi_tokenizer import MidiTokenizer, extrapolate_beat_times
-from layer.input import LogMelSpectrogram, ConcatEmbeddingToMel
-from preprocess.beat_quantizer import extract_rhythm, interpolate_beat_times
-from utils.dsp import get_stereo
-
-
-DEFAULT_COMPOSERS = {"various composer": 2052}
-
-
-class TransformerWrapper(pl.LightningModule):
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-
-        self.tokenizer = MidiTokenizer(config.tokenizer)
-        self.t5config = T5Config.from_pretrained("t5-small")
-
-        for k, v in config.t5.items():
-            self.t5config.__setattr__(k, v)
-
-        self.transformer = T5ForConditionalGeneration(self.t5config)
-        self.use_mel = self.config.dataset.use_mel
-        self.mel_is_conditioned = self.config.dataset.mel_is_conditioned
-        self.composer_to_feature_token = config.composer_to_feature_token
-
-        if self.use_mel and not self.mel_is_conditioned:
-            self.composer_to_feature_token = DEFAULT_COMPOSERS
-
-        if self.use_mel:
-            self.spectrogram = LogMelSpectrogram()
-            if self.mel_is_conditioned:
-                n_dim = 512
-                composer_n_vocab = len(self.composer_to_feature_token)
-                embedding_offset = min(self.composer_to_feature_token.values())
-                self.mel_conditioner = ConcatEmbeddingToMel(
-                    embedding_offset=embedding_offset,
-                    n_vocab=composer_n_vocab,
-                    n_dim=n_dim,
-                )
-        else:
-            self.spectrogram = None
-
-        self.lr = config.training.lr
-
-    def forward(self, input_ids, labels):
-        """
-        Deprecated.
-        """
-        rt = self.transformer(input_ids=input_ids, labels=labels)
-        return rt
-
-    @torch.no_grad()
-    def single_inference(
-        self,
-        feature_tokens=None,
-        audio=None,
-        beatstep=None,
-        max_length=256,
-        max_batch_size=64,
-        n_bars=None,
-        composer_value=None,
-    ):
-        """
-        generate a long audio sequence
-
-        feature_tokens or audio : shape (time, )
-
-        beatstep : shape (time, )
-        - the beatstep values that the input_ids correspond to
-          (offset removed, i.e. beatstep[0] == 0)
-        - beatstep[-1] : the time at which the input_ids end
-          (i.e. beatstep[-1] == len(y)//sr)
-        """
-
-        assert feature_tokens is not None or audio is not None
-        assert beatstep is not None
-
-        if feature_tokens is not None:
-            assert len(feature_tokens.shape) == 1
-
-        if audio is not None:
-            assert len(audio.shape) == 1
-
-        config = self.config
-        PAD = self.t5config.pad_token_id
-        n_bars = config.dataset.n_bars if n_bars is None else n_bars
-
-        if beatstep[0] > 0.01:
-            print(
-                f"inference warning : beatstep[0] is not 0 ({beatstep[0]}). all beatstep will be shifted."
-            )
-            beatstep = beatstep - beatstep[0]
-
-        if self.use_mel:
-            input_ids = None
-            inputs_embeds, ext_beatstep = self.prepare_inference_mel(
-                audio,
-                beatstep,
-                n_bars=n_bars,
-                padding_value=PAD,
-                composer_value=composer_value,
-            )
-            batch_size = inputs_embeds.shape[0]
-        else:
-            raise NotImplementedError
-
-        # Considering GPU capacity, some sequence would not be generated at once.
-        relative_tokens = list()
-        for i in range(0, batch_size, max_batch_size):
-            start = i
-            end = min(batch_size, i + max_batch_size)
-
-            if input_ids is None:
-                _input_ids = None
-                _inputs_embeds = inputs_embeds[start:end]
-            else:
-                _input_ids = input_ids[start:end]
-                _inputs_embeds = None
-
-            _relative_tokens = self.transformer.generate(
-                input_ids=_input_ids,
-                inputs_embeds=_inputs_embeds,
-                max_length=max_length,
-            )
-            _relative_tokens = _relative_tokens.cpu().numpy()
-            relative_tokens.append(_relative_tokens)
-
-        max_length = max([rt.shape[-1] for rt in relative_tokens])
-        for i in range(len(relative_tokens)):
-            relative_tokens[i] = np.pad(
-                relative_tokens[i],
-                [(0, 0), (0, max_length - relative_tokens[i].shape[-1])],
-                constant_values=PAD,
-            )
-        relative_tokens = np.concatenate(relative_tokens)
-
-        pm, notes = self.tokenizer.relative_batch_tokens_to_midi(
-            relative_tokens,
-            beatstep=ext_beatstep,
-            bars_per_batch=n_bars,
-            cutoff_time_idx=(n_bars + 1) * 4,
-        )
-
-        return relative_tokens, notes, pm
-
-    def prepare_inference_mel(self, audio, beatstep, n_bars, padding_value, composer_value=None):
-        n_steps = n_bars * 4
-        n_target_step = len(beatstep)
-        sample_rate = self.config.dataset.sample_rate
-        ext_beatstep = extrapolate_beat_times(beatstep, (n_bars + 1) * 4 + 1)
-
-        def split_audio(audio):
-            # Split audio corresponding beat intervals.
-            # Each audio's lengths are different.
-            # Because each corresponding beat interval times are different.
-            batch = []
-
-            for i in range(0, n_target_step, n_steps):
-
-                start_idx = i
-                end_idx = min(i + n_steps, n_target_step)
-
-                start_sample = int(ext_beatstep[start_idx] * sample_rate)
-                end_sample = int(ext_beatstep[end_idx] * sample_rate)
-                feature = audio[start_sample:end_sample]
-                batch.append(feature)
-            return batch
-
-        def pad_and_stack_batch(batch):
-            batch = pad_sequence(batch, batch_first=True, padding_value=padding_value)
-            return batch
-
-        batch = split_audio(audio)
-        batch = pad_and_stack_batch(batch)
-
-        inputs_embeds = self.spectrogram(batch).transpose(-1, -2)
-        if self.mel_is_conditioned:
-            composer_value = torch.tensor(composer_value).to(self.device)
-            composer_value = composer_value.repeat(inputs_embeds.shape[0])
-            inputs_embeds = self.mel_conditioner(inputs_embeds, composer_value)
-        return inputs_embeds, ext_beatstep
-
-    @torch.no_grad()
-    def generate(
-        self,
-        audio_path=None,
-        composer=None,
-        model="generated",
-        steps_per_beat=2,
-        stereo_amp=0.5,
-        n_bars=2,
-        ignore_duplicate=True,
-        show_plot=False,
-        save_midi=False,
-        save_mix=False,
-        midi_path=None,
-        mix_path=None,
-        click_amp=0.2,
-        add_click=False,
-        max_batch_size=None,
-        beatsteps=None,
-        mix_sample_rate=None,
-        audio_y=None,
-        audio_sr=None,
-    ):
-        config = self.config
-        device = self.device
-
-        if audio_path is not None:
-            extension = os.path.splitext(audio_path)[1]
-            mix_path = (
-                audio_path.replace(extension, f".{model}.{composer}.wav")
-                if mix_path is None
-                else mix_path
-            )
-            midi_path = (
-                audio_path.replace(extension, f".{model}.{composer}.mid")
-                if midi_path is None
-                else midi_path
-            )
-
-        max_batch_size = 64 // n_bars if max_batch_size is None else max_batch_size
-        composer_to_feature_token = self.composer_to_feature_token
-
-        if composer is None:
-            composer = random.sample(list(composer_to_feature_token.keys()), 1)[0]
-
-        composer_value = composer_to_feature_token[composer]
-        mix_sample_rate = config.dataset.sample_rate if mix_sample_rate is None else mix_sample_rate
-
-        if not ignore_duplicate:
-            if os.path.exists(midi_path):
-                return
-
-        ESSENTIA_SAMPLERATE = 44100
-
-        if beatsteps is None:
-            y, sr = librosa.load(audio_path, sr=ESSENTIA_SAMPLERATE)
-            (
-                bpm,
-                beat_times,
-                confidence,
-                estimates,
-                essentia_beat_intervals,
-            ) = extract_rhythm(audio_path, y=y)
-            beat_times = np.array(beat_times)
-            beatsteps = interpolate_beat_times(beat_times, steps_per_beat, extend=True)
-        else:
-            y = None
-
-        if self.use_mel:
-            if audio_y is None and config.dataset.sample_rate != ESSENTIA_SAMPLERATE:
-                if y is not None:
-                    y = librosa.core.resample(
-                        y,
-                        orig_sr=ESSENTIA_SAMPLERATE,
-                        target_sr=config.dataset.sample_rate,
-                    )
-                    sr = config.dataset.sample_rate
-                else:
-                    y, sr = librosa.load(audio_path, sr=config.dataset.sample_rate)
-            elif audio_y is not None:
-                if audio_sr != config.dataset.sample_rate:
-                    audio_y = librosa.core.resample(
-                        audio_y, orig_sr=audio_sr, target_sr=config.dataset.sample_rate
-                    )
-                    audio_sr = config.dataset.sample_rate
-                y = audio_y
-                sr = audio_sr
-
-            start_sample = int(beatsteps[0] * sr)
-            end_sample = int(beatsteps[-1] * sr)
-            _audio = torch.from_numpy(y)[start_sample:end_sample].to(device)
-            fzs = None
-        else:
-            raise NotImplementedError
-
-        relative_tokens, notes, pm = self.single_inference(
-            feature_tokens=fzs,
-            audio=_audio,
-            beatstep=beatsteps - beatsteps[0],
-            max_length=config.dataset.target_length * max(1, (n_bars // config.dataset.n_bars)),
-            max_batch_size=max_batch_size,
-            n_bars=n_bars,
-            composer_value=composer_value,
-        )
-
-        for n in pm.instruments[0].notes:
-            n.start += beatsteps[0]
-            n.end += beatsteps[0]
-
-        if show_plot or save_mix:
-            if mix_sample_rate != sr:
-                y = librosa.core.resample(y, orig_sr=sr, target_sr=mix_sample_rate)
-                sr = mix_sample_rate
-            if add_click:
-                clicks = librosa.clicks(times=beatsteps, sr=sr, length=len(y)) * click_amp
-                y = y + clicks
-            pm_y = pm.fluidsynth(sr)
-            stereo = get_stereo(y, pm_y, pop_scale=stereo_amp)
-
-        if show_plot:
-            import note_seq
-
-            note_seq.plot_sequence(note_seq.midi_to_note_sequence(pm))
-
-        if save_mix:
-            sf.write(
-                file=mix_path,
-                data=stereo.T,
-                samplerate=sr,
-                format="wav",
-            )
-
-        if save_midi:
-            pm.write(midi_path)
-
-        return pm, composer, mix_path, midi_path

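With this wrapper removed, beat extraction, log-mel conditioning and token decoding are delegated to the Pop2Piano classes in transformers. A rough sketch of the replacement flow; the checkpoint id "sweetcocoa/pop2piano", the input file name and the composer tag are assumptions, not read from this diff:

import librosa
from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor

# Assumed checkpoint; the processor bundles the feature extractor (beat
# tracking + log-mel spectrogram) and the MIDI tokenizer.
model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano")
processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano")

audio, sr = librosa.load("song.mp3", sr=44100)  # any pop recording

# Roughly what prepare_inference_mel() assembled by hand above.
inputs = processor(audio=audio, sampling_rate=sr, return_tensors="pt")

# Composer conditioning replaces composer_to_feature_token / composer_value.
token_ids = model.generate(input_features=inputs["input_features"], composer="composer1")

# batch_decode() plays the role of relative_batch_tokens_to_midi(), returning
# pretty_midi objects that can be written straight to disk.
midi = processor.batch_decode(token_ids=token_ids, feature_extractor_output=inputs)[
    "pretty_midi_objects"
][0]
midi.write("output.mid")
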
utils/__init__.py
DELETED
File without changes
utils/dsp.py
DELETED
@@ -1,63 +0,0 @@
-import numpy as np
-from scipy.interpolate import interp1d
-
-
-def normalize(audio, min_y=-1.0, max_y=1.0, eps=1e-8):
-    assert len(audio.shape) == 1
-    max_y -= eps
-    min_y += eps
-    amax = audio.max()
-    amin = audio.min()
-    audio = (max_y - min_y) * (audio - amin) / (amax - amin) + min_y
-    return audio
-
-
-def get_stereo(pop_y, midi_y, pop_scale=0.99):
-    if len(pop_y) > len(midi_y):
-        midi_y = np.pad(midi_y, (0, len(pop_y) - len(midi_y)))
-    elif len(pop_y) < len(midi_y):
-        pop_y = np.pad(pop_y, (0, -len(pop_y) + len(midi_y)))
-    stereo = np.stack((midi_y, pop_y * pop_scale))
-    return stereo
-
-
-def generate_variable_f0_sine_wave(f0, len_y, sr):
-    """
-    integrate instant frequencies to get pure tone sine wave
-    """
-    x_sample = np.arange(len(f0))
-    intp = interp1d(x_sample, f0, kind="linear")
-    f0_audiorate = intp(np.linspace(0, len(f0) - 1, len_y))
-    pitch_wave = np.sin((np.nan_to_num(f0_audiorate) / sr * 2 * np.pi).cumsum())
-    return pitch_wave
-
-
-def fluidsynth_without_normalize(self, fs=44100, sf2_path=None):
-    """Synthesize using fluidsynth. without signal normalize
-    Parameters
-    ----------
-    fs : int
-        Sampling rate to synthesize at.
-    sf2_path : str
-        Path to a .sf2 file.
-        Default ``None``, which uses the TimGM6mb.sf2 file included with
-        ``pretty_midi``.
-    Returns
-    -------
-    synthesized : np.ndarray
-        Waveform of the MIDI data, synthesized at ``fs``.
-    """
-    # If there are no instruments, or all instruments have no notes, return
-    # an empty array
-    if len(self.instruments) == 0 or all(len(i.notes) == 0 for i in self.instruments):
-        return np.array([])
-    # Get synthesized waveform for each instrument
-    waveforms = [i.fluidsynth(fs=fs, sf2_path=sf2_path) for i in self.instruments]
-    # Allocate output waveform, with #sample = max length of all waveforms
-    synthesized = np.zeros(np.max([w.shape[0] for w in waveforms]))
-    # Sum all waveforms in
-    for waveform in waveforms:
-        synthesized[: waveform.shape[0]] += waveform
-    # Normalize
-    # synthesized /= np.abs(synthesized).max()
-    return synthesized

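The stereo preview that the deleted get_stereo() produced (synthesized piano on one channel, attenuated pop track on the other) can be rebuilt inline with numpy and soundfile. A small sketch under that assumption, with placeholder sine waves standing in for the real waveforms:

import numpy as np
import soundfile as sf

def stereo_mix(pop_y, piano_y, pop_scale=0.5):
    # Pad the shorter signal so both channels share the same length.
    n = max(len(pop_y), len(piano_y))
    pop_y = np.pad(pop_y, (0, n - len(pop_y)))
    piano_y = np.pad(piano_y, (0, n - len(piano_y)))
    # Channel 0: synthesized piano, channel 1: attenuated original pop track.
    return np.stack((piano_y, pop_y * pop_scale))

sr = 44100
t = np.linspace(0, 1.0, sr, endpoint=False)
pop = 0.1 * np.sin(2 * np.pi * 440.0 * t)                # placeholder pop audio
piano = 0.1 * np.sin(2 * np.pi * 523.25 * t)[: sr // 2]  # placeholder piano audio
sf.write("stereo_preview.wav", stereo_mix(pop, piano).T, sr)
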