sweetcocoa committed on
Commit
88490a8
1 Parent(s): e28a4f0

initial test

README.md CHANGED
@@ -1,12 +1,10 @@
  ---
- title: Pop2piano
- emoji: 🚀
- colorFrom: purple
- colorTo: yellow
+ title: Pop2Piano Demo
+ emoji: 🎹
+ colorFrom: black
+ colorTo: white
  sdk: streamlit
  sdk_version: 1.10.0
  app_file: app.py
- pinned: false
+ pinned: true
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,64 @@
+ import streamlit as st
+ import os
+ from transformer_wrapper import TransformerWrapper
+ from omegaconf import OmegaConf
+
+
+ @st.cache(show_spinner=False)
+ def get_file_content_as_string(path):
+     return open(path, "r", encoding="utf-8").read()
+
+
+ @st.cache(show_spinner=True)
+ def model_load():
+     config = OmegaConf.load("config.yaml")
+     wrapper = TransformerWrapper(config)
+     wrapper = wrapper.load_from_checkpoint(
+         "https://huggingface.co/sweetcocoa/pop2piano/raw/main/model-1999-val_0.67311615.ckpt",
+         config=config,
+     ).cuda()
+     model_id = "dpipqxiy"
+     wrapper.eval()
+     return wrapper, model_id, config
+
+
+ def main():
+     wrapper, model_id, config = model_load()
+     composers = list(config.composer_to_feature_token.keys())
+     dest_dir = "ytsamples"
+     composer = st.selectbox(label="Arranger", options=composers)
+     file_up = st.file_uploader("Upload an audio", type=["mp3", "wav"])
+
+     if st.button("convert"):
+         if file_up is not None:
+             bytes_data = file_up.getvalue()
+             target_file = f"{dest_dir}/{file_up.name}"
+             with open(target_file, "wb") as f:
+                 f.write(bytes_data)
+
+             with st.spinner("Wait for it..."):
+                 midi, arranger, mix_path, midi_path = wrapper.generate(
+                     audio_path=target_file,
+                     composer=composer,
+                     model=model_id,
+                     ignore_duplicate=True,
+                     show_plot=False,
+                     save_midi=True,
+                     save_mix=True,
+                 )
+
+             with open(midi_path, "rb") as midi_f:
+                 file_down = st.download_button(
+                     "Download midi",
+                     data=midi_f,
+                     file_name=os.path.basename(midi_path),
+                 )
+             with open(mix_path, "rb") as audio_f:
+                 st.audio(audio_f.read(), format="audio/wav")
+
+
+ if __name__ == "__main__":
+     main()
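
For reference, here is a minimal sketch of driving the same model from a plain Python script instead of Streamlit, mirroring what `app.py` above does. It is not part of the commit; the audio path and composer key are placeholders, and it assumes a CUDA device plus the dependencies from `requirements.txt` (essentia, librosa, etc.).

```python
# Sketch only: load the wrapper the same way app.py does and convert one file.
from omegaconf import OmegaConf
from transformer_wrapper import TransformerWrapper

config = OmegaConf.load("config.yaml")
wrapper = TransformerWrapper(config)
wrapper = wrapper.load_from_checkpoint(
    "https://huggingface.co/sweetcocoa/pop2piano/raw/main/model-1999-val_0.67311615.ckpt",
    config=config,
).cuda()
wrapper.eval()

# generate() returns the PrettyMIDI object, the composer actually used,
# and the paths where the stereo mix / midi were written.
pm, composer, mix_path, midi_path = wrapper.generate(
    audio_path="some_song.mp3",   # placeholder input file
    composer="composer1",         # any key of config.composer_to_feature_token
    save_midi=True,
    save_mix=True,
)
print(midi_path, mix_path)
```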
config.yaml ADDED
@@ -0,0 +1,61 @@
+ project: pop2piano
+ dataset:
+   target_length: 256
+   input_length: 1024
+   n_bars: 2
+   sample_rate: 22050
+   use_mel: true
+   mel_is_conditioned: true
+ composer_to_feature_token:
+   composer1: 2052
+   composer2: 2053
+   composer3: 2054
+   composer4: 2055
+   composer5: 2056
+   composer6: 2057
+   composer7: 2058
+   composer8: 2059
+   composer9: 2060
+   composer10: 2061
+   composer11: 2062
+   composer12: 2063
+   composer13: 2064
+   composer14: 2065
+   composer15: 2066
+   composer16: 2067
+   composer17: 2068
+   composer18: 2069
+   composer19: 2070
+   composer20: 2071
+   composer21: 2072
+ t5:
+   feed_forward_proj: gated-gelu
+   tie_word_embeddings: false
+   tie_encoder_decoder: false
+   vocab_size: 2400
+   n_positions: 1024
+   relative_attention_num_buckets: 32
+ tokenizer:
+   vocab_size:
+     special: 4
+     note: 128
+     velocity: 2
+     time: 100
+ training:
+   seed: 3407
+   resume: false
+   offline: false
+   num_gpu: 1
+   max_epochs: 5000
+   accumulate_grad_batches: 1
+   check_val_every_n_epoch: 20
+   find_lr: false
+   optimizer: adafactor
+   version: none
+   lr: 0.001
+   lr_min: 1.0e-06
+   lr_scheduler: false
+   lr_decay: 0.99
+   batch_size: 32
+   num_workers: 32
+   gradient_clip_val: 3.0
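
As a side note, the `tokenizer.vocab_size` block above fixes the token-id layout that `midi_tokenizer.py` (added below) works with: ids 0-3 are special tokens (PAD=0, EOS=1, TIE=2), 4-131 are note pitches, 132-133 are velocity off/on, and 134-233 are time shifts, while the `composer_to_feature_token` values (2052-2072) sit above that range inside the T5 vocab of 2400. A small worked example, not part of the commit:

```python
# Sketch: how config.yaml's vocab sizes map to token ids.
from omegaconf import OmegaConf
from midi_tokenizer import MidiTokenizer, TOKEN_NOTE, TOKEN_TIME

config = OmegaConf.load("config.yaml")
tokenizer = MidiTokenizer(config.tokenizer)

print(tokenizer.tokenize_note(60, TOKEN_NOTE))      # 4 + 60 = 64
print(tokenizer.tokenize_note(5, TOKEN_TIME))       # 4 + 128 + 2 + 5 = 139
print(tokenizer.detokenize(64, time_idx_offset=0))  # [TOKEN_NOTE, 60]
```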
layer/__init__.py ADDED
File without changes
layer/input.py ADDED
@@ -0,0 +1,46 @@
+ import torch
+ import torch.nn as nn
+ import torchaudio
+
+
+ class LogMelSpectrogram(nn.Module):
+     def __init__(self) -> None:
+         super().__init__()
+         self.melspectrogram = torchaudio.transforms.MelSpectrogram(
+             sample_rate=22050,
+             n_fft=4096,
+             hop_length=1024,
+             f_min=10.0,
+             n_mels=512,
+         )
+
+     def forward(self, x):
+         # x : audio (batch, sample)
+         # X : melspec (batch, freq, frame)
+         with torch.no_grad():
+             with torch.cuda.amp.autocast(enabled=False):
+                 X = self.melspectrogram(x)
+                 X = X.clamp(min=1e-6).log()
+
+         return X
+
+
+ class ConcatEmbeddingToMel(nn.Module):
+     def __init__(self, embedding_offset, n_vocab, n_dim) -> None:
+         super().__init__()
+         self.embedding = nn.Embedding(num_embeddings=n_vocab, embedding_dim=n_dim)
+         self.embedding_offset = embedding_offset
+
+     def forward(self, feature, index_value):
+         """
+         index_value : (batch, )
+         feature : (batch, time, feature_dim)
+         """
+         index_shifted = index_value - self.embedding_offset
+
+         # (batch, 1, feature_dim)
+         composer_embedding = self.embedding(index_shifted).unsqueeze(1)
+         # (batch, 1 + time, feature_dim)
+         inputs_embeds = torch.cat([composer_embedding, feature], dim=1)
+         return inputs_embeds
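
A quick shape sketch for these two layers (dummy tensors only, not part of the commit; the offset and vocab size follow `config.yaml` above, where composer ids start at 2052):

```python
import torch
from layer.input import LogMelSpectrogram, ConcatEmbeddingToMel

spec = LogMelSpectrogram()
audio = torch.randn(2, 22050 * 4)       # (batch, samples) at 22.05 kHz
mel = spec(audio)                       # (batch, n_mels=512, frames)
feature = mel.transpose(-1, -2)         # (batch, frames, 512), as TransformerWrapper feeds it

# embedding_offset=2052 maps composer ids 2052..2072 to embedding rows 0..20
conditioner = ConcatEmbeddingToMel(embedding_offset=2052, n_vocab=21, n_dim=512)
composer_ids = torch.tensor([2052, 2060])
inputs_embeds = conditioner(feature, composer_ids)
print(mel.shape, inputs_embeds.shape)   # (2, 512, F) and (2, 1 + F, 512)
```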
midi_tokenizer.py ADDED
@@ -0,0 +1,430 @@
1
+ import numpy as np
2
+ from numba import jit
3
+ import pretty_midi
4
+ import scipy.interpolate as interp
5
+
6
+ TOKEN_SPECIAL: int = 0
7
+ TOKEN_NOTE: int = 1
8
+ TOKEN_VELOCITY: int = 2
9
+ TOKEN_TIME: int = 3
10
+
11
+ DEFAULT_VELOCITY: int = 77
12
+
13
+ TIE: int = 2
14
+ EOS: int = 1
15
+ PAD: int = 0
16
+
17
+
18
+ def extrapolate_beat_times(beat_times, n_extend=1):
19
+ beat_times_function = interp.interp1d(
20
+ np.arange(beat_times.size),
21
+ beat_times,
22
+ bounds_error=False,
23
+ fill_value="extrapolate",
24
+ )
25
+
26
+ ext_beats = beat_times_function(
27
+ np.linspace(0, beat_times.size + n_extend - 1, beat_times.size + n_extend)
28
+ )
29
+
30
+ return ext_beats
31
+
32
+
33
+ @jit(nopython=True, cache=True)
34
+ def fast_tokenize(idx, token_type, n_special, n_note, n_velocity):
35
+ if token_type == TOKEN_TIME:
36
+ return n_special + n_note + n_velocity + idx
37
+ elif token_type == TOKEN_VELOCITY:
38
+ return n_special + n_note + idx
39
+ elif token_type == TOKEN_NOTE:
40
+ return n_special + idx
41
+ elif token_type == TOKEN_SPECIAL:
42
+ return idx
43
+ else:
44
+ return -1
45
+
46
+
47
+ @jit(nopython=True, cache=True)
48
+ def fast_detokenize(idx, n_special, n_note, n_velocity, time_idx_offset):
49
+ if idx >= n_special + n_note + n_velocity:
50
+ return (TOKEN_TIME, (idx - (n_special + n_note + n_velocity)) + time_idx_offset)
51
+ elif idx >= n_special + n_note:
52
+ return TOKEN_VELOCITY, idx - (n_special + n_note)
53
+ elif idx >= n_special:
54
+ return TOKEN_NOTE, idx - n_special
55
+ else:
56
+ return TOKEN_SPECIAL, idx
57
+
58
+
59
+ class MidiTokenizer:
60
+ def __init__(self, config) -> None:
61
+ self.config = config
62
+
63
+ def tokenize_note(self, idx, token_type):
64
+ rt = fast_tokenize(
65
+ idx,
66
+ token_type,
67
+ self.config.vocab_size.special,
68
+ self.config.vocab_size.note,
69
+ self.config.vocab_size.velocity,
70
+ )
71
+ if rt == -1:
72
+ raise ValueError(f"type {token_type} is not a predefined token type.")
73
+ else:
74
+ return rt
75
+
76
+ def notes_to_tokens(self, notes):
77
+ """
78
+ notes : (onset idx, offset idx, pitch, velocity)
79
+ """
80
+ max_time_idx = notes[:, :2].max()
81
+
82
+ times = [[] for i in range((max_time_idx + 1))]
83
+ for onset, offset, pitch, velocity in notes:
84
+ times[onset].append([pitch, velocity])
85
+ times[offset].append([pitch, 0])
86
+
87
+ tokens = []
88
+ current_velocity = 0
89
+ for i, time in enumerate(times):
90
+ if len(time) == 0:
91
+ continue
92
+ tokens.append(self.tokenize_note(i, TOKEN_TIME))
93
+ for pitch, velocity in time:
94
+ velocity = int(velocity > 0)
95
+ if current_velocity != velocity:
96
+ current_velocity = velocity
97
+ tokens.append(self.tokenize_note(velocity, TOKEN_VELOCITY))
98
+ tokens.append(self.tokenize_note(pitch, TOKEN_NOTE))
99
+
100
+ return np.array(tokens, dtype=int)
101
+
102
+ def detokenize(self, token, time_idx_offset):
103
+ type, value = fast_detokenize(
104
+ token,
105
+ n_special=self.config.vocab_size.special,
106
+ n_note=self.config.vocab_size.note,
107
+ n_velocity=self.config.vocab_size.velocity,
108
+ time_idx_offset=time_idx_offset,
109
+ )
110
+ if type != TOKEN_TIME:
111
+ value = int(value)
112
+ return [type, value]
113
+
114
+ def to_string(self, tokens, time_idx_offset=0):
115
+ nums = [
116
+ self.detokenize(token, time_idx_offset=time_idx_offset) for token in tokens
117
+ ]
118
+ strings = []
119
+ for i in range(len(nums)):
120
+ type = nums[i][0]
121
+ value = nums[i][1]
122
+
123
+ if type == TOKEN_TIME:
124
+ type = "time"
125
+ elif type == TOKEN_SPECIAL:
126
+ if value == EOS:
127
+ value = "EOS"
128
+ elif value == PAD:
129
+ value = "PAD"
130
+ elif value == TIE:
131
+ value = "TIE"
132
+ else:
133
+ value = "Unknown Special"
134
+ elif type == TOKEN_NOTE:
135
+ type = "note"
136
+ elif type == TOKEN_VELOCITY:
137
+ type = "velocity"
138
+ strings.append((type, value))
139
+ return strings
140
+
141
+ def split_notes(self, notes, beatsteps, time_from, time_to):
142
+ """
143
+ Assumptions
144
+ - notes are sorted by onset time
145
+ - beatsteps are sorted by time
146
+ """
147
+ start_idx = np.searchsorted(beatsteps, time_from)
148
+ start_note = np.searchsorted(notes[:, 0], start_idx)
149
+
150
+ end_idx = np.searchsorted(beatsteps, time_to)
151
+ end_note = np.searchsorted(notes[:, 0], end_idx)
152
+ splited_notes = notes[start_note:end_note]
153
+
154
+ return splited_notes, (start_idx, end_idx, start_note, end_note)
155
+
156
+ def notes_to_relative_tokens(
157
+ self, notes, offset_idx, add_eos=False, add_composer=False, composer_value=None
158
+ ):
159
+ """
160
+ notes : (onset idx, offset idx, pitch, velocity)
161
+ """
162
+
163
+ def _add_eos(tokens):
164
+ tokens = np.concatenate((tokens, np.array([EOS], dtype=tokens.dtype)))
165
+ return tokens
166
+
167
+ def _add_composer(tokens, composer_value):
168
+ tokens = np.concatenate(
169
+ (np.array([composer_value], dtype=tokens.dtype), tokens)
170
+ )
171
+ return tokens
172
+
173
+ if len(notes) == 0:
174
+ tokens = np.array([], dtype=int)
175
+ if add_eos:
176
+ tokens = _add_eos(tokens)
177
+ if add_composer:
178
+ tokens = _add_composer(tokens, composer_value=composer_value)
179
+ return tokens
180
+
181
+ max_time_idx = notes[:, :2].max()
182
+
183
+ # times[time_idx] = [[pitch, .. ], [pitch, 0], ..]
184
+ times = [[] for i in range((max_time_idx + 1 - offset_idx))]
185
+ for abs_onset, abs_offset, pitch, velocity in notes:
186
+ rel_onset = abs_onset - offset_idx
187
+ rel_offset = abs_offset - offset_idx
188
+ times[rel_onset].append([pitch, velocity])
189
+ times[rel_offset].append([pitch, 0])
190
+
191
+ # from here on, everything is relative to time 0 (the offset)
192
+ tokens = []
193
+ current_velocity = 0
194
+ current_time_idx = 0
195
+
196
+ for rel_idx, time in enumerate(times):
197
+ if len(time) == 0:
198
+ continue
199
+ time_idx_shift = rel_idx - current_time_idx
200
+ current_time_idx = rel_idx
201
+
202
+ tokens.append(self.tokenize_note(time_idx_shift, TOKEN_TIME))
203
+ for pitch, velocity in time:
204
+ velocity = int(velocity > 0)
205
+ if current_velocity != velocity:
206
+ current_velocity = velocity
207
+ tokens.append(self.tokenize_note(velocity, TOKEN_VELOCITY))
208
+ tokens.append(self.tokenize_note(pitch, TOKEN_NOTE))
209
+
210
+ tokens = np.array(tokens, dtype=int)
211
+ if add_eos:
212
+ tokens = _add_eos(tokens)
213
+ if add_composer:
214
+ tokens = _add_composer(tokens, composer_value=composer_value)
215
+ return tokens
216
+
217
+ def relative_batch_tokens_to_midi(
218
+ self,
219
+ tokens,
220
+ beatstep,
221
+ beat_offset_idx=None,
222
+ bars_per_batch=None,
223
+ cutoff_time_idx=None,
224
+ ):
225
+ """
226
+ tokens : (batch, sequence)
227
+ beatstep : (times, )
228
+ """
229
+ beat_offset_idx = 0 if beat_offset_idx is None else beat_offset_idx
230
+ notes = None
231
+ bars_per_batch = 2 if bars_per_batch is None else bars_per_batch
232
+
233
+ N = len(tokens)
234
+ for n in range(N):
235
+ _tokens = tokens[n]
236
+ _start_idx = beat_offset_idx + n * bars_per_batch * 4
237
+ _cutoff_time_idx = cutoff_time_idx + _start_idx
238
+ _notes = self.relative_tokens_to_notes(
239
+ _tokens,
240
+ start_idx=_start_idx,
241
+ cutoff_time_idx=_cutoff_time_idx,
242
+ )
243
+ # print(_notes, "\n-------")
244
+ if len(_notes) == 0:
245
+ pass
246
+ # print("_notes zero")
247
+ elif notes is None:
248
+ notes = _notes
249
+ else:
250
+ notes = np.concatenate((notes, _notes), axis=0)
251
+
252
+ if notes is None:
253
+ notes = []
254
+ midi = self.notes_to_midi(notes, beatstep, offset_sec=beatstep[beat_offset_idx])
255
+ return midi, notes
256
+
257
+ def relative_tokens_to_notes(self, tokens, start_idx, cutoff_time_idx=None):
258
+ # TODO remove legacy
259
+ # if the first decoded token is a composer (arranger) token, drop it
260
+ if tokens[0] >= sum(self.config.vocab_size.values()):
261
+ tokens = tokens[1:]
262
+
263
+ words = [self.detokenize(token, time_idx_offset=0) for token in tokens]
264
+
265
+ if hasattr(start_idx, "item"):
266
+ """
267
+ if numpy or torch tensor
268
+ """
269
+ start_idx = start_idx.item()
270
+
271
+ current_idx = start_idx
272
+ current_velocity = 0
273
+ note_onsets_ready = [None for i in range(self.config.vocab_size.note + 1)]
274
+ notes = []
275
+ for type, number in words:
276
+ if type == TOKEN_SPECIAL:
277
+ if number == EOS:
278
+ break
279
+ elif type == TOKEN_TIME:
280
+ current_idx += number
281
+ if cutoff_time_idx is not None:
282
+ current_idx = min(current_idx, cutoff_time_idx)
283
+
284
+ elif type == TOKEN_VELOCITY:
285
+ current_velocity = number
286
+ elif type == TOKEN_NOTE:
287
+ pitch = number
288
+ if current_velocity == 0:
289
+ # note_offset
290
+ if note_onsets_ready[pitch] is None:
291
+ # offset without onset
292
+ pass
293
+ else:
294
+ onset_idx = note_onsets_ready[pitch]
295
+ if onset_idx >= current_idx:
296
+ # No time shift after previous note_on
297
+ pass
298
+ else:
299
+ offset_idx = current_idx
300
+ notes.append(
301
+ [onset_idx, offset_idx, pitch, DEFAULT_VELOCITY]
302
+ )
303
+ note_onsets_ready[pitch] = None
304
+ else:
305
+ # note_on
306
+ if note_onsets_ready[pitch] is None:
307
+ note_onsets_ready[pitch] = current_idx
308
+ else:
309
+ # note-on already exists
310
+ onset_idx = note_onsets_ready[pitch]
311
+ if onset_idx >= current_idx:
312
+ # No time shift after previous note_on
313
+ pass
314
+ else:
315
+ offset_idx = current_idx
316
+ notes.append(
317
+ [onset_idx, offset_idx, pitch, DEFAULT_VELOCITY]
318
+ )
319
+ note_onsets_ready[pitch] = current_idx
320
+ else:
321
+ raise ValueError
322
+
323
+ for pitch, note_on in enumerate(note_onsets_ready):
324
+ # force offset if no offset for each pitch
325
+ if note_on is not None:
326
+ if cutoff_time_idx is None:
327
+ cutoff = note_on + 1
328
+ else:
329
+ cutoff = max(cutoff_time_idx, note_on + 1)
330
+
331
+ offset_idx = max(current_idx, cutoff)
332
+ notes.append([note_on, offset_idx, pitch, DEFAULT_VELOCITY])
333
+
334
+ if len(notes) == 0:
335
+ return []
336
+ else:
337
+ notes = np.array(notes)
338
+ note_order = notes[:, 0] * 128 + notes[:, 1]
339
+ notes = notes[note_order.argsort()]
340
+ return notes
341
+
342
+ def notes_to_midi(self, notes, beatstep, offset_sec=None):
343
+ new_pm = pretty_midi.PrettyMIDI(resolution=384, initial_tempo=120.0)
344
+ new_inst = pretty_midi.Instrument(program=0)
345
+ new_notes = []
346
+ if offset_sec is None:
347
+ offset_sec = 0.0
348
+
349
+ for onset_idx, offset_idx, pitch, velocity in notes:
350
+ new_note = pretty_midi.Note(
351
+ velocity=velocity,
352
+ pitch=pitch,
353
+ start=beatstep[onset_idx] - offset_sec,
354
+ end=beatstep[offset_idx] - offset_sec,
355
+ )
356
+ new_notes.append(new_note)
357
+ new_inst.notes = new_notes
358
+ new_pm.instruments.append(new_inst)
359
+ new_pm.remove_invalid_notes()
360
+ return new_pm
361
+
362
+
363
+ @jit(nopython=True, cache=False)
364
+ def fast_notes_to_relative_tokens(
365
+ notes, offset_idx, max_time_idx, n_special, n_note, n_velocity
366
+ ):
367
+ """
368
+ notes : (onset idx, offset idx, pitch, velocity)
369
+ """
370
+
371
+ times_p = [np.array([], dtype=int) for i in range((max_time_idx + 1 - offset_idx))]
372
+ times_v = [np.array([], dtype=int) for i in range((max_time_idx + 1 - offset_idx))]
373
+
374
+ for abs_onset, abs_offset, pitch, velocity in notes:
375
+ rel_onset = abs_onset - offset_idx
376
+ rel_offset = abs_offset - offset_idx
377
+ times_p[rel_onset] = np.append(times_p[rel_onset], pitch)
378
+ times_v[rel_onset] = np.append(times_v[rel_onset], velocity)
379
+ times_p[rel_offset] = np.append(times_p[rel_offset], pitch)
380
+ times_v[rel_offset] = np.append(times_v[rel_offset], velocity)
381
+
382
+ # from here on, everything is relative to time 0 (the offset)
383
+ tokens = []
384
+ current_velocity = np.array([0])
385
+ current_time_idx = np.array([0])
386
+
387
+ # the range may be 0, so iterate by index
388
+ for i in range(len(times_p)):
389
+ rel_idx = i
390
+ notes_at_time = times_p[i]
391
+ if len(notes_at_time) == 0:
392
+ continue
393
+
394
+ time_idx_shift = rel_idx - current_time_idx[0]
395
+ current_time_idx[0] = rel_idx
396
+
397
+ token = fast_tokenize(
398
+ time_idx_shift,
399
+ TOKEN_TIME,
400
+ n_special=n_special,
401
+ n_note=n_note,
402
+ n_velocity=n_velocity,
403
+ )
404
+ tokens.append(token)
405
+
406
+ for j in range(len(notes_at_time)):
407
+ pitch = notes_at_time[j]
408
+ velocity = times_v[i][j]
409
+ # for pitch, velocity in time:
410
+ velocity = int(velocity > 0)
411
+ if current_velocity[0] != velocity:
412
+ current_velocity[0] = velocity
413
+ token = fast_tokenize(
414
+ velocity,
415
+ TOKEN_VELOCITY,
416
+ n_special=n_special,
417
+ n_note=n_note,
418
+ n_velocity=n_velocity,
419
+ )
420
+ tokens.append(token)
421
+ token = fast_tokenize(
422
+ pitch,
423
+ TOKEN_NOTE,
424
+ n_special=n_special,
425
+ n_note=n_note,
426
+ n_velocity=n_velocity,
427
+ )
428
+ tokens.append(token)
429
+
430
+ return np.array(tokens)
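
A hedged round-trip sketch for `MidiTokenizer` (not part of the commit; the note values are made up and the vocab sizes come from `config.yaml`):

```python
import numpy as np
from omegaconf import OmegaConf
from midi_tokenizer import MidiTokenizer

config = OmegaConf.load("config.yaml")
tok = MidiTokenizer(config.tokenizer)

# notes: (onset beat-index, offset beat-index, pitch, velocity)
notes = np.array([[0, 2, 60, 80], [2, 4, 64, 80]])
tokens = tok.notes_to_relative_tokens(notes, offset_idx=0, add_eos=True)
print(tok.to_string(tokens))
# -> [('time', 0), ('velocity', 1), ('note', 60), ('time', 2), ('velocity', 0), ('note', 60), ...]

recovered = tok.relative_tokens_to_notes(tokens, start_idx=0)
print(recovered)  # onsets/offsets/pitches come back; velocities become DEFAULT_VELOCITY (77)
```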
packages.txt ADDED
@@ -0,0 +1 @@
+ fluidsynth
preprocess/README.md ADDED
@@ -0,0 +1,36 @@
+ # Preprocess Scripts
+ ---
+ - Note : the order of these scripts is IMPORTANT.
+ - The preprocessing steps themselves are easy, but setting up the environment is not. Please bear with it.
+ - If you encounter any problems, please do not hesitate to email me or open an issue on the github.
+
+ 1. Transcribe piano wavs to midi
+ - You should transcribe {piano_cover_file.wav} -> {piano_cover_file.mid}
+ - I recommend using the original code from this repo : [High-resolution Piano Transcription with Pedals by Regressing Onsets and Offsets Times](https://github.com/qiuqiangkong/piano_transcription_inference)
+
+ - Alternatively, you can also use my docker script.
+ ```bash
+ docker run -it --gpus all --rm -v /DIRECTORY_THAT_CONTAINS_PIANO_WAV/:/input -v /DIRECTORY_THAT_MIDI_OUTPUT/:/output jonghochoi/piano_transcribe:bytedance1
+ ```
+ - If you are using an RTX 30XX GPU or newer, this script may not work properly, because its pytorch version is too old (1.4).
+ - In that case, upgrade the pytorch version inside the docker image.
+
+ 2. Estimate the pop song's beats
+ ```bash
+ python bpm_quantize.py DATA_DIR
+ ```
+
+ 3. Synchronize the midi
+ ```bash
+ python pop_align.py DATA_DIR
+ ```
+
+ 4. Get the separated vocal track
+ ```bash
+ python split_spleeter.py DATA_DIR
+ ```
+
+ 5. Calculate melody chroma accuracy
+ ```bash
+ python melody_accuracy.py DATA_DIR
+ ```
preprocess/beat_quantizer.py ADDED
@@ -0,0 +1,111 @@
1
+ import copy
2
+ import librosa
3
+ import essentia
4
+ import essentia.standard
5
+ import numpy as np
6
+ import scipy.interpolate as interp
7
+ import note_seq
8
+
9
+ SAMPLERATE = 44100
10
+
11
+
12
+ def nearest_onset_offset_digitize(on, off, bins):
13
+ intermediate = (bins[1:] + bins[:-1]) / 2
14
+ on_idx = np.digitize(on, intermediate)
15
+ off_idx = np.digitize(off, intermediate)
16
+ off_idx[on_idx == off_idx] += 1
17
+ # off_idx = np.clip(off_idx, a_min=0, a_max=len(bins) - 1)
18
+ return on_idx, off_idx
19
+
20
+
21
+ def apply_sustain_pedal(pm):
22
+ ns = note_seq.midi_to_note_sequence(pm)
23
+ susns = note_seq.apply_sustain_control_changes(ns)
24
+ suspm = note_seq.note_sequence_to_pretty_midi(susns)
25
+ return suspm
26
+
27
+
28
+ def interpolate_beat_times(beat_times, steps_per_beat, extend=False):
29
+ beat_times_function = interp.interp1d(
30
+ np.arange(beat_times.size),
31
+ beat_times,
32
+ bounds_error=False,
33
+ fill_value="extrapolate",
34
+ )
35
+ if extend:
36
+ beat_steps_8th = beat_times_function(
37
+ np.linspace(0, beat_times.size, beat_times.size * steps_per_beat + 1)
38
+ )
39
+ else:
40
+ beat_steps_8th = beat_times_function(
41
+ np.linspace(0, beat_times.size - 1, beat_times.size * steps_per_beat - 1)
42
+ )
43
+ return beat_steps_8th
44
+
45
+
46
+ def midi_quantize_by_beats(
47
+ sample, beat_times, steps_per_beat, ignore_sustain_pedal=False
48
+ ):
49
+ ns = note_seq.midi_file_to_note_sequence(sample.midi)
50
+ if ignore_sustain_pedal:
51
+ susns = ns
52
+ else:
53
+ susns = note_seq.apply_sustain_control_changes(ns)
54
+
55
+ qns = copy.deepcopy(susns)
56
+
57
+ notes = np.array([[n.start_time, n.end_time] for n in susns.notes])
58
+ note_attributes = np.array([[n.pitch, n.velocity] for n in susns.notes])
59
+
60
+ note_ons = np.array(notes[:, 0])
61
+ note_offs = np.array(notes[:, 1])
62
+
63
+ beat_steps_8th = interpolate_beat_times(beat_times, steps_per_beat, extend=False)
64
+
65
+ on_idx, off_idx = nearest_onset_offset_digitize(note_ons, note_offs, beat_steps_8th)
66
+
67
+ beat_steps_8th = interpolate_beat_times(beat_times, steps_per_beat, extend=True)
68
+
69
+ discrete_notes = np.concatenate(
70
+ (np.stack((on_idx, off_idx), axis=1), note_attributes), axis=1
71
+ )
72
+
73
+ def delete_duplicate_notes(dnotes):
74
+ note_order = dnotes[:, 0] * 128 + dnotes[:, 2]
75
+ dnotes = dnotes[note_order.argsort()]
76
+ indices = []
77
+ for i in range(1, len(dnotes)):
78
+ if dnotes[i, 0] == dnotes[i - 1, 0] and dnotes[i, 2] == dnotes[i - 1, 2]:
79
+ indices.append(i)
80
+ dnotes = np.delete(dnotes, indices, axis=0)
81
+ note_order = dnotes[:, 0] * 128 + dnotes[:, 1]
82
+ dnotes = dnotes[note_order.argsort()]
83
+ return dnotes
84
+
85
+ discrete_notes = delete_duplicate_notes(discrete_notes)
86
+
87
+ digitized_note_ons, digitized_note_offs = (
88
+ beat_steps_8th[on_idx],
89
+ beat_steps_8th[off_idx],
90
+ )
91
+
92
+ for i, note in enumerate(qns.notes):
93
+ note.start_time = digitized_note_ons[i]
94
+ note.end_time = digitized_note_offs[i]
95
+
96
+ return qns, discrete_notes, beat_steps_8th
97
+
98
+
99
+ def extract_rhythm(song, y=None):
100
+ if y is None:
101
+ y, sr = librosa.load(song, sr=SAMPLERATE)
102
+
103
+ essentia_tracker = essentia.standard.RhythmExtractor2013(method="multifeature")
104
+ (
105
+ bpm,
106
+ beat_times,
107
+ confidence,
108
+ estimates,
109
+ essentia_beat_intervals,
110
+ ) = essentia_tracker(y)
111
+ return bpm, beat_times, confidence, estimates, essentia_beat_intervals
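
For context, these two helpers are chained the same way inside `transformer_wrapper.generate` (added later in this commit). A minimal usage sketch, assuming essentia and librosa are installed and `song.wav` is a placeholder path:

```python
import numpy as np
from preprocess.beat_quantizer import extract_rhythm, interpolate_beat_times

bpm, beat_times, confidence, estimates, intervals = extract_rhythm("song.wav")
beat_times = np.array(beat_times)

# steps_per_beat=2 gives an 8th-note grid; extend=True extrapolates one extra
# beat past the last detected beat so the final bar can still be sliced.
beatsteps = interpolate_beat_times(beat_times, steps_per_beat=2, extend=True)
print(bpm, beatsteps[:8])
```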
preprocess/bpm_quantize.py ADDED
@@ -0,0 +1,98 @@
1
+ import glob
2
+ import sys
3
+ import os
4
+
5
+
6
+ import librosa
7
+ import soundfile as sf
8
+ import numpy as np
9
+
10
+ import note_seq
11
+ from omegaconf import OmegaConf
12
+ from beat_quantizer import extract_rhythm, midi_quantize_by_beats
13
+
14
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
15
+ from midiaudiopair import MidiAudioPair
16
+ from utils.dsp import get_stereo
17
+
18
+
19
+ def estimate(meta_file, ignore_sustain_pedal):
20
+ sample = MidiAudioPair(meta_file)
21
+
22
+ if (
23
+ sample.error_code == MidiAudioPair.NO_PIANO
24
+ or sample.error_code == MidiAudioPair.NO_SONG_DIR
25
+ or sample.error_code == MidiAudioPair.NO_SONG
26
+ ):
27
+ return
28
+
29
+ bpm, beat_times, confidence, estimates, essentia_beat_intervals = extract_rhythm(sample.song)
30
+ beat_times = np.array(beat_times)
31
+ essentia_beat_intervals = np.array(essentia_beat_intervals)
32
+
33
+ qns, discrete_notes, beat_steps_8th = midi_quantize_by_beats(
34
+ sample, beat_times, 2, ignore_sustain_pedal=ignore_sustain_pedal
35
+ )
36
+
37
+ qpm = note_seq.note_sequence_to_pretty_midi(qns)
38
+ qpm.instruments[0].control_changes = []
39
+ qpm.write(sample.qmidi)
40
+ y, sr = librosa.load(sample.song, sr=None)
41
+ qpm_y = qpm.fluidsynth(sr)
42
+ qmix = get_stereo(y, qpm_y, 0.4)
43
+ sf.write(file=sample.qmix, data=qmix.T, samplerate=sr, format="flac")
44
+
45
+ meta = OmegaConf.load(meta_file)
46
+ meta.tempo = OmegaConf.create()
47
+ meta.tempo.bpm = bpm
48
+ meta.tempo.confidence = confidence
49
+ OmegaConf.save(meta, meta_file)
50
+
51
+ np.save(sample.notes, discrete_notes)
52
+ np.save(sample.beatstep, beat_steps_8th)
53
+ np.save(sample.beattime, beat_times)
54
+ np.save(sample.beatinterval, essentia_beat_intervals)
55
+
56
+
57
+ def main(meta_files, ignore_sustain_pedal):
58
+ from tqdm import tqdm
59
+ import multiprocessing
60
+ from joblib import Parallel, delayed
61
+
62
+ def files():
63
+ pbar = tqdm(meta_files)
64
+ for meta_file in pbar:
65
+ pbar.set_description(meta_file)
66
+ yield meta_file
67
+
68
+ Parallel(n_jobs=multiprocessing.cpu_count() // 2)(
69
+ delayed(estimate)(meta_file, ignore_sustain_pedal) for meta_file in files()
70
+ )
71
+
72
+
73
+ if __name__ == "__main__":
74
+ import argparse
75
+
76
+ parser = argparse.ArgumentParser(description="bpm estimate using essentia")
77
+
78
+ parser.add_argument(
79
+ "data_dir",
80
+ type=str,
81
+ default=None,
82
+ help="""directory contains {id}/{pop_filename.wav}
83
+ """,
84
+ )
85
+
86
+ parser.add_argument(
87
+ "--ignore_sustain_pedal",
88
+ default=False,
89
+ action="store_true",
90
+ help="whether to ignore the sustain pedal when quantizing",
91
+ )
92
+
93
+ args = parser.parse_args()
94
+
95
+ meta_files = sorted(glob.glob(args.data_dir + "/*.yaml"))
96
+ print("meta ", len(meta_files))
97
+
98
+ main(meta_files, args.ignore_sustain_pedal)
preprocess/melody_accuracy.py ADDED
@@ -0,0 +1,81 @@
1
+ import glob
2
+ import sys
3
+ import os
4
+
5
+ import librosa
6
+ import pretty_midi
7
+
8
+ from omegaconf import OmegaConf
9
+
10
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
11
+ from midiaudiopair import MidiAudioPair
12
+ from evaluate import midi_melody_accuracy as ma
13
+
14
+
15
+ def estimate(meta_file):
16
+
17
+ import warnings
18
+
19
+ warnings.filterwarnings(action="ignore")
20
+
21
+ sample = MidiAudioPair(meta_file)
22
+
23
+ if (
24
+ sample.error_code == MidiAudioPair.NO_PIANO
25
+ or sample.error_code == MidiAudioPair.NO_SONG_DIR
26
+ or sample.error_code == MidiAudioPair.NO_SONG
27
+ ):
28
+ return
29
+
30
+ if "vocals" in sample.invalids:
31
+ print("no vocal:", meta_file)
32
+ return
33
+
34
+ midi = pretty_midi.PrettyMIDI(sample.qmidi)
35
+ vocals, sr = librosa.load(sample.vocals, sr=44100)
36
+
37
+ chroma_accuracy, pitch_accuracy = ma.evaluate_melody(
38
+ midi, vocals, sr=sr, hop_length=1024
39
+ )
40
+ meta = OmegaConf.load(meta_file)
41
+ meta.eval = OmegaConf.create()
42
+ meta.eval.melody_chroma_accuracy = chroma_accuracy.item()
43
+ meta.eval.melody_pitch_accuracy = pitch_accuracy.item()
44
+ OmegaConf.save(meta, meta_file)
45
+
46
+
47
+ def main(meta_files):
48
+ from tqdm import tqdm
49
+ import multiprocessing
50
+ from joblib import Parallel, delayed
51
+
52
+ def files():
53
+ pbar = tqdm(meta_files)
54
+ for meta_file in pbar:
55
+ pbar.set_description(meta_file)
56
+ yield meta_file
57
+
58
+ Parallel(n_jobs=multiprocessing.cpu_count() // 2)(
59
+ delayed(estimate)(meta_file) for meta_file in files()
60
+ )
61
+
62
+
63
+ if __name__ == "__main__":
64
+ import argparse
65
+
66
+ parser = argparse.ArgumentParser(description="calculate melody chroma accuracy")
67
+
68
+ parser.add_argument(
69
+ "data_dir",
70
+ type=str,
71
+ default=None,
72
+ help="""directory contains {id}/{pop_filename.wav}
73
+ """,
74
+ )
75
+
76
+ args = parser.parse_args()
77
+
78
+ meta_files = sorted(glob.glob(args.data_dir + "/**/*.yaml", recursive=True))
79
+ print("meta ", len(meta_files))
80
+
81
+ main(meta_files)
preprocess/pop_align.py ADDED
@@ -0,0 +1,331 @@
1
+ import librosa
2
+ import soundfile as sf
3
+ import glob
4
+ import os
5
+ import copy
6
+ import sys
7
+
8
+ import numpy as np
9
+ import pyrubberband as pyrb
10
+ import pretty_midi
11
+ from omegaconf import OmegaConf
12
+ from tqdm.auto import tqdm
13
+
14
+ from synctoolbox.dtw.mrmsdtw import sync_via_mrmsdtw
15
+ from synctoolbox.dtw.utils import (
16
+ compute_optimal_chroma_shift,
17
+ shift_chroma_vectors,
18
+ make_path_strictly_monotonic,
19
+ )
20
+ from synctoolbox.feature.chroma import (
21
+ pitch_to_chroma,
22
+ quantize_chroma,
23
+ quantized_chroma_to_CENS,
24
+ )
25
+ from synctoolbox.feature.dlnco import pitch_onset_features_to_DLNCO
26
+ from synctoolbox.feature.pitch import audio_to_pitch_features
27
+ from synctoolbox.feature.pitch_onset import audio_to_pitch_onset_features
28
+ from synctoolbox.feature.utils import estimate_tuning
29
+
30
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
31
+ print(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
32
+ from utils.dsp import normalize, get_stereo
33
+ from midiaudiopair import MidiAudioPair
34
+
35
+ Fs = 22050
36
+ feature_rate = 50
37
+ step_weights = np.array([1.5, 1.5, 2.0])
38
+ threshold_rec = 10 ** 6
39
+
40
+
41
+ def save_delayed_song(
42
+ sample,
43
+ dry_run,
44
+ ):
45
+ import warnings
46
+
47
+ warnings.filterwarnings(action="ignore")
48
+
49
+ song_audio, _ = librosa.load(sample.original_song, Fs)
50
+ midi_pm = pretty_midi.PrettyMIDI(sample.original_midi)
51
+
52
+ if np.power(song_audio, 2).sum() < 1: # low energy: invalid file
53
+ print("invalid audio :", sample.original_song)
54
+ sample.delete_files_myself()
55
+ return
56
+
57
+ rd = get_aligned_results(midi_pm=midi_pm, song_audio=song_audio)
58
+
59
+ mix_song = rd["mix_song"]
60
+ song_pitch_shifted = rd["song_pitch_shifted"]
61
+ midi_warped_pm = rd["midi_warped_pm"]
62
+ pitch_shift_for_song_audio = rd["pitch_shift_for_song_audio"]
63
+ tuning_offset_song = rd["tuning_offset_song"]
64
+ tuning_offset_piano = rd["tuning_offset_piano"]
65
+
66
+ try:
67
+ if dry_run:
68
+ print("write audio files: ", sample.song)
69
+ else:
70
+ sf.write(
71
+ file=sample.song,
72
+ data=song_pitch_shifted,
73
+ samplerate=Fs,
74
+ format="wav",
75
+ )
76
+ except:
77
+ print("Fail : ", sample.song)
78
+
79
+ try:
80
+ if dry_run:
81
+ print("write warped midi :", sample.midi)
82
+ else:
83
+ midi_warped_pm.write(sample.midi)
84
+
85
+ except:
86
+ midi_warped_pm._tick_scales = midi_pm._tick_scales
87
+ try:
88
+ if dry_run:
89
+ print("write warped midi2 :", sample.midi)
90
+ else:
91
+ midi_warped_pm.write(sample.midi)
92
+
93
+ except:
94
+ print("ad-hoc failed midi : ", sample.midi)
95
+ print("ad-hoc midi : ", sample.midi)
96
+
97
+ sample.yaml.song.pitch_shift = pitch_shift_for_song_audio.item()
98
+ sample.yaml.song.tuning_offset = tuning_offset_song.item()
99
+ sample.yaml.piano.tuning_offset = tuning_offset_piano.item()
100
+ OmegaConf.save(sample.yaml, sample.yaml_path)
101
+
102
+
103
+ def get_aligned_results(midi_pm, song_audio):
104
+ piano_audio = midi_pm.fluidsynth(Fs)
105
+
106
+ song_audio = normalize(song_audio)
107
+
108
+ # The reason for estimating tuning ::
109
+ # https://www.audiolabs-erlangen.de/resources/MIR/FMP/C3/C3S1_TranspositionTuning.html
110
+ tuning_offset_1 = estimate_tuning(song_audio, Fs)
111
+ tuning_offset_2 = estimate_tuning(piano_audio, Fs)
112
+
113
+ # DLNCO features (Sebastian Ewert, Meinard Müller, and Peter Grosche: High Resolution Audio Synchronization Using Chroma Onset Features, In Proceedings of IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP): 1869–1872, 2009.):
114
+ # helpful to increase synchronization accuracy, especially for music with clear onsets.
115
+
116
+ # Quantized and smoothed chroma : CENS features
117
+ # Because, MrMsDTW Requires CENS.
118
+ f_chroma_quantized_1, f_DLNCO_1 = get_features_from_audio(
119
+ song_audio, tuning_offset_1
120
+ )
121
+ f_chroma_quantized_2, f_DLNCO_2 = get_features_from_audio(
122
+ piano_audio, tuning_offset_2
123
+ )
124
+
125
+ # Shift chroma vectors :
126
+ # Otherwise, different keys of two audio leads to degradation of alignment.
127
+ opt_chroma_shift = compute_optimal_chroma_shift(
128
+ quantized_chroma_to_CENS(f_chroma_quantized_1, 201, 50, feature_rate)[0],
129
+ quantized_chroma_to_CENS(f_chroma_quantized_2, 201, 50, feature_rate)[0],
130
+ )
131
+ f_chroma_quantized_2 = shift_chroma_vectors(f_chroma_quantized_2, opt_chroma_shift)
132
+ f_DLNCO_2 = shift_chroma_vectors(f_DLNCO_2, opt_chroma_shift)
133
+
134
+ wp = sync_via_mrmsdtw(
135
+ f_chroma1=f_chroma_quantized_1,
136
+ f_onset1=f_DLNCO_1,
137
+ f_chroma2=f_chroma_quantized_2,
138
+ f_onset2=f_DLNCO_2,
139
+ input_feature_rate=feature_rate,
140
+ step_weights=step_weights,
141
+ threshold_rec=threshold_rec,
142
+ verbose=False,
143
+ )
144
+
145
+ wp = make_path_strictly_monotonic(wp)
146
+ pitch_shift_for_song_audio = -opt_chroma_shift % 12
147
+ if pitch_shift_for_song_audio > 6:
148
+ pitch_shift_for_song_audio -= 12
149
+
150
+ if pitch_shift_for_song_audio != 0:
151
+ song_audio_shifted = pyrb.pitch_shift(
152
+ song_audio, Fs, pitch_shift_for_song_audio
153
+ )
154
+ else:
155
+ song_audio_shifted = song_audio
156
+
157
+ time_map_second = wp / feature_rate
158
+ midi_pm_warped = copy.deepcopy(midi_pm)
159
+
160
+ midi_pm_warped = simple_adjust_times(
161
+ midi_pm_warped, time_map_second[1], time_map_second[0]
162
+ )
163
+ piano_audio_warped = midi_pm_warped.fluidsynth(Fs)
164
+
165
+ song_audio_shifted = normalize(song_audio_shifted)
166
+ stereo_sonification_piano = get_stereo(song_audio_shifted, piano_audio_warped)
167
+
168
+ rd = dict(
169
+ mix_song=stereo_sonification_piano,
170
+ song_pitch_shifted=song_audio_shifted,
171
+ midi_warped_pm=midi_pm_warped,
172
+ pitch_shift_for_song_audio=pitch_shift_for_song_audio,
173
+ tuning_offset_song=tuning_offset_1,
174
+ tuning_offset_piano=tuning_offset_2,
175
+ )
176
+ return rd
177
+
178
+
179
+ def simple_adjust_times(pm, original_times, new_times):
180
+ """
181
+ most of these codes are from original pretty_midi
182
+ https://github.com/craffel/pretty-midi/blob/main/pretty_midi/pretty_midi.py
183
+ """
184
+ for instrument in pm.instruments:
185
+ instrument.notes = [
186
+ copy.deepcopy(note)
187
+ for note in instrument.notes
188
+ if note.start >= original_times[0] and note.end <= original_times[-1]
189
+ ]
190
+ # Get array of note-on locations and correct them
191
+ note_ons = np.array(
192
+ [note.start for instrument in pm.instruments for note in instrument.notes]
193
+ )
194
+ adjusted_note_ons = np.interp(note_ons, original_times, new_times)
195
+ # Same for note-offs
196
+ note_offs = np.array(
197
+ [note.end for instrument in pm.instruments for note in instrument.notes]
198
+ )
199
+ adjusted_note_offs = np.interp(note_offs, original_times, new_times)
200
+ # Correct notes
201
+ for n, note in enumerate(
202
+ [note for instrument in pm.instruments for note in instrument.notes]
203
+ ):
204
+ note.start = (adjusted_note_ons[n] > 0) * adjusted_note_ons[n]
205
+ note.end = (adjusted_note_offs[n] > 0) * adjusted_note_offs[n]
206
+ # After performing alignment, some notes may have an end time which is
207
+ # on or before the start time. Remove these!
208
+ pm.remove_invalid_notes()
209
+
210
+ def adjust_events(event_getter):
211
+ """This function calls event_getter with each instrument as the
212
+ sole argument and adjusts the events which are returned."""
213
+ # Sort the events by time
214
+ for instrument in pm.instruments:
215
+ event_getter(instrument).sort(key=lambda e: e.time)
216
+ # Correct the events by interpolating
217
+ event_times = np.array(
218
+ [
219
+ event.time
220
+ for instrument in pm.instruments
221
+ for event in event_getter(instrument)
222
+ ]
223
+ )
224
+ adjusted_event_times = np.interp(event_times, original_times, new_times)
225
+ for n, event in enumerate(
226
+ [
227
+ event
228
+ for instrument in pm.instruments
229
+ for event in event_getter(instrument)
230
+ ]
231
+ ):
232
+ event.time = adjusted_event_times[n]
233
+ for instrument in pm.instruments:
234
+ # We want to keep only the final event which has time ==
235
+ # new_times[0]
236
+ valid_events = [
237
+ event
238
+ for event in event_getter(instrument)
239
+ if event.time == new_times[0]
240
+ ]
241
+ if valid_events:
242
+ valid_events = valid_events[-1:]
243
+ # Otherwise only keep events within the new set of times
244
+ valid_events.extend(
245
+ event
246
+ for event in event_getter(instrument)
247
+ if event.time > new_times[0] and event.time < new_times[-1]
248
+ )
249
+ event_getter(instrument)[:] = valid_events
250
+
251
+ # Correct pitch bends and control changes
252
+ adjust_events(lambda i: i.pitch_bends)
253
+ adjust_events(lambda i: i.control_changes)
254
+
255
+ return pm
256
+
257
+
258
+ def get_features_from_audio(audio, tuning_offset, visualize=False):
259
+ f_pitch = audio_to_pitch_features(
260
+ f_audio=audio,
261
+ Fs=Fs,
262
+ tuning_offset=tuning_offset,
263
+ feature_rate=feature_rate,
264
+ verbose=visualize,
265
+ )
266
+ f_chroma = pitch_to_chroma(f_pitch=f_pitch)
267
+ f_chroma_quantized = quantize_chroma(f_chroma=f_chroma)
268
+
269
+ f_pitch_onset = audio_to_pitch_onset_features(
270
+ f_audio=audio, Fs=Fs, tuning_offset=tuning_offset, verbose=visualize
271
+ )
272
+ f_DLNCO = pitch_onset_features_to_DLNCO(
273
+ f_peaks=f_pitch_onset,
274
+ feature_rate=feature_rate,
275
+ feature_sequence_length=f_chroma_quantized.shape[1],
276
+ visualize=visualize,
277
+ )
278
+ return f_chroma_quantized, f_DLNCO
279
+
280
+
281
+ def main(samples, dry_run):
282
+ import multiprocessing
283
+ from joblib import Parallel, delayed
284
+
285
+ Parallel(n_jobs=multiprocessing.cpu_count() // 2)(
286
+ delayed(save_delayed_song)(sample=sample, dry_run=dry_run)
287
+ for sample in tqdm(samples)
288
+ )
289
+
290
+
291
+ if __name__ == "__main__":
292
+
293
+ import argparse
294
+
295
+ parser = argparse.ArgumentParser(description="align piano covers to pop songs")
296
+
297
+ parser.add_argument(
298
+ "data_dir",
299
+ type=str,
300
+ default=None,
301
+ help="""directory contains {id}/{song_filename.wav}
302
+ """,
303
+ )
304
+ parser.add_argument(
305
+ "--dry_run", default=False, action="store_true", help="whether dry_run"
306
+ )
307
+
308
+ args = parser.parse_args()
309
+
310
+ def getfiles():
311
+ meta_files = sorted(glob.glob(args.data_dir + "/*.yaml"))
312
+ print("meta ", len(meta_files))
313
+
314
+ samples = list()
315
+ for meta_file in tqdm(meta_files):
316
+ m = MidiAudioPair(meta_file, auto_remove_no_song=True)
317
+ if m.error_code != MidiAudioPair.NO_SONG:
318
+ aux_txt = os.path.join(
319
+ m.audio_dir,
320
+ m.yaml.piano.ytid,
321
+ f"{m.yaml.piano.title[:50]}___{m.yaml.song.title[:50]}.txt",
322
+ )
323
+ with open(aux_txt, "w") as f:
324
+ f.write(".")
325
+ samples.append(m)
326
+
327
+ print(f"files available {len(samples)}")
328
+ return samples
329
+
330
+ samples = getfiles()
331
+ main(samples=samples, dry_run=args.dry_run)
preprocess/split_spleeter.py ADDED
@@ -0,0 +1,72 @@
1
+ import glob
2
+ import os
3
+ import random
4
+ import sys
5
+
6
+ from tqdm.auto import tqdm
7
+
8
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
9
+ from midiaudiopair import MidiAudioPair
10
+
11
+
12
+ def split_spleeter(meta_files):
13
+ # Use audio loader explicitly for loading audio waveform :
14
+ from spleeter.audio.adapter import AudioAdapter
15
+ from spleeter.separator import Separator
16
+ import spleeter
17
+
18
+ sample_rate = 44100
19
+ audio_loader = AudioAdapter.default()
20
+
21
+ # Using embedded configuration.
22
+ separator = Separator("spleeter:2stems")
23
+
24
+ for meta_file in tqdm(meta_files):
25
+ sample = MidiAudioPair(meta_file)
26
+ if sample.error_code == MidiAudioPair.NO_SONG:
27
+ continue
28
+ if os.path.exists(sample.vocals):
29
+ continue
30
+
31
+ waveform, _ = audio_loader.load(sample.song, sample_rate=sample_rate)
32
+
33
+ # Perform the separation :
34
+ prediction = separator.separate(waveform)
35
+
36
+ audio_loader.save(
37
+ path=sample.vocals,
38
+ data=prediction["vocals"][:, 0:1],
39
+ codec=spleeter.audio.Codec.MP3,
40
+ sample_rate=sample_rate,
41
+ )
42
+
43
+
44
+ if __name__ == "__main__":
45
+ import argparse
46
+
47
+ parser = argparse.ArgumentParser(description="separate vocal tracks using spleeter")
48
+
49
+ parser.add_argument(
50
+ "data_dir",
51
+ type=str,
52
+ default=None,
53
+ help="""directory contains {id}/{pop_filename.wav}
54
+ """,
55
+ )
56
+
57
+ parser.add_argument(
58
+ "--random_order",
59
+ default=False,
60
+ action="store_true",
61
+ help="Random order process (to run multiple process)",
62
+ )
63
+
64
+ args = parser.parse_args()
65
+
66
+ meta_files = sorted(glob.glob(args.data_dir + "/*.yaml"))
67
+ if args.random_order:
68
+ random.shuffle(meta_files)
69
+
70
+ print("meta ", len(meta_files))
71
+
72
+ split_spleeter(meta_files)
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ pretty-midi==0.2.9
+ omegaconf==2.1.1
+ transformers==4.16.1
+ pytorch-lightning
+ essentia==2.1b6.dev609
+ note-seq==0.0.3
+ pyFluidSynth==1.3.0
+ torch
transformer_wrapper.py ADDED
@@ -0,0 +1,342 @@
1
+ import os
2
+ import random
3
+
4
+ import numpy as np
5
+ import librosa
6
+ import torch
7
+
8
+ import pytorch_lightning as pl
9
+ import soundfile as sf
10
+ from torch.nn.utils.rnn import pad_sequence
11
+ from transformers import T5Config, T5ForConditionalGeneration
12
+
13
+ from midi_tokenizer import MidiTokenizer, extrapolate_beat_times
14
+ from layer.input import LogMelSpectrogram, ConcatEmbeddingToMel
15
+ from preprocess.beat_quantizer import extract_rhythm, interpolate_beat_times
16
+ from utils.dsp import get_stereo
17
+
18
+
19
+ DEFAULT_COMPOSERS = {"various composer": 2052}
20
+
21
+
22
+ class TransformerWrapper(pl.LightningModule):
23
+ def __init__(self, config):
24
+ super().__init__()
25
+ self.config = config
26
+
27
+ self.tokenizer = MidiTokenizer(config.tokenizer)
28
+ self.t5config = T5Config.from_pretrained("t5-small")
29
+
30
+ for k, v in config.t5.items():
31
+ self.t5config.__setattr__(k, v)
32
+
33
+ self.transformer = T5ForConditionalGeneration(self.t5config)
34
+ self.use_mel = self.config.dataset.use_mel
35
+ self.mel_is_conditioned = self.config.dataset.mel_is_conditioned
36
+ self.composer_to_feature_token = config.composer_to_feature_token
37
+
38
+ if self.use_mel and not self.mel_is_conditioned:
39
+ self.composer_to_feature_token = DEFAULT_COMPOSERS
40
+
41
+ if self.use_mel:
42
+ self.spectrogram = LogMelSpectrogram()
43
+ if self.mel_is_conditioned:
44
+ n_dim = 512
45
+ composer_n_vocab = len(self.composer_to_feature_token)
46
+ embedding_offset = min(self.composer_to_feature_token.values())
47
+ self.mel_conditioner = ConcatEmbeddingToMel(
48
+ embedding_offset=embedding_offset,
49
+ n_vocab=composer_n_vocab,
50
+ n_dim=n_dim,
51
+ )
52
+ else:
53
+ self.spectrogram = None
54
+
55
+ self.lr = config.training.lr
56
+
57
+ def forward(self, input_ids, labels):
58
+ """
59
+ Deprecated.
60
+ """
61
+ rt = self.transformer(input_ids=input_ids, labels=labels)
62
+ return rt
63
+
64
+ @torch.no_grad()
65
+ def single_inference(
66
+ self,
67
+ feature_tokens=None,
68
+ audio=None,
69
+ beatstep=None,
70
+ max_length=256,
71
+ max_batch_size=64,
72
+ n_bars=None,
73
+ composer_value=None,
74
+ ):
75
+ """
76
+ generate a long audio sequence
77
+
78
+ feature_tokens or audio : shape (time, )
79
+
80
+ beatstep : shape (time, )
81
+ - the beatstep values that the input_ids correspond to
+ (offset removed, i.e. beatstep[0] == 0)
+ - beatstep[-1] : the time at which the input_ids end
+ (i.e. beatstep[-1] == len(y)//sr)
85
+ """
86
+
87
+ assert feature_tokens is not None or audio is not None
88
+ assert beatstep is not None
89
+
90
+ if feature_tokens is not None:
91
+ assert len(feature_tokens.shape) == 1
92
+
93
+ if audio is not None:
94
+ assert len(audio.shape) == 1
95
+
96
+ config = self.config
97
+ PAD = self.t5config.pad_token_id
98
+ n_bars = config.dataset.n_bars if n_bars is None else n_bars
99
+
100
+ if beatstep[0] > 0.01:
101
+ print(
102
+ f"inference warning : beatstep[0] is not 0 ({beatstep[0]}). all beatsteps will be shifted."
103
+ )
104
+ beatstep = beatstep - beatstep[0]
105
+
106
+ if self.use_mel:
107
+ input_ids = None
108
+ inputs_embeds, ext_beatstep = self.prepare_inference_mel(
109
+ audio,
110
+ beatstep,
111
+ n_bars=n_bars,
112
+ padding_value=PAD,
113
+ composer_value=composer_value,
114
+ )
115
+ batch_size = inputs_embeds.shape[0]
116
+ else:
117
+ raise NotImplementedError
118
+
119
+ # Considering GPU capacity, some sequence would not be generated at once.
120
+ relative_tokens = list()
121
+ for i in range(0, batch_size, max_batch_size):
122
+ start = i
123
+ end = min(batch_size, i + max_batch_size)
124
+
125
+ if input_ids is None:
126
+ _input_ids = None
127
+ _inputs_embeds = inputs_embeds[start:end]
128
+ else:
129
+ _input_ids = input_ids[start:end]
130
+ _inputs_embeds = None
131
+
132
+ _relative_tokens = self.transformer.generate(
133
+ input_ids=_input_ids,
134
+ inputs_embeds=_inputs_embeds,
135
+ max_length=max_length,
136
+ )
137
+ _relative_tokens = _relative_tokens.cpu().numpy()
138
+ relative_tokens.append(_relative_tokens)
139
+
140
+ max_length = max([rt.shape[-1] for rt in relative_tokens])
141
+ for i in range(len(relative_tokens)):
142
+ relative_tokens[i] = np.pad(
143
+ relative_tokens[i],
144
+ [(0, 0), (0, max_length - relative_tokens[i].shape[-1])],
145
+ constant_values=PAD,
146
+ )
147
+ relative_tokens = np.concatenate(relative_tokens)
148
+
149
+ pm, notes = self.tokenizer.relative_batch_tokens_to_midi(
150
+ relative_tokens,
151
+ beatstep=ext_beatstep,
152
+ bars_per_batch=n_bars,
153
+ cutoff_time_idx=(n_bars + 1) * 4,
154
+ )
155
+
156
+ return relative_tokens, notes, pm
157
+
158
+ def prepare_inference_mel(
159
+ self, audio, beatstep, n_bars, padding_value, composer_value=None
160
+ ):
161
+ n_steps = n_bars * 4
162
+ n_target_step = len(beatstep)
163
+ sample_rate = self.config.dataset.sample_rate
164
+ ext_beatstep = extrapolate_beat_times(beatstep, (n_bars + 1) * 4 + 1)
165
+
166
+ def split_audio(audio):
167
+ # Split audio corresponding beat intervals.
168
+ # Each audio's lengths are different.
169
+ # Because each corresponding beat interval times are different.
170
+ batch = []
171
+
172
+ for i in range(0, n_target_step, n_steps):
173
+
174
+ start_idx = i
175
+ end_idx = min(i + n_steps, n_target_step)
176
+
177
+ start_sample = int(ext_beatstep[start_idx] * sample_rate)
178
+ end_sample = int(ext_beatstep[end_idx] * sample_rate)
179
+ feature = audio[start_sample:end_sample]
180
+ batch.append(feature)
181
+ return batch
182
+
183
+ def pad_and_stack_batch(batch):
184
+ batch = pad_sequence(batch, batch_first=True, padding_value=padding_value)
185
+ return batch
186
+
187
+ batch = split_audio(audio)
188
+ batch = pad_and_stack_batch(batch)
189
+
190
+ inputs_embeds = self.spectrogram(batch).transpose(-1, -2)
191
+ if self.mel_is_conditioned:
192
+ composer_value = torch.tensor(composer_value).to(self.device)
193
+ composer_value = composer_value.repeat(inputs_embeds.shape[0])
194
+ inputs_embeds = self.mel_conditioner(inputs_embeds, composer_value)
195
+ return inputs_embeds, ext_beatstep
196
+
197
+ @torch.no_grad()
198
+ def generate(
199
+ self,
200
+ audio_path=None,
201
+ composer=None,
202
+ model="generated",
203
+ steps_per_beat=2,
204
+ stereo_amp=0.5,
205
+ n_bars=2,
206
+ ignore_duplicate=True,
207
+ show_plot=False,
208
+ save_midi=False,
209
+ save_mix=False,
210
+ midi_path=None,
211
+ mix_path=None,
212
+ click_amp=0.2,
213
+ add_click=False,
214
+ max_batch_size=None,
215
+ beatsteps=None,
216
+ mix_sample_rate=None,
217
+ audio_y=None,
218
+ audio_sr=None,
219
+ ):
220
+ config = self.config
221
+ device = self.device
222
+
223
+ if audio_path is not None:
224
+ extension = os.path.splitext(audio_path)[1]
225
+ mix_path = (
226
+ audio_path.replace(extension, f".{model}.{composer}.wav")
227
+ if mix_path is None
228
+ else mix_path
229
+ )
230
+ midi_path = (
231
+ audio_path.replace(extension, f".{model}.{composer}.mid")
232
+ if midi_path is None
233
+ else midi_path
234
+ )
235
+
236
+ max_batch_size = 64 // n_bars if max_batch_size is None else max_batch_size
237
+ composer_to_feature_token = self.composer_to_feature_token
238
+
239
+ if composer is None:
240
+ composer = random.sample(list(composer_to_feature_token.keys()), 1)[0]
241
+
242
+ composer_value = composer_to_feature_token[composer]
243
+ mix_sample_rate = (
244
+ config.dataset.sample_rate if mix_sample_rate is None else mix_sample_rate
245
+ )
246
+
247
+ if not ignore_duplicate:
248
+ if os.path.exists(midi_path):
249
+ return
250
+
251
+ ESSENTIA_SAMPLERATE = 44100
252
+
253
+ if beatsteps is None:
254
+ y, sr = librosa.load(audio_path, sr=ESSENTIA_SAMPLERATE)
255
+ (
256
+ bpm,
257
+ beat_times,
258
+ confidence,
259
+ estimates,
260
+ essentia_beat_intervals,
261
+ ) = extract_rhythm(audio_path, y=y)
262
+ beat_times = np.array(beat_times)
263
+ beatsteps = interpolate_beat_times(beat_times, steps_per_beat, extend=True)
264
+ else:
265
+ y = None
266
+
267
+ if self.use_mel:
268
+ if audio_y is None and config.dataset.sample_rate != ESSENTIA_SAMPLERATE:
269
+ if y is not None:
270
+ y = librosa.core.resample(
271
+ y,
272
+ orig_sr=ESSENTIA_SAMPLERATE,
273
+ target_sr=config.dataset.sample_rate,
274
+ )
275
+ sr = config.dataset.sample_rate
276
+ else:
277
+ y, sr = librosa.load(audio_path, sr=config.dataset.sample_rate)
278
+ elif audio_y is not None:
279
+ if audio_sr != config.dataset.sample_rate:
280
+ audio_y = librosa.core.resample(
281
+ audio_y, orig_sr=audio_sr, target_sr=config.dataset.sample_rate
282
+ )
283
+ audio_sr = config.dataset.sample_rate
284
+ y = audio_y
285
+ sr = audio_sr
286
+
287
+ start_sample = int(beatsteps[0] * sr)
288
+ end_sample = int(beatsteps[-1] * sr)
289
+ _audio = torch.from_numpy(y)[start_sample:end_sample].to(device)
290
+ fzs = None
291
+ else:
292
+ raise NotImplementedError
293
+
294
+ relative_tokens, notes, pm = self.single_inference(
295
+ feature_tokens=fzs,
296
+ audio=_audio,
297
+ beatstep=beatsteps - beatsteps[0],
298
+ max_length=config.dataset.target_length
299
+ * max(1, (n_bars // config.dataset.n_bars)),
300
+ max_batch_size=max_batch_size,
301
+ n_bars=n_bars,
302
+ composer_value=composer_value,
303
+ )
304
+
305
+ for n in pm.instruments[0].notes:
306
+ n.start += beatsteps[0]
307
+ n.end += beatsteps[0]
308
+
309
+ if show_plot or save_mix:
310
+ if mix_sample_rate != sr:
311
+ y = librosa.core.resample(y, orig_sr=sr, target_sr=mix_sample_rate)
312
+ sr = mix_sample_rate
313
+ if add_click:
314
+ clicks = (
315
+ librosa.clicks(times=beatsteps, sr=sr, length=len(y)) * click_amp
316
+ )
317
+ y = y + clicks
318
+ pm_y = pm.fluidsynth(sr)
319
+ stereo = get_stereo(y, pm_y, pop_scale=stereo_amp)
320
+
321
+ if show_plot:
322
+ import IPython.display as ipd
323
+ from IPython.display import display
324
+ import note_seq
325
+
326
+ display("Stereo MIX", ipd.Audio(stereo, rate=sr))
327
+ display("Rendered MIDI", ipd.Audio(pm_y, rate=sr))
328
+ display("Original Song", ipd.Audio(y, rate=sr))
329
+ display(note_seq.plot_sequence(note_seq.midi_to_note_sequence(pm)))
330
+
331
+ if save_mix:
332
+ sf.write(
333
+ file=mix_path,
334
+ data=stereo.T,
335
+ samplerate=sr,
336
+ format="wav",
337
+ )
338
+
339
+ if save_midi:
340
+ pm.write(midi_path)
341
+
342
+ return pm, composer, mix_path, midi_path
utils/__init__.py ADDED
File without changes
utils/dsp.py ADDED
@@ -0,0 +1,63 @@
+ import numpy as np
+ from scipy.interpolate import interp1d
+
+
+ def normalize(audio, min_y=-1.0, max_y=1.0, eps=1e-8):
+     assert len(audio.shape) == 1
+     max_y -= eps
+     min_y += eps
+     amax = audio.max()
+     amin = audio.min()
+     audio = (max_y - min_y) * (audio - amin) / (amax - amin) + min_y
+     return audio
+
+
+ def get_stereo(pop_y, midi_y, pop_scale=0.99):
+     if len(pop_y) > len(midi_y):
+         midi_y = np.pad(midi_y, (0, len(pop_y) - len(midi_y)))
+     elif len(pop_y) < len(midi_y):
+         pop_y = np.pad(pop_y, (0, -len(pop_y) + len(midi_y)))
+     stereo = np.stack((midi_y, pop_y * pop_scale))
+     return stereo
+
+
+ def generate_variable_f0_sine_wave(f0, len_y, sr):
+     """
+     integrate instant frequencies to get a pure-tone sine wave
+     """
+     x_sample = np.arange(len(f0))
+     intp = interp1d(x_sample, f0, kind="linear")
+     f0_audiorate = intp(np.linspace(0, len(f0) - 1, len_y))
+     pitch_wave = np.sin((np.nan_to_num(f0_audiorate) / sr * 2 * np.pi).cumsum())
+     return pitch_wave
+
+
+ def fluidsynth_without_normalize(self, fs=44100, sf2_path=None):
+     """Synthesize using fluidsynth, without signal normalization.
+     Parameters
+     ----------
+     fs : int
+         Sampling rate to synthesize at.
+     sf2_path : str
+         Path to a .sf2 file.
+         Default ``None``, which uses the TimGM6mb.sf2 file included with
+         ``pretty_midi``.
+     Returns
+     -------
+     synthesized : np.ndarray
+         Waveform of the MIDI data, synthesized at ``fs``.
+     """
+     # If there are no instruments, or all instruments have no notes, return
+     # an empty array
+     if len(self.instruments) == 0 or all(len(i.notes) == 0 for i in self.instruments):
+         return np.array([])
+     # Get synthesized waveform for each instrument
+     waveforms = [i.fluidsynth(fs=fs, sf2_path=sf2_path) for i in self.instruments]
+     # Allocate output waveform, with #sample = max length of all waveforms
+     synthesized = np.zeros(np.max([w.shape[0] for w in waveforms]))
+     # Sum all waveforms in
+     for waveform in waveforms:
+         synthesized[: waveform.shape[0]] += waveform
+     # Normalize
+     # synthesized /= np.abs(synthesized).max()
+     return synthesized
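
Finally, a tiny usage sketch for `normalize` and `get_stereo` (dummy signals, not part of the commit):

```python
import numpy as np
from utils.dsp import get_stereo, normalize

sr = 22050
pop_y = normalize(np.random.randn(sr * 3))   # 3 s of "pop" audio, scaled to [-1, 1]
midi_y = np.random.randn(sr * 2)             # 2 s of rendered piano

stereo = get_stereo(pop_y, midi_y, pop_scale=0.5)
print(stereo.shape)  # (2, sr * 3): channel 0 = piano (zero-padded), channel 1 = scaled pop
# soundfile expects (frames, channels), hence the stereo.T writes used elsewhere in this commit
```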