import librosa
import numpy as np
from pathlib import Path
import json
import os.path
import sys
import argparse

'''
Compute transforms that need to be fitted on all of the data at once rather than sequentially (i.e. they do not implement the `partial_fit` function).
'''
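
# Supported values for --transforms (parsed below):
#   scaler        - sklearn StandardScaler fitted on all frames
#   pca_transform - full-rank sklearn PCA fitted on all frames
#   2moments      - mean and uncentered second-moment matrix of the features
#   2moments_ext  - as 2moments, after stacking every 3 consecutive frames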

THIS_DIR = os.path.dirname(os.path.abspath(__file__))
ROOT_DIR = os.path.abspath(os.path.join(THIS_DIR, os.pardir))
sys.path.append(ROOT_DIR)
from audio_feature_utils import extract_features_hybrid, extract_features_mel, extract_features_multi_mel
from utils import distribute_tasks
#from scripts.feature_extraction.utils import distribute_tasks

parser = argparse.ArgumentParser(description="Preprocess songs data")

parser.add_argument("data_path", type=str, help="Directory contining Beat Saber level folders")
parser.add_argument("--feature_name", metavar='', type=str, default="mel", help="mel, chroma, multi_mel")
parser.add_argument("--transforms", metavar='', type=str, default="scaler", help="comma-separated lists of transforms to extract (scaler,pca_transform)")
args = parser.parse_args()
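
# Example invocation (script name and data path are placeholders):
#   python compute_dataset_transforms.py /path/to/levels --feature_name mel --transforms scaler,pca_transform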

# makes the parsed arguments into global variables of the same name, used later in the code
globals().update(vars(args))
data_path = Path(data_path)

## distributing tasks across nodes ##
from mpi4py import MPI
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
print(rank)
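# the transforms below must see the whole dataset at once, so only a single MPI process is supported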
assert size == 1
candidate_files = sorted(data_path.glob('**/*'+feature_name+'.npy'), key=lambda path: path.parent.__str__())
tasks = range(len(candidate_files))

from sklearn import decomposition, preprocessing
# load every per-song feature file and stack them into a single array
feature_list = []
for i in tasks:
    feature_file = candidate_files[i].__str__()
    feature_list.append(np.load(feature_file))
features = np.concatenate(feature_list, 0)

import pickle
transforms = transforms.split(",")
for transform in transforms:
    if transform == "2moments":
        # if features are 3-dimensional, keep only the first slice along the second axis
        if len(features.shape) == 3:
            features = features[:,0,:]
        # mean and uncentered second-moment matrix of the features
        C = np.dot(features.T,features)/features.shape[0]
        m = np.mean(features,0)
        pickle.dump((m,C), open(data_path.joinpath(feature_name+'_2moments.pkl'), 'wb'))
    elif transform == "2moments_ext":
        if len(features.shape) == 3:
            features = features[:,0,:]
        if features.shape[0] % 3 != 0:
            features = features[:-(features.shape[0]%3)]
        features = np.reshape(features,(-1,3*features.shape[1]))
        C = np.dot(features.T,features)/features.shape[0]
        m = np.mean(features,0)
        pickle.dump((m,C), open(data_path.joinpath(feature_name+'_2moments_ext.pkl'), 'wb'))
    elif transform == "scaler":
        scaler = preprocessing.StandardScaler().fit(features)
        pickle.dump(scaler, open(data_path.joinpath(feature_name+'_scaler.pkl'), 'wb'))
    elif transform == "pca_transform":
        feature_size = features.shape[1]
        pca = decomposition.PCA(n_components=feature_size)
        pca_transform = pca.fit(features)
        pickle.dump(pca_transform, open(data_path.joinpath(feature_name+'_pca_transform.pkl'), 'wb'))
    else:
        raise NotImplementedError("Transform type "+transform+" not implemented")