File size: 2,777 Bytes
2d5fdd1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import librosa
import numpy as np
from pathlib import Path
import json
import os.path
import sys
import argparse
import pickle

THIS_DIR = os.path.dirname(os.path.abspath(__file__))
ROOT_DIR = os.path.abspath(os.path.join(os.path.join(THIS_DIR, os.pardir), os.pardir))
DATA_DIR = os.path.join(ROOT_DIR, 'data')
EXTRACT_DIR = os.path.join(DATA_DIR, 'extracted_data')
if not os.path.isdir(DATA_DIR):
    os.mkdir(DATA_DIR)
if not os.path.isdir(EXTRACT_DIR):
    os.mkdir(EXTRACT_DIR)
sys.path.append(ROOT_DIR)
from audio_feature_utils import extract_features_hybrid, extract_features_mel, extract_features_multi_mel
from utils import distribute_tasks

parser = argparse.ArgumentParser(description="Preprocess songs data")

parser.add_argument("data_path", type=str, help="features path")
parser.add_argument("--feature_name", metavar='', type=str, default="mel", help="coma separated list of names of features to combine")
parser.add_argument("--transform_name", metavar='', type=str, default="scaler", help="pca_transform,scaler")
parser.add_argument("--pca_dims", metavar='', type=int, default=2, help="number of pca dimensions to keep, if applying pca transform")
parser.add_argument("--keep_feature_name", action="store_true")
parser.add_argument("--new_feature_name", metavar='', type=str, default=None)
parser.add_argument("--replace_existing", action="store_true")
args = parser.parse_args()

# makes arugments into global variables of the same name, used later in the code
globals().update(vars(args))
data_path = Path(data_path)

## distributing tasks accross nodes ##
from mpi4py import MPI
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
print(rank)

#assuming mp3 for now. TODO: generalize
candidate_files = sorted(data_path.glob('**/*'+feature_name+'.npy'), key=lambda path: path.parent.__str__())
tasks = distribute_tasks(candidate_files,rank,size)

for i in tasks:
    path = candidate_files[i]
    print(path)
    feature_file = path.__str__()
    if new_feature_name is None:
        if keep_feature_name:
            new_feature_name = feature_name
        else:
            new_feature_name = feature_name+"_applied_"+transform_name
    base_filename = feature_file[:-(len(feature_name)+4)]
    new_feature_file = base_filename+new_feature_name+".npy"
    if replace_existing or not os.path.isfile(new_feature_file):
        features = np.load(feature_file)
        transform = pickle.load(open(data_path.joinpath(feature_name+'_'+transform_name+'.pkl'), "rb"))
        pickle.dump(transform, open(data_path.joinpath(new_feature_name+'_scaler.pkl'), "wb"))
        features = transform.transform(features)
        if transform_name == "pca_transform":
            features = features[:,:pca_dims]
        np.save(new_feature_file,features)