Spaces:
Running
Running
import pandas as pd | |
import argparse | |
import pickle | |
import yaml | |
from sklearn.preprocessing import LabelEncoder | |
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder | |
import numpy as np | |
def _make_onehot_encoder():
    """Return a OneHotEncoder that produces dense arrays.

    Newer scikit-learn releases renamed the ``sparse`` keyword to
    ``sparse_output`` and later removed the old spelling, so try the new
    name first and fall back to the original one.
    """
    try:
        return OneHotEncoder(sparse_output=False)
    except TypeError:
        # Older scikit-learn: only the original keyword is accepted.
        return OneHotEncoder(sparse=False)


def _onehot_column_names(encoder):
    """Return the output column names of a fitted OneHotEncoder.

    ``get_feature_names`` is used whenever it exists so the generated names
    stay exactly what this script has always produced; releases that removed
    it only offer ``get_feature_names_out`` (whose names may be derived from
    the input column names instead of ``x0``, ``x1``, ...).
    """
    if hasattr(encoder, 'get_feature_names'):
        return encoder.get_feature_names()
    return encoder.get_feature_names_out()


def _encode_group(group, encoding, onehot_encoder, ordinal_encoder):
    """Encode one group of columns and keep the row index of ``group``.

    ``encoding == 'ordinal'`` selects the ordinal encoder; any other value
    selects one-hot encoding.
    """
    if encoding == 'ordinal':
        encoded = ordinal_encoder.fit_transform(group)
        columns = group.columns
    else:  # one-hot
        encoded = onehot_encoder.fit_transform(group)
        columns = _onehot_column_names(onehot_encoder)
    # Re-attach the original index: the encoders return bare arrays, and a
    # default RangeIndex would make the later column-wise concat pair encoded
    # values with the wrong rows whenever the input index is not 0..n-1 in
    # order (e.g. after the frame has been sorted by another column).
    return pd.DataFrame(data=encoded, columns=columns, index=group.index)


def featurise_features(featdf, params):
    """Re-encode the round, player-count and map columns of the feature table.

    The frame is split by column position: column 0 (``game``), columns 1-6
    (round data), 7-16 (bonus tiles), 17 (player data), 18-24 (colours) and
    the last column (map data). Round, player and map groups are encoded
    according to ``params['prepare-step2']``; the other groups pass through
    unchanged.

    Args:
        featdf: Feature DataFrame laid out in the column order described
            above.
        params: Parsed parameters; ``params['prepare-step2']`` must contain
            ``'round-features'``, ``'playercount-features'`` and
            ``'map-features'``. The value ``'ordinal'`` selects ordinal
            encoding, anything else selects one-hot encoding.

    Returns:
        A DataFrame with the groups concatenated column-wise in the order
        game, round, bonus tiles, player, colours, map, on the same row index
        as ``featdf``.
    """
    # adjust features dataset for chosen encoding
    game = featdf.iloc[:, :1]
    rounddata = featdf.iloc[:, 1:7]
    bontiles = featdf.iloc[:, 7:17]
    playerdata = featdf.iloc[:, 17:18]
    colours = featdf.iloc[:, 18:25]
    mapdata = featdf.iloc[:, -1:]

    # One encoder of each kind is reused; it is refitted for every group.
    onehot_encoder = _make_onehot_encoder()
    ordinal_encoder = OrdinalEncoder()
    settings = params['prepare-step2']

    rounddata = _encode_group(
        rounddata, settings['round-features'], onehot_encoder, ordinal_encoder)
    playerdata = _encode_group(
        playerdata, settings['playercount-features'], onehot_encoder, ordinal_encoder)
    mapdata = _encode_group(
        mapdata, settings['map-features'], onehot_encoder, ordinal_encoder)

    return pd.concat(
        [game, rounddata, bontiles, playerdata, colours, mapdata], axis=1)
def main(params):
    """Build one (victory points, features) dataset per faction and pickle them.

    Reads the victory-point CSV and the feature CSV named in ``params``,
    encodes the features with ``featurise_features`` and, for every faction
    column of the victory-point table, keeps only the rows where that faction
    has a value. The result is a dict ``{faction: {'vp': Series,
    'features': DataFrame}}``, both indexed by ``game``, written with
    ``pickle`` to the configured output path.

    Args:
        params: Parsed parameters. Uses ``params['prepare']['vp-data-dir']``,
            ``params['prepare']['feature-data-dir']`` and
            ``params['prepare-step2']['pickle-dir']`` (despite the ``-dir``
            suffix these are used as file paths), plus whatever
            ``featurise_features`` reads.
    """
    vp_path = params['prepare']['vp-data-dir']
    feat_path = params['prepare']['feature-data-dir']
    pickle_path = params['prepare-step2']['pickle-dir']

    vpdf = pd.read_csv(vp_path).sort_values('game')
    featdf = pd.read_csv(feat_path).sort_values('game')
    # Drop the saved index column if the CSV has one; tolerate its absence,
    # as the faction filter below already does for the victory-point table.
    featdf = featdf.drop(columns=['Unnamed: 0'], errors='ignore')
    featdf = featurise_features(featdf, params)

    # Both tables are put back in index order once, up front; this does not
    # depend on the faction, so it need not be repeated inside the loop.
    vpdf = vpdf.sort_index()
    featdf = featdf.sort_index()

    factions = [col for col in vpdf.columns if col not in ('game', 'Unnamed: 0')]

    each_faction_dataset = {}
    for faction in factions:
        # Rows where this faction has a victory-point value.
        has_vp = ~pd.isnull(vpdf[faction])
        vpdata = pd.Series(index=vpdf['game'][has_vp], data=vpdf[faction][has_vp].values)
        # NOTE(review): the mask comes from vpdf but filters featdf by index
        # label, so both CSVs are assumed to list games in the same row order
        # - confirm against the step that writes them.
        featdata = featdf[has_vp].set_index('game')
        each_faction_dataset[faction] = {'vp': vpdata, 'features': featdata}

    with open(pickle_path, 'wb') as pklfile:
        pickle.dump(each_faction_dataset, pklfile)
if __name__ == '__main__':
    # Command-line entry point: load the YAML params file named by --params
    # and run the preparation step with its contents.
    parser = argparse.ArgumentParser(description='Input DVC params.')
    # Required: without it the path would be None and open() would fail with
    # a TypeError instead of a clear usage message.
    parser.add_argument('--params', type=str, required=True)
    args = parser.parse_args()
    with open(args.params, 'r') as fd:
        params = yaml.safe_load(fd)
    main(params)