#!/Users/pranab/Tools/anaconda/bin/python

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# SVM training / prediction driver.
#
# The single command line argument is a properties file. "common.mode"
# selects between training (k fold, random fold or bagging validation)
# and prediction by majority vote over previously persisted models.
#
# The functions below read their settings (algo, XC, yc, dsize, ...) from
# module level names that are assigned by the configuration section at the
# bottom of this file before any of them is called.
#
# NOTE: written for Python 2 and a scikit-learn release that still ships
# sklearn.cross_validation and sklearn.externals.joblib.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import sklearn.linear_model
import sklearn.preprocessing
import sklearn.svm
import matplotlib
import random
import jprops
from sklearn.externals import joblib
from sklearn.ensemble import BaggingClassifier
from random import randint

if len(sys.argv) < 2:
    print("usage: ./svm.py <config_properties_file>")
    sys.exit()


def _model_file_path(index):
    """Return the path of the persisted model for zero based position index.

    Files are numbered from 1, i.e. <directory>/<prefix>_<index + 1>.mod
    """
    return model_file_directory + "/" + model_file_prefix + "_" + str(index + 1) + ".mod"


def _report_average_errors(errors, fp_errors, fn_errors):
    """Print the mean of the per fold error rates collected by validate()."""
    av_error = np.mean(errors)
    av_fp_error = np.mean(fp_errors)
    av_fn_error = np.mean(fn_errors)
    print("average error %.3f  false positive error %.3f  false negative error %.3f"
        % (av_error, av_fp_error, av_fn_error))


#train by bagging
def train_bagging():
    """Train a bagging ensemble of SVM models on the full data set.

    Each fitted base estimator is optionally persisted to its own file.
    The reported error is measured on the training data itself.
    """
    model = build_model()
    bagging_model = BaggingClassifier(base_estimator=model, n_estimators=bagging_num_estimator,
        max_samples=bagging_sample_fraction, oob_score=bagging_use_oob)

    #train model
    bagging_model.fit(XC, yc)

    #persist model
    if persist_model:
        for index, estimator in enumerate(bagging_model.estimators_):
            joblib.dump(estimator, _model_file_path(index))

    score = bagging_model.score(XC, yc)
    print("average error %.3f" % (1.0 - score))


#linear k fold validation
def train_kfold_validation(nfold):
    """Run k fold validation, natively through scikit-learn or with the
    extended implementation that also reports false positive / negative rates.

    nfold -- number of folds
    """
    if native_kfold_validation:
        print("native linear kfold validation")
        model = build_model()
        scores = sk.cross_validation.cross_val_score(model, XC, yc, cv=nfold)
        av_score = np.mean(scores)
        print("average error %.3f" % (1.0 - av_score))
    else:
        print("extended linear kfold validation")
        train_kfold_validation_ext(nfold)


#linear k fold validation
def train_kfold_validation_ext(nfold):
    """K fold validation over consecutive, equally sized validation windows.

    nfold -- number of folds; each window holds dsize // nfold rows, so any
             remainder rows at the end are never used for validation
    """
    model = build_model()
    offset = 0
    # explicit floor division: the result is used as a slice length
    length = dsize // nfold
    errors = []
    fp_errors = []
    fn_errors = []

    for i in range(0, nfold):
        print("....Next fold %d" % (i))

        #split data
        (XV, yv, X, y) = split_data(offset, length)
        dvsize = len(XV)

        #train model
        model.fit(X, y)

        #persist model
        if persist_model:
            joblib.dump(model, _model_file_path(i))

        #print support vectors
        print_support_vectors(model)

        #predict
        print("making predictions...")
        yp = model.predict(XV)

        #show prediction output
        (er, fp_er, fn_er) = validate(dvsize, yv, yp)
        errors.append(er)
        fp_errors.append(fp_er)
        fn_errors.append(fn_er)

        offset += length

    #average error
    _report_average_errors(errors, fp_errors, fn_errors)


# random k fold validation
def train_rfold_validation(nfold, niter):
    """Run repeated random hold out validation, natively through scikit-learn
    or with the extended implementation.

    nfold -- the held out fraction of the data is 1 / nfold
    niter -- number of random splits to evaluate
    """
    if native_rfold_validation:
        print("native random kfold validation")
        test_fraction = 1.0 / nfold
        scores = []
        for i in range(0, niter):
            state = randint(1, 100)
            X, XV, y, yv = sk.cross_validation.train_test_split(XC, yc,
                test_size=test_fraction, random_state=state)
            model = build_model()
            model.fit(X, y)
            scores.append(model.score(XV, yv))

        print(scores)
        av_score = np.mean(scores)
        print("average error %.3f" % (1.0 - av_score))
    else:
        print("extended random kfold validation")
        train_rfold_validation_ext(nfold, niter)


# random k fold validation
def train_rfold_validation_ext(nfold, niter):
    """Repeated validation on a contiguous window placed at a random offset.

    nfold -- the window holds dsize // nfold rows
    niter -- number of iterations; a fresh model is built for each one
    """
    # keep the random offset low enough for a whole window to fit,
    # with a small extra margin
    max_offset_frac = 1.0 - 1.0 / nfold
    max_offset_frac -= .01
    # explicit floor division: the result is used as a slice length
    length = dsize // nfold

    errors = []
    fp_errors = []
    fn_errors = []
    for i in range(0, niter):
        print("...Next iteration %d" % (i))
        offset = int(dsize * random.random() * max_offset_frac)
        print("offset: %d  length: %d" % (offset, length))
        (XV, yv, X, y) = split_data(offset, length)
        dvsize = len(XV)

        #build model
        model = build_model()

        #train model
        model.fit(X, y)

        #persist model
        if persist_model:
            model_file = _model_file_path(i)
            print("saving model file " + model_file)
            joblib.dump(model, model_file)

        #print support vectors
        print_support_vectors(model)

        #predict
        print("making predictions...")
        yp = model.predict(XV)

        #show prediction output
        (er, fp_er, fn_er) = validate(dvsize, yv, yp)
        errors.append(er)
        fp_errors.append(fp_er)
        fn_errors.append(fn_er)

    _report_average_errors(errors, fp_errors, fn_errors)


# make predictions
def predict():
    """Load every persisted model, predict on X and print the majority vote.

    Votes are counted per row for the classes 0 and 1; a tie goes to class 1.
    """
    psize = len(X)
    class_counts = []

    #all models
    for i in range(0, num_models):
        model_file = _model_file_path(i)
        print("loading model file " + model_file)
        # NOTE(review): joblib.load unpickles the file, so model files must
        # come from a trusted location
        model = joblib.load(model_file)

        yp = model.predict(X)
        if i == 0:
            #initialize class counts
            for y in yp:
                if y == 0:
                    class_count = {0: 1, 1: 0}
                else:
                    class_count = {0: 0, 1: 1}
                class_counts.append(class_count)
        else:
            #increment class count
            for j in range(0, psize):
                class_counts[j][yp[j]] += 1

    # predict based on majority vote
    print("here are the predictions")
    for k in range(0, psize):
        class_count = class_counts[k]
        if class_count[0] > class_count[1]:
            y = 0
            majority = class_count[0]
        else:
            y = 1
            majority = class_count[1]

        print(X[k])
        print("prediction %d majority count %d" % (y, majority))


def _kernel_params():
    """Return the kernel related keyword arguments for SVC / NuSVC.

    degree is only passed for the poly kernel and gamma only for the
    poly, rbf and sigmoid kernels.
    """
    params = {"kernel": kernel_fun}
    if kernel_fun == "poly":
        params["degree"] = poly_degree
        params["gamma"] = kernel_coeff
    elif kernel_fun == "rbf" or kernel_fun == "sigmoid":
        params["gamma"] = kernel_coeff
    return params


#builds model
def build_model():
    """Create an unfitted classifier for the configured algorithm.

    Supported values of algo are "svc", "nusvc" and "linearsvc"; any other
    value prints an error and terminates the script.
    """
    #build model
    print("building model...")
    if algo == "svc":
        model = sk.svm.SVC(C=penalty, **_kernel_params())
    elif algo == "nusvc":
        model = sk.svm.NuSVC(**_kernel_params())
    elif algo == "linearsvc":
        # NOTE(review): the configured penalty is not passed to LinearSVC
        # - confirm that this is intended
        model = sk.svm.LinearSVC()
    else:
        print("invalid svm algorithm")
        sys.exit()
    return model


#splits data into training and validation sets
def split_data(offset, length):
    """Split XC / yc into a validation window and the remaining training rows.

    offset -- index of the first validation row
    length -- number of validation rows; the window is clipped at the end
              of the data

    Returns (XV, yv, X, y): validation features, validation labels,
    training features and training labels.
    """
    print("splitting data...")

    # validation set
    vlo = offset
    vup = min(vlo + length, len(yc))
    # plain slices are enough here; nothing downstream modifies them in place
    XV = XC[vlo:vup:1]
    yv = yc[vlo:vup:1]
    dvsize = len(XV)
    print("data size %d validation data size %d" % (dsize, dvsize))

    #training set: everything outside the validation window
    X = np.delete(XC, np.s_[vlo:vup:1], 0)
    y = np.delete(yc, np.s_[vlo:vup:1], 0)
    return (XV, yv, X, y)


#print support vectors
def print_support_vectors(model):
    """Print the support vectors (when enabled) and their per class count.

    Nothing is printed for linearsvc, where these attributes are not read.
    """
    if algo != "linearsvc":
        if print_sup_vectors:
            print("showing support vectors...")
            print(model.support_vectors_)
        print("num of support vectors")
        print(model.n_support_)


#prints prediction output
def validate(dvsize, yv, yp):
    """Compare actual and predicted labels and print the error breakdown.

    dvsize -- number of validation rows to compare
    yv     -- actual labels
    yp     -- predicted labels

    Returns (error rate, false positive rate, false negative rate), each as
    a fraction of dvsize. Any pair that is not a true positive, false
    positive or true negative is counted as a false negative.

    Raises ValueError when dvsize is 0, since no rate can be computed.
    """
    if dvsize == 0:
        raise ValueError("validation set is empty, can not compute error rates")

    print("showing predictions...")
    err_count = 0
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    for r in range(0, dvsize):
        actual = yv[r]
        predicted = yp[r]
        if actual != predicted:
            err_count += 1

        if predicted == 1 and actual == 1:
            tp += 1
        elif predicted == 1 and actual == 0:
            fp += 1
        elif predicted == 0 and actual == 0:
            tn += 1
        else:
            fn += 1

    er = float(err_count) / dvsize
    fp_er = float(fp) / dvsize
    fn_er = float(fn) / dvsize
    print("error %.3f" % (er))
    print("true positive : %.3f" % (float(tp) / dvsize))
    print("false positive: %.3f" % (fp_er))
    print("true negative : %.3f" % (float(tn) / dvsize))
    print("false negative: %.3f" % (fn_er))

    return (er, fp_er, fn_er)


# load configuration
def getConfigs(configFile):
    """Read the properties file configFile, echo every entry and return
    the entries as a dictionary of string values."""
    configs = {}
    print("using following configurations")
    with open(configFile) as fp:
        for key, value in jprops.iter_properties(fp):
            print("%s %s" % (key, value))
            configs[key] = value

    return configs


# load configuration
configs = getConfigs(sys.argv[1])
mode = configs["common.mode"]

if mode == "train":
    #train
    print("running in train mode")
    data_file = configs["train.data.file"]
    feat_field_indices = configs["train.data.feature.fields"].split(",")
    feat_field_indices = [int(a) for a in feat_field_indices]
    class_field_index = int(configs["train.data.class.field"])
    preprocess = configs["common.preprocessing"]
    validation = configs["train.validation"]
    num_folds = int(configs["train.num.folds"])
    num_iter = int(configs["train.num.iter"])
    algo = configs["train.algorithm"]
    kernel_fun = configs["train.kernel.function"]
    poly_degree = int(configs["train.poly.degree"])

    # a negative value selects the default
    penalty = float(configs["train.penalty"])
    if penalty < 0:
        penalty = 1.0
        print("using default for penalty")

    # a negative value selects the default
    kernel_coeff = float(configs["train.gamma"])
    if kernel_coeff < 0:
        kernel_coeff = 'auto'
        print("using default for gamma")

    print_sup_vectors = configs["train.print.sup.vectors"].lower() == "true"
    persist_model = configs["train.persist.model"].lower() == "true"
    model_file_directory = configs["common.model.directory"]
    model_file_prefix = configs["common.model.file.prefix"]

    print(feat_field_indices)

    #extract feature fields
    d = np.loadtxt(data_file, delimiter=',')
    dsize = len(d)
    XC = d[:, feat_field_indices]

    #preprocess features
    if preprocess == "scale":
        XC = sk.preprocessing.scale(XC)
    elif preprocess == "normalize":
        XC = sk.preprocessing.normalize(XC, norm='l2')
    else:
        print("no preprocessing done")

    #extract output field
    yc = d[:, [class_field_index]]
    yc = yc.reshape(dsize)
    yc = [int(a) for a in yc]

    # train model
    if validation == "kfold":
        native_kfold_validation = configs["train.native.kfold.validation"].lower() == "true"
        train_kfold_validation(num_folds)
    elif validation == "rfold":
        native_rfold_validation = configs["train.native.rfold.validation"].lower() == "true"
        train_rfold_validation(num_folds, num_iter)
    elif validation == "bagging":
        bagging_num_estimator = int(configs["train.bagging.num.estimators"])
        bagging_sample_fraction = float(configs["train.bagging.sample.fraction"])
        # The out of bag flag needs its own key; the sample fraction value
        # can never equal "true".
        # NOTE(review): the key name "train.bagging.use.oob" is assumed -
        # confirm against the properties file. A missing key leaves out of
        # bag scoring off.
        bagging_use_oob = configs.get("train.bagging.use.oob", "false").lower() == "true"
        train_bagging()
    else:
        print("invalid training validation method")
        sys.exit()

else:
    #predict
    print("running in prediction mode")
    pred_data_file = configs["pred.data.file"]
    pred_feat_field_indices = configs["pred.data.feature.fields"].split(",")
    pred_feat_field_indices = [int(a) for a in pred_feat_field_indices]
    preprocess = configs["common.preprocessing"]
    num_models = int(configs["pred.num.models"])
    model_file_directory = configs["common.model.directory"]
    model_file_prefix = configs["common.model.file.prefix"]

    #extract feature fields
    pred_data = np.loadtxt(pred_data_file, delimiter=',')
    pdsize = len(pred_data)
    X = pred_data[:, pred_feat_field_indices]

    #preprocess features
    if preprocess == "scale":
        X = sk.preprocessing.scale(X)
    elif preprocess == "normalize":
        X = sk.preprocessing.normalize(X, norm='l2')
    else:
        print("no preprocessing done")

    predict()