Spaces:
Runtime error
Runtime error
#!/Users/pranab/Tools/anaconda/bin/python
# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.
# Package imports
import os | |
import sys | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import sklearn as sk | |
import sklearn.linear_model | |
import matplotlib | |
import random | |
import jprops | |
from sklearn.externals import joblib | |
from sklearn.ensemble import BaggingClassifier | |
from random import randint | |
# the path of the config properties file is the one required argument
if not sys.argv[1:]:
    print("usage: ./svm.py <config_properties_file>")
    sys.exit()
#train by bagging
def train_bagging():
    """
    Trains a bagging ensemble whose base estimator comes from build_model().

    Works on the module level training data (XC, yc) and the module level
    bagging settings. When persist_model is set, every fitted member of the
    ensemble is dumped to its own model file, numbered from 1. Finally the
    error of the ensemble, scored on the same data it was fitted on, is printed.
    """
    base = build_model()
    ensemble = BaggingClassifier(base_estimator=base, n_estimators=bagging_num_estimator,
        max_samples=bagging_sample_fraction, oob_score=bagging_use_oob)

    #train model
    ensemble.fit(XC, yc)

    #persist model, one file per ensemble member
    if persist_model:
        for seq, member in enumerate(ensemble.estimators_, 1):
            model_file = model_file_directory + "/" + model_file_prefix + "_" + str(seq) + ".mod"
            joblib.dump(member, model_file)

    accuracy = ensemble.score(XC, yc)
    print("average error %.3f" % (1.0 - accuracy))
#linear k fold validation
def train_kfold_validation(nfold):
    """
    Runs k fold validation over the module level training data (XC, yc).

    Depending on the module level flag native_kfold_validation, either the
    library cross validation scorer is used and the average error printed
    here, or the work is handed to train_kfold_validation_ext().

    nfold : number of folds
    """
    if not native_kfold_validation:
        print("extended linear kfold validation")
        train_kfold_validation_ext(nfold)
        return

    print("native linear kfold validation")
    model = build_model()
    fold_scores = sk.cross_validation.cross_val_score(model, XC, yc, cv=nfold)
    mean_score = np.mean(fold_scores)
    print("average error %.3f" % (1.0 - mean_score))
#linear k fold validation
def train_kfold_validation_ext(nfold):
    """
    Hand rolled k fold validation that also reports false positive and
    false negative rates.

    The data is walked in nfold consecutive windows of equal length. For each
    window the single model built up front is refitted on the rows outside the
    window and validated on the rows inside it. When persist_model is set the
    model fitted for fold i is dumped to a file numbered i + 1. The averages
    of the per fold rates are printed at the end.

    nfold : number of folds
    """
    model = build_model()

    # explicit floor division: the fold length is used as a slice bound and
    # must stay an integer. Rows left over after nfold * length are never
    # part of a validation window.
    length = dsize // nfold
    offset = 0
    errors = []
    fp_errors = []
    fn_errors = []

    for i in range(nfold):
        print("....Next fold %d" % (i))

        #split data
        (XV, yv, X, y) = split_data(offset, length)
        dvsize = len(XV)

        #train model
        model.fit(X, y)

        #persist model
        if persist_model:
            model_file = model_file_directory + "/" + model_file_prefix + "_" + str(i + 1) + ".mod"
            joblib.dump(model, model_file)

        #print support vectors
        print_support_vectors(model)

        #predict
        print("making predictions...")
        yp = model.predict(XV)

        #show prediction output
        (er, fp_er, fn_er) = validate(dvsize, yv, yp)
        errors.append(er)
        fp_errors.append(fp_er)
        fn_errors.append(fn_er)

        # slide the validation window to the next fold
        offset += length

    #average error
    av_error = np.mean(errors)
    av_fp_error = np.mean(fp_errors)
    av_fn_error = np.mean(fn_errors)
    print("average error %.3f false positive error %.3f false negative error %.3f" % (av_error, av_fp_error, av_fn_error))
# random k fold validation
def train_rfold_validation(nfold, niter):
    """
    Runs repeated random split validation over the module level data (XC, yc).

    Depending on the module level flag native_rfold_validation, either the
    library train/test splitter is used here, or the work is handed to
    train_rfold_validation_ext().

    nfold : the share of rows held out for scoring in each iteration is 1 / nfold
    niter : number of random splits to evaluate
    """
    if not native_rfold_validation:
        print("extended random kfold validation")
        train_rfold_validation_ext(nfold, niter)
        return

    print("native random kfold validation")
    holdout_fraction = 1.0 / nfold
    scores = []
    for _ in range(niter):
        # a fresh random seed per iteration gives a different split each time
        seed = randint(1, 100)
        X, XV, y, yv = sk.cross_validation.train_test_split(XC, yc, test_size=holdout_fraction, random_state=seed)
        model = build_model()
        model.fit(X, y)
        scores.append(model.score(XV, yv))

    print(scores)
    mean_score = np.mean(scores)
    print("average error %.3f" % (1.0 - mean_score))
# random k fold validation
def train_rfold_validation_ext(nfold, niter):
    """
    Hand rolled random window validation that also reports false positive
    and false negative rates.

    In each iteration a validation window of dsize // nfold rows is placed at
    a random offset, a new model is built and fitted on the rows outside the
    window and validated on the rows inside it. When persist_model is set the
    model of iteration i is dumped to a file numbered i + 1. The averages of
    the per iteration rates are printed at the end.

    nfold : the validation window covers 1 / nfold of the data
    niter : number of random windows to evaluate
    """
    # largest start position, as a fraction of the data, with a small margin
    # so that the window starts before the last 1 / nfold of the rows
    max_offset_frac = 1.0 - 1.0 / nfold
    max_offset_frac -= .01

    # explicit floor division: the window length is used as a slice bound
    # and must stay an integer
    length = dsize // nfold
    errors = []
    fp_errors = []
    fn_errors = []

    for i in range(niter):
        print("...Next iteration %d" % (i))
        offset = int(dsize * random.random() * max_offset_frac)
        print("offset: %d length: %d" % (offset, length))
        (XV, yv, X, y) = split_data(offset, length)
        dvsize = len(XV)

        #build model
        model = build_model()

        #train model
        model.fit(X, y)

        #persist model
        if persist_model:
            model_file = model_file_directory + "/" + model_file_prefix + "_" + str(i + 1) + ".mod"
            print("saving model file " + model_file)
            joblib.dump(model, model_file)

        #print support vectors
        print_support_vectors(model)

        #predict
        print("making predictions...")
        yp = model.predict(XV)

        #show prediction output
        (er, fp_er, fn_er) = validate(dvsize, yv, yp)
        errors.append(er)
        fp_errors.append(fp_er)
        fn_errors.append(fn_er)

    av_error = np.mean(errors)
    av_fp_error = np.mean(fp_errors)
    av_fn_error = np.mean(fn_errors)
    print("average error %.3f false positive error %.3f false negative error %.3f" % (av_error, av_fp_error, av_fn_error))
# make predictions
def predict():
    """
    Predicts a class for every row of the module level feature matrix X by
    majority vote over the persisted models.

    Model files numbered 1 .. num_models are loaded from the configured model
    directory. Each model votes per row; a prediction of 0 is a vote for
    class 0 and any other prediction is a vote for class 1, for every model
    alike. For each row the features, the winning class and its vote count
    are printed. A tie goes to class 1.

    Raises ValueError when num_models is less than 1, since there would be
    no votes to count.
    """
    if num_models < 1:
        raise ValueError("at least one model is needed for prediction")

    psize = len(X)
    # number of models that voted for class 1, per row
    votes_for_one = [0] * psize

    #all models
    for i in range(num_models):
        model_file = model_file_directory + "/" + model_file_prefix + "_" + str(i + 1) + ".mod"
        print("loading model file " + model_file)
        # NOTE: joblib.load unpickles the file, only point this at model
        # files from a trusted source
        model = joblib.load(model_file)
        yp = model.predict(X)
        for j in range(psize):
            if yp[j] != 0:
                votes_for_one[j] += 1

    # predict based on majority vote
    print("here are the predictions")
    for k in range(psize):
        ones = votes_for_one[k]
        zeros = num_models - ones
        if zeros > ones:
            y = 0
            majority = zeros
        else:
            y = 1
            majority = ones
        print(X[k])
        print("prediction %d majority count %d" % (y, majority))
#builds model
def build_model():
    """
    Creates an unfitted SVM estimator according to the module level settings.

    algo selects the estimator: "svc", "nusvc" or "linearsvc". For the first
    two the kernel function is always passed, gamma is passed for the poly,
    rbf and sigmoid kernels and the degree only for the poly kernel. The
    penalty is passed to SVC only. Any other algo value prints a message and
    exits the process.

    Returns the new estimator.
    """
    print("building model...")
    if algo == "linearsvc":
        return sk.svm.LinearSVC()

    if algo not in ("svc", "nusvc"):
        print("invalid svm algorithm")
        sys.exit()

    # keyword arguments that depend on the kernel only
    params = {"kernel": kernel_fun}
    if kernel_fun in ("poly", "rbf", "sigmoid"):
        params["gamma"] = kernel_coeff
    if kernel_fun == "poly":
        params["degree"] = poly_degree

    if algo == "svc":
        params["C"] = penalty
        return sk.svm.SVC(**params)
    return sk.svm.NuSVC(**params)
#splits data into training and validation sets
def split_data(offset, length):
    """
    Cuts a validation window out of the module level data (XC, yc).

    offset : index of the first validation row
    length : number of validation rows; the window is clipped at the end of
             the data

    Returns (XV, yv, X, y): validation features, validation labels (a list),
    and the features and labels of all rows outside the window.
    """
    print("splitting data...")

    # work on copies so the module level data is left alone
    features = np.copy(XC)
    labels = list(yc)

    # validation set
    lo = offset
    hi = min(lo + length, len(yc))
    window = np.s_[lo:hi:1]
    XV = features[window]
    yv = labels[window]
    print("data size %d validation data size %d" % (dsize, len(XV)))

    # training set is whatever remains after removing the window
    X = np.delete(features, window, 0)
    y = np.delete(labels, window, 0)
    return (XV, yv, X, y)
#print support vectors
def print_support_vectors(model):
    """
    Prints support vector details of a fitted model.

    Nothing is printed when the module level algo is "linearsvc". Otherwise
    the support vectors are printed when print_sup_vectors is set, followed
    by the support vector counts held in model.n_support_.

    model : fitted estimator
    """
    if algo == "linearsvc":
        return

    if print_sup_vectors:
        print("showing support vectors...")
        print(model.support_vectors_)

    # NOTE(review): nesting reconstructed from a flattened source; the count
    # is taken to be printed whether or not print_sup_vectors is set - confirm
    print("num of support vectors")
    print(model.n_support_)
#prints prediction output
def validate(dvsize,yv,yp):
    """
    Compares predicted labels with actual labels and prints the outcome rates.

    dvsize : number of validation rows to compare
    yv     : actual labels
    yp     : predicted labels

    A row counts as true positive when both labels are 1, false positive when
    predicted 1 and actual 0, true negative when both are 0 and false
    negative in every other case. All rates are fractions of dvsize. With an
    empty validation set (dvsize of 0) all rates are reported as 0.0 instead
    of failing on a division by zero.

    Returns (error rate, false positive rate, false negative rate).
    """
    print("showing predictions...")
    err_count = 0
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    for r in range(dvsize):
        actual = yv[r]
        predicted = yp[r]
        if actual != predicted:
            err_count += 1

        if predicted == 1 and actual == 1:
            tp += 1
        elif predicted == 1 and actual == 0:
            fp += 1
        elif predicted == 0 and actual == 0:
            tn += 1
        else:
            fn += 1

    def rate(count):
        # fraction of the validation rows, 0.0 when there are no rows
        return float(count) / dvsize if dvsize else 0.0

    er = rate(err_count)
    fp_er = rate(fp)
    fn_er = rate(fn)
    print("error %.3f" % (er))
    print("true positive : %.3f" % (rate(tp)))
    print("false positive: %.3f" % (fp_er))
    print("true negative : %.3f" % (rate(tn)))
    print("false negative: %.3f" % (fn_er))
    return (er, fp_er, fn_er)
# load configuration
def getConfigs(configFile):
    """
    Reads a properties file into a dictionary, echoing every entry.

    configFile : path of the properties file

    Returns a dict of property key to property value, as produced by
    jprops.iter_properties.
    """
    print("using following configurations")
    with open(configFile) as fp:
        entries = []
        for key, value in jprops.iter_properties(fp):
            print("%s %s" % (key, value))
            entries.append((key, value))
    # for a repeated key the last entry wins
    return dict(entries)
# load configuration
configs = getConfigs(sys.argv[1])
mode = configs["common.mode"]

# The names assigned below are module level on purpose: the functions in
# this file read them as globals.
if mode == "train":
    #train
    print("running in train mode")
    data_file = configs["train.data.file"]
    feat_field_indices = configs["train.data.feature.fields"].split(",")
    feat_field_indices = [int(a) for a in feat_field_indices]
    class_field_index = int(configs["train.data.class.field"])
    preprocess = configs["common.preprocessing"]
    validation = configs["train.validation"]
    num_folds = int(configs["train.num.folds"])
    num_iter = int(configs["train.num.iter"])
    algo = configs["train.algorithm"]
    kernel_fun = configs["train.kernel.function"]
    poly_degree = int(configs["train.poly.degree"])

    # a negative penalty or gamma in the config selects the default
    penalty = float(configs["train.penalty"])
    if penalty < 0:
        penalty = 1.0
        print("using default for penalty")
    kernel_coeff = float(configs["train.gamma"])
    if kernel_coeff < 0:
        kernel_coeff = 'auto'
        print("using default for gamma")

    print_sup_vectors = configs["train.print.sup.vectors"].lower() == "true"
    persist_model = configs["train.persist.model"].lower() == "true"
    model_file_directory = configs["common.model.directory"]
    model_file_prefix = configs["common.model.file.prefix"]
    print(feat_field_indices)

    #extract feature fields
    d = np.loadtxt(data_file, delimiter=',')
    dsize = len(d)
    XC = d[:,feat_field_indices]

    #preprocess features
    if (preprocess == "scale"):
        XC = sk.preprocessing.scale(XC)
    elif (preprocess == "normalize"):
        XC = sk.preprocessing.normalize(XC, norm='l2')
    else:
        print("no preprocessing done")

    #extract output field as a flat list of int labels
    yc = d[:,[class_field_index]]
    yc = yc.reshape(dsize)
    yc = [int(a) for a in yc]

    # train model
    if validation == "kfold":
        native_kfold_validation = configs["train.native.kfold.validation"].lower() == "true"
        train_kfold_validation(num_folds)
    elif validation == "rfold":
        native_rfold_validation = configs["train.native.rfold.validation"].lower() == "true"
        train_rfold_validation(num_folds,num_iter)
    elif validation == "bagging":
        bagging_num_estimator = int(configs["train.bagging.num.estimators"])
        bagging_sample_fraction = float(configs["train.bagging.sample.fraction"])
        # The flag used to be read from the sample fraction key, a numeric
        # value that never equals "true", so out of bag scoring could not be
        # switched on. It now has its own key and stays off when the key is
        # absent, which matches the earlier effective behaviour.
        bagging_use_oob = configs.get("train.bagging.use.oob", "false").lower() == "true"
        train_bagging()
    else:
        print("invalid training validation method")
        sys.exit()
else:
    #predict
    print("running in prediction mode")
    pred_data_file = configs["pred.data.file"]
    pred_feat_field_indices = configs["pred.data.feature.fields"].split(",")
    pred_feat_field_indices = [int(a) for a in pred_feat_field_indices]
    preprocess = configs["common.preprocessing"]
    num_models = int(configs["pred.num.models"])
    model_file_directory = configs["common.model.directory"]
    model_file_prefix = configs["common.model.file.prefix"]

    #extract feature fields
    pd = np.loadtxt(pred_data_file, delimiter=',')
    pdsize = len(pd)
    X = pd[:,pred_feat_field_indices]

    #preprocess features
    if (preprocess == "scale"):
        X = sk.preprocessing.scale(X)
    elif (preprocess == "normalize"):
        X = sk.preprocessing.normalize(X, norm='l2')
    else:
        print("no preprocessing done")

    predict()