Spaces:
Runtime error
Runtime error
#imports | |
import os | |
import pandas as pd | |
import numpy as np | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.model_selection import train_test_split | |
from sklearn.metrics import classification_report | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.neighbors import KNeighborsClassifier | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.ensemble import VotingClassifier | |
from xgboost import XGBClassifier | |
from time import sleep | |
from sklearn.model_selection import KFold | |
from sklearn import metrics | |
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe | |
from lightgbm import LGBMClassifier | |
import pickle | |
import gensim | |
from gensim.models import Word2Vec | |
def set_paths(): | |
print("[+] Setting paths...") | |
base_path = os.getcwd() | |
data_path = os.path.join(base_path,'Data') | |
models_path = os.path.join(base_path,'Models') | |
file_path = os.path.join(data_path,'Consolidated Expense Tagging.xlsx') | |
return file_path | |
def read_data(file_path): | |
sleep(2) | |
print("[+] Reading data...") | |
xls = pd.ExcelFile(file_path) | |
data = pd.read_excel(xls) | |
print(data.head()) | |
print(data["Category"].value_counts()) | |
return data | |
#Preprocessing the data | |
def preprocess(data): | |
sleep(2) | |
print("[+] Preprocessing...") | |
map_dict = {"Food and Groceries": 0, "Medical and Healthcare": 1,"Education":2,"Lifestyle and Entertainment":3,"Travel & Transportation":4,"Clothing":5}#,"Eye":6,"Shoe":7}#"Housing and Utilities":3 | |
data["Category"] = data["Category"].map(map_dict) | |
print(map_dict) | |
return data | |
#train-test split | |
def training_utils(data): | |
sleep(2) | |
print("[+] Splitting data...") | |
xtrain, xtest, ytrain, ytest = train_test_split( | |
data['Name'], | |
data["Category"], | |
test_size=0.30, | |
random_state=60, | |
stratify=data["Category"], | |
) | |
return xtrain,xtest,ytrain,ytest | |
def word_vec(xtrain,xtest): | |
print("[+] Vectorizing...") | |
w2v = Word2Vec(data, min_count = 1, vector_size = 100,window = 5, sg = 1) | |
train_df = pd.DataFrame( | |
tfidf.transform(xtrain).toarray(), columns=tfidf.vocabulary_ | |
) | |
test_df = pd.DataFrame( | |
tfidf.transform(xtest).toarray(), columns=tfidf.vocabulary_ | |
) | |
# pickle.dump(tfidf, open("tfidf.pickle", "wb")) | |
return tfidf,train_df, test_df | |
#modeling | |
def tfidf(xtrain,xtest): | |
sleep(2) | |
print("[+] Vectorizing...") | |
tfidf = TfidfVectorizer(min_df = 5,sublinear_tf=True ,lowercase=True,max_features=300, stop_words="english") | |
tfidf = tfidf.fit(xtrain) | |
train_df = pd.DataFrame( | |
tfidf.transform(xtrain).toarray(), columns=tfidf.vocabulary_ | |
) | |
test_df = pd.DataFrame( | |
tfidf.transform(xtest).toarray(), columns=tfidf.vocabulary_ | |
) | |
# pickle.dump(tfidf, open("tfidf.pickle", "wb")) | |
return tfidf,train_df, test_df | |
def fit_model(model,train_df,test_df,y_train,ytest): | |
sleep(2) | |
print("[+] Fitting Model...") | |
model.fit(train_df, y_train) | |
preds = model.predict(test_df) | |
print(model) | |
print(classification_report(ytest, preds)) | |
return model | |
def tune_hyperparameters(space): | |
clf= XGBClassifier( | |
n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'], | |
reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']), | |
colsample_bytree=int(space['colsample_bytree'])) | |
evaluation = [( train_df, y_train), ( test_df, ytest)] | |
clf.fit(train_df, y_train, | |
eval_set=evaluation, eval_metric="mlogloss",verbose=False) | |
pred = clf.predict(test_df) | |
accuracy = metrics.accuracy_score(ytest, pred) | |
print ("SCORE:", accuracy) | |
return {'loss': -accuracy, 'status': STATUS_OK } | |
if __name__=="__main__": | |
file_path = set_paths() | |
data = read_data(file_path) | |
data = preprocess(data) | |
xtrain,xtest, y_train, ytest = training_utils(data) | |
fitted_tfidf, train_df,test_df = tfidf(xtrain,xtest) | |
st_x= StandardScaler() | |
train_df = st_x.fit_transform(train_df) | |
test_df = st_x.transform(test_df) | |
# model = KNeighborsClassifier(n_neighbors=63) | |
# knn = fit_model(model,train_df,test_df,y_train,ytest) | |
#model = DecisionTreeClassifier() | |
#dt = fit_model(model,train_df,test_df,y_train,ytest) | |
#model = RandomForestClassifier(n_estimators= 300, criterion="entropy") | |
# rf = fit_model(model,train_df,test_df,y_train,ytest) | |
# final_model = VotingClassifier(estimators=[('dt',dt),('rf',rf)]) | |
# vote = fit_model(final_model,train_df,test_df,y_train,ytest) | |
# model = XGBClassifier(booster = 'dart') | |
# xgb = fit_model(model,train_df,test_df,y_train,ytest) | |
# model = XGBClassifier() | |
# xgb = fit_model(model,train_df,test_df,y_train,ytest) | |
a=0 | |
# for row in test_df: | |
# print(xgb.predict_proba(row.reshape(1,-1))) | |
# print(xgb.predict(row.reshape(1,-1))) | |
# a+=1 | |
# if a ==10: | |
# break | |
# pickle.dump(fitted_tfidf,open("tfidf2.pickle",'wb')) | |
w = ["Arun Clothes","Pooja Medical","ABC Tours and Travels", "Dev Super Mart"] | |
w1 = [] | |
for i in w: | |
w1.append(fitted_tfidf.transform([i])) | |
print(w1) | |
w2 = [] | |
for i in w: | |
w2.append(fitted_tfidf.transform([i])) | |
print(w2) | |
from sklearn.linear_model import LogisticRegression | |
#multi_class="multinomial" | |
model = LogisticRegression(solver='saga', max_iter=300,multi_class="multinomial",warm_start=True) | |
lr = fit_model(model,train_df,test_df,y_train,ytest) | |
print("LR") | |
for i in range(len(w1)): | |
print(w[i]) | |
print(lr.predict(w1[i])) | |
print(lr.predict_proba(w1[i])) | |
#model = KNeighborsClassifier(n_neighbors=75) | |
#knn = fit_model(model,train_df,test_df,y_train,ytest) | |
#print("KNN") | |
#for i in range(len(w1)): | |
# print(w[i]) | |
# print(knn.predict(w1[i])) | |
# print(knn.predict_proba(w1[i])) | |
#print("DT") | |
#for i in range(len(w1)): | |
# print(w[i]) | |
# print(dt.predict(w1[i])) | |
# print(dt.predict_proba(w1[i])) | |
filename = 'final_lr1.sav' | |
# pickle.dump(lr, open(filename, 'wb')) | |
# model = XGBClassifier(colsample_bytree=0.6059329304964837, | |
# gamma=2.361923398781385, | |
# max_depth=12, | |
# min_child_weight=6, | |
# reg_alpha=41.0, | |
# reg_lambda=0.00474534836744336, | |
# booster = 'dart' | |
# ) | |
# xgb = fit_model(model,train_df,test_df,y_train,ytest) | |
# final_model = VotingClassifier(estimators=[('dt',dt),('knn',knn),('lr',lr)]) | |
# final_model = fit_model(final_model,train_df,test_df,y_train,ytest) | |
# print("Final") | |
# for i in range(len(w1)): | |
# print(w[i]) | |
# print(dt.predict_proba(w1[i])) | |
# trials = Trials() | |
# space={'max_depth': hp.quniform("max_depth", 3, 18, 1), | |
# 'gamma': hp.uniform ('gamma', 1,9), | |
# 'reg_alpha' : hp.quniform('reg_alpha', 40,180,1), | |
# 'reg_lambda' : hp.uniform('reg_lambda', 0,1), | |
# 'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1), | |
# 'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1), | |
# 'n_estimators': 180, | |
# 'seed': 0 | |
# } | |
# best_hyperparams = fmin(fn = tune_hyperparameters, | |
# space = space, | |
# algo = tpe.suggest, | |
# max_evals = 100, | |
# trials = trials) | |
# with open(r'hyperparams.txt', 'w') as fp: | |
# for k,v in best_hyperparams.items(): | |
# # write each item on a new line | |
# fp.write(f"{k}={v},\n") | |
# print('Done') | |