# expense_tagging/src/tfidf.py
#imports
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from time import sleep
from sklearn.model_selection import KFold
from sklearn import metrics
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from lightgbm import LGBMClassifier
import pickle
import gensim
from gensim.models import Word2Vec
def set_paths():
    """Build the path to the expense-tagging workbook relative to the current directory."""
    print("[+] Setting paths...")
    base_path = os.getcwd()
    data_path = os.path.join(base_path, 'Data')
    models_path = os.path.join(base_path, 'Models')  # reserved for saved models
    file_path = os.path.join(data_path, 'Consolidated Expense Tagging.xlsx')
    return file_path
def read_data(file_path):
    """Load the consolidated expense sheet and print a quick summary."""
    sleep(2)
    print("[+] Reading data...")
    xls = pd.ExcelFile(file_path)
    data = pd.read_excel(xls)
    print(data.head())
    print(data["Category"].value_counts())
    return data
# Preprocessing the data
def preprocess(data):
    """Encode the Category labels as the integer classes the classifiers expect."""
    sleep(2)
    print("[+] Preprocessing...")
    map_dict = {
        "Food and Groceries": 0,
        "Medical and Healthcare": 1,
        "Education": 2,
        "Lifestyle and Entertainment": 3,
        "Travel & Transportation": 4,
        "Clothing": 5,
    }  # earlier experiments also used "Eye": 6, "Shoe": 7, "Housing and Utilities": 3
    data["Category"] = data["Category"].map(map_dict)  # labels missing from map_dict become NaN
    print(map_dict)
    return data
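# Note (assumption): .map() turns any label missing from map_dict into NaN, which would
# break the stratified split below. If the sheet can contain extra labels, something like
# this keeps only the mapped rows:
#   data = data.dropna(subset=["Category"]).astype({"Category": int})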
#train-test split
def training_utils(data):
sleep(2)
print("[+] Splitting data...")
xtrain, xtest, ytrain, ytest = train_test_split(
data['Name'],
data["Category"],
test_size=0.30,
random_state=60,
stratify=data["Category"],
)
return xtrain,xtest,ytrain,ytest
# Alternative vectorizer: averaged skip-gram Word2Vec embeddings (not used in __main__).
def word_vec(xtrain, xtest):
    print("[+] Vectorizing...")
    train_tokens = [str(s).lower().split() for s in xtrain]
    test_tokens = [str(s).lower().split() for s in xtest]
    w2v = Word2Vec(train_tokens, min_count=1, vector_size=100, window=5, sg=1)

    def avg_vector(tokens):
        # Mean of the word vectors the model knows; a zero vector if none are known.
        vecs = [w2v.wv[t] for t in tokens if t in w2v.wv]
        return np.mean(vecs, axis=0) if vecs else np.zeros(w2v.vector_size)

    train_df = pd.DataFrame([avg_vector(t) for t in train_tokens])
    test_df = pd.DataFrame([avg_vector(t) for t in test_tokens])
    return w2v, train_df, test_df
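# Sketch (assumption): word_vec() could be swapped for tfidf() in __main__, e.g.
#   w2v_model, train_df, test_df = word_vec(xtrain, xtest)
# with the downstream scaling and fit_model() calls left unchanged.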
#modeling
def tfidf(xtrain, xtest):
    sleep(2)
    print("[+] Vectorizing...")
    tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True, lowercase=True,
                            max_features=300, stop_words="english")
    tfidf = tfidf.fit(xtrain)
    # Use get_feature_names_out() so column names line up with the column order of the
    # transformed matrix (vocabulary_ is an unordered term-to-index dict).
    feature_names = tfidf.get_feature_names_out()
    train_df = pd.DataFrame(tfidf.transform(xtrain).toarray(), columns=feature_names)
    test_df = pd.DataFrame(tfidf.transform(xtest).toarray(), columns=feature_names)
    # pickle.dump(tfidf, open("tfidf.pickle", "wb"))
    return tfidf, train_df, test_df
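# Sketch (assumption, mirrors the commented-out pickle.dump above): helpers for persisting
# and reloading the fitted vectorizer so inference code can rebuild the same 300-dimension
# feature space. The file name "tfidf.pickle" is illustrative.
def save_vectorizer(vectorizer, path="tfidf.pickle"):
    with open(path, "wb") as f:
        pickle.dump(vectorizer, f)

def load_vectorizer(path="tfidf.pickle"):
    with open(path, "rb") as f:
        return pickle.load(f)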
def fit_model(model, train_df, test_df, y_train, ytest):
    """Fit the given classifier, print its test-set classification report, and return it."""
    sleep(2)
    print("[+] Fitting Model...")
    model.fit(train_df, y_train)
    preds = model.predict(test_df)
    print(model)
    print(classification_report(ytest, preds))
    return model
def tune_hyperparameters(space):
    # Hyperopt objective: relies on train_df, test_df, y_train, ytest defined in __main__.
    clf = XGBClassifier(
        n_estimators=int(space['n_estimators']), max_depth=int(space['max_depth']),
        gamma=space['gamma'], reg_alpha=int(space['reg_alpha']),
        min_child_weight=int(space['min_child_weight']),
        colsample_bytree=space['colsample_bytree'])  # keep as float; int() would truncate it to 0
    evaluation = [(train_df, y_train), (test_df, ytest)]
    clf.fit(train_df, y_train,
            eval_set=evaluation, eval_metric="mlogloss", verbose=False)
    pred = clf.predict(test_df)
    accuracy = metrics.accuracy_score(ytest, pred)
    print("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK}
if __name__=="__main__":
file_path = set_paths()
data = read_data(file_path)
data = preprocess(data)
xtrain,xtest, y_train, ytest = training_utils(data)
fitted_tfidf, train_df,test_df = tfidf(xtrain,xtest)
st_x= StandardScaler()
train_df = st_x.fit_transform(train_df)
test_df = st_x.transform(test_df)
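    # Note (assumption): if the fitted classifier is pickled for later use, st_x (and
    # fitted_tfidf) would need to be pickled as well, so inference-time features are
    # scaled the same way as the training features.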
# model = KNeighborsClassifier(n_neighbors=63)
# knn = fit_model(model,train_df,test_df,y_train,ytest)
#model = DecisionTreeClassifier()
#dt = fit_model(model,train_df,test_df,y_train,ytest)
#model = RandomForestClassifier(n_estimators= 300, criterion="entropy")
# rf = fit_model(model,train_df,test_df,y_train,ytest)
# final_model = VotingClassifier(estimators=[('dt',dt),('rf',rf)])
# vote = fit_model(final_model,train_df,test_df,y_train,ytest)
# model = XGBClassifier(booster = 'dart')
# xgb = fit_model(model,train_df,test_df,y_train,ytest)
# model = XGBClassifier()
# xgb = fit_model(model,train_df,test_df,y_train,ytest)
a=0
# for row in test_df:
# print(xgb.predict_proba(row.reshape(1,-1)))
# print(xgb.predict(row.reshape(1,-1)))
# a+=1
# if a ==10:
# break
# pickle.dump(fitted_tfidf,open("tfidf2.pickle",'wb'))
w = ["Arun Clothes","Pooja Medical","ABC Tours and Travels", "Dev Super Mart"]
w1 = []
for i in w:
w1.append(fitted_tfidf.transform([i]))
print(w1)
w2 = []
for i in w:
w2.append(fitted_tfidf.transform([i]))
print(w2)
    # multi_class="multinomial"
    model = LogisticRegression(solver='saga', max_iter=300, multi_class="multinomial", warm_start=True)
lr = fit_model(model,train_df,test_df,y_train,ytest)
print("LR")
for i in range(len(w1)):
print(w[i])
print(lr.predict(w1[i]))
print(lr.predict_proba(w1[i]))
#model = KNeighborsClassifier(n_neighbors=75)
#knn = fit_model(model,train_df,test_df,y_train,ytest)
#print("KNN")
#for i in range(len(w1)):
# print(w[i])
# print(knn.predict(w1[i]))
# print(knn.predict_proba(w1[i]))
#print("DT")
#for i in range(len(w1)):
# print(w[i])
# print(dt.predict(w1[i]))
# print(dt.predict_proba(w1[i]))
filename = 'final_lr1.sav'
# pickle.dump(lr, open(filename, 'wb'))
# model = XGBClassifier(colsample_bytree=0.6059329304964837,
# gamma=2.361923398781385,
# max_depth=12,
# min_child_weight=6,
# reg_alpha=41.0,
# reg_lambda=0.00474534836744336,
# booster = 'dart'
# )
# xgb = fit_model(model,train_df,test_df,y_train,ytest)
# final_model = VotingClassifier(estimators=[('dt',dt),('knn',knn),('lr',lr)])
# final_model = fit_model(final_model,train_df,test_df,y_train,ytest)
# print("Final")
# for i in range(len(w1)):
# print(w[i])
# print(dt.predict_proba(w1[i]))
# trials = Trials()
# space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
# 'gamma': hp.uniform ('gamma', 1,9),
# 'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
# 'reg_lambda' : hp.uniform('reg_lambda', 0,1),
# 'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
# 'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
# 'n_estimators': 180,
# 'seed': 0
# }
# best_hyperparams = fmin(fn = tune_hyperparameters,
# space = space,
# algo = tpe.suggest,
# max_evals = 100,
# trials = trials)
# with open(r'hyperparams.txt', 'w') as fp:
# for k,v in best_hyperparams.items():
# # write each item on a new line
# fp.write(f"{k}={v},\n")
# print('Done')
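    # Sketch (assumption): how a downstream script might tag a new expense string with the
    # artifacts above. File names follow the commented-out dumps ("tfidf2.pickle",
    # "final_lr1.sav"); the fitted scaler would also need to be saved and reloaded.
    # vec = pickle.load(open("tfidf2.pickle", "rb"))
    # clf = pickle.load(open("final_lr1.sav", "rb"))
    # features = st_x.transform(vec.transform(["Dev Super Mart"]).toarray())
    # print(clf.predict(features), clf.predict_proba(features))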