# expense_tagging / app.py
# nirmalya8's picture
# Updated Readme
# d52fe35
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import pickle
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from src.tfidf import set_paths, read_data, preprocess, training_utils, tfidf
from src.check import find_closest_match
from src.check import is_str_in
#tfidf = TfidfVectorizer(max_features=300, stop_words="english")
import streamlit as st
# Page header shown at the top of the Streamlit app.
st.title("Expense Tagging")
st.subheader("Upload a txt file with each line containing a brand, we'll tell you their categories")

# Load the table of known brands. `brands` and `categories` are parallel
# lists: position i in both comes from the same JSON entry, so an index into
# `brands` can be used to look up the matching category.
file_name = "src/brands.json"
# Explicit encoding: without it open() falls back to the platform's locale
# encoding, which may mis-decode non-ASCII brand names.
with open(file_name, 'r', encoding="utf-8") as f:
    data = json.load(f)
brands = [entry["name"] for entry in data.values()]
categories = [entry["category"] for entry in data.values()]
#i = input()
#print(len(brands), len(categories))
#bo,ind = is_str_in(i,brands)
#a,b,_,_ = find_closest_match(i,brands)
#print(a,b,brands[b])
#if bo:
# print(categories[ind])
#print(categories[b])
# Load the trained classifier and the fitted TF-IDF vectorizer from disk.
# NOTE(security): pickle.load can execute arbitrary code embedded in the
# file; only load artefacts that ship with this repository.
filename = 'src/Models/final_lr1.sav'
# Context manager so the file handle is closed once the model is loaded.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
with open('src/Models/tfidf2.pickle', 'rb') as to_read:
    fitted_tfidf = pickle.load(to_read)

# Translates the integer class predicted by the model into a category label.
map_dict = {
    0: "Food and Groceries",
    1: "Medical and Healthcare",
    2: "Education",
    3: "Lifestyle and Entertainment",
    4: "Travel & Transportation",
    5: "Clothing",
}
def predict_model(brand):
    """Return the category for a single brand name.

    The brand is first checked against the known ``brands`` list through
    ``is_str_in``; on a hit, the entry of ``categories`` at the reported
    index is returned. Otherwise the text is vectorised with the fitted
    TF-IDF transformer and the classifier's first prediction is translated
    to a label through ``map_dict``.
    """
    found, position = is_str_in(brand, brands)
    if found:
        return categories[position]

    # Unknown brand: fall back to the trained model.
    features = fitted_tfidf.transform([brand])
    predicted = loaded_model.predict(features)
    return map_dict[predicted[0]]
# print(loaded_model.predict(w))
#out = categories[out]
import time
# brand = st.text_input("Enter the name of the brand")
# submit = st.form_submit_button('Submit')
# Batch prediction: read an uploaded text file (one brand per line), predict
# a category for every brand and offer the results as a download.
uploaded_file = st.file_uploader("Choose a file")
if uploaded_file is not None:
    # "utf-8-sig" decodes plain UTF-8 as well, but also drops a leading
    # byte-order mark; with plain "utf-8" the BOM would stay attached to the
    # first brand name and str.strip() would not remove it.
    raw_lines = uploaded_file.getvalue().decode('utf-8-sig').splitlines()
    # Trim surrounding whitespace and skip blank lines so they are not sent
    # to the model and do not produce " -> <category>" rows.
    brand_list = [line.strip() for line in raw_lines if line.strip()]

    st.subheader("Output File")
    with st.spinner(text="This may take a moment..."):
        # NOTE(review): fixed 2 s pause kept from the original; presumably
        # there so the spinner is visible -- confirm whether it is needed.
        time.sleep(2)
        out_list = [f"{brand} -> {predict_model(brand)}" for brand in brand_list]
    # One "brand -> category" row per line of the downloadable output.
    out = "\n".join(out_list)
    st.download_button('Download Outputs', out)
#'''
#while True:
#w =input()
#if w == 'b':
# break
#'''