import io
import re
import string
from string import digits

import joblib
import pandas as pd
import pickle
import streamlit as st
from sentence_transformers import SentenceTransformer
# Create a Streamlit app
st.title("Text Classification and Excel Processing App")

# File upload for Excel file
uploaded_file = st.file_uploader("Upload an Excel file", type=["xlsx"])
def pre_processing(data_frame):
    """Lowercase, expand contractions, and strip punctuation/digits from 'Claim Description'."""
    text = data_frame['Claim Description']

    # Lowercase all characters
    text = text.apply(lambda x: x.lower())

    # Expand common English contractions (order matters: "won't" before the generic "n't")
    contractions = [
        ("won't", "will not"), ("can't", "can not"), ("n't", " not"),
        ("'re", " are"), ("'s", " is"), ("'d", " would"), ("'ll", " will"),
        ("'t", " not"), ("'ve", " have"), ("'m", " am"),
    ]
    for pattern, replacement in contractions:
        text = text.apply(lambda x: re.sub(pattern, replacement, x))

    # Remove any remaining quotes
    text = text.apply(lambda x: re.sub("'", '', x))

    # Remove all special characters
    exclude = set(string.punctuation)  # Set of all special characters
    text = text.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

    # Remove all numbers from text
    remove_digits = str.maketrans('', '', digits)
    text = text.apply(lambda x: x.translate(remove_digits))

    # Remove leftover separators, then trim and collapse extra spaces
    text = text.apply(lambda x: re.sub(r'[-_.:;\[\]|,]', '', x))
    text = text.apply(lambda x: re.sub(' +', ' ', x.strip()))

    data_frame['Claim Description'] = text
    return data_frame
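
# Example (illustrative input, not from the training data): pre_processing on
#   pd.DataFrame({'Claim Description': ["IV won't stop; hit claimant's car #12!"]})
# leaves the column as "iv will not stop hit claimant is car".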
# Load the two-stage coverage-code classifiers and their count vectorizers
step_1_model_path = "output/lr_step_1.pickle"
step_2_model_path = "output/lr_basemodel_step_2.pickle"
with open(step_1_model_path, 'rb') as f:
    step_1_model = pickle.load(f)
with open(step_2_model_path, 'rb') as f:
    step_2_model = pickle.load(f)
count_vector_step_1 = joblib.load("output/count_vector_step_1.pkl")
count_vector_step_2 = joblib.load("output/count_vector_step_2.pkl")
fewer_class_dict = joblib.load("output/fewer_class_dictionary.pkl")

# The accident-source classifier operates on sentence-transformer embeddings
acc_src_model = joblib.load("output/bert_acc_src.pickle")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
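
# NOTE: the paths above assume a prior training run saved its artifacts under ./output;
# loading fails at startup with FileNotFoundError if any of those files is missing.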
def predict(model_1, model_2, final_dict, query):
    """Two-stage prediction: a coarse model first, then a refinement model for 'med'."""
    # The manual override table takes precedence over both models
    # (assumption: final_dict maps a preprocessed query string to its final label)
    if query in final_dict:
        return final_dict[query]
    test_1 = count_vector_step_1.transform([query])
    y_pred = model_1.predict(test_1)
    # Queries routed to the 'med' bucket get a second, finer-grained model
    if y_pred[0] == 'med':
        test_2 = count_vector_step_2.transform([query])
        y_pred = model_2.predict(test_2)
    return y_pred[0]
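
# Example (hypothetical query text): predict(step_1_model, step_2_model,
#   fewer_class_dict, "claimant slipped on wet floor")
# returns a single coverage-code label from the fitted classifiers.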
if uploaded_file is not None:
    # Read the uploaded Excel file
    excel_data = pd.read_excel(uploaded_file)
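    # Guard (assumption): everything below expects the claim text in a
    # 'Claim Description' column, so fail fast if it is absent.
    if 'Claim Description' not in excel_data.columns:
        st.error("The uploaded file must contain a 'Claim Description' column.")
        st.stop()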
    # Predict a coverage code for every claim description
    final_result = []
    print('Preprocessing Started')
    test_data = pre_processing(excel_data)
    x_test = test_data['Claim Description']
    print('Prediction Started')
    for query in x_test:
        result = predict(step_1_model, step_2_model, fewer_class_dict, query)
        final_result.append(result)
    excel_data['predicted_coverage_code'] = final_result

    # Predict the accident source from sentence-transformer embeddings
    X_bert_enc = model.encode(x_test.values, show_progress_bar=True)
    accident_source_pred = acc_src_model.predict(X_bert_enc)
    excel_data['predicted_accident_src'] = accident_source_pred

    st.dataframe(excel_data)  # Display the processed data
    # Offer the processed data as an Excel download
    buffer = io.BytesIO()
    excel_data.to_excel(buffer, index=False)
    st.download_button(
        label="Download Processed Data",
        data=buffer.getvalue(),
        file_name="processed_data.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )

    # Signal that processing has finished
    st.write("Done")