rushidarge committed
Commit 199b89f
1 Parent(s): 256a345

Upload 8 files

app.py ADDED
@@ -0,0 +1,112 @@
+ import streamlit as st
+ import pandas as pd
+ import pickle
+ import joblib
+ import re
+ import string
+ from string import digits
+ from sentence_transformers import SentenceTransformer
+
+ # Create a Streamlit app
+ st.title("Text Classification and Excel Processing App")
+
+ # File upload for Excel file
+ uploaded_file = st.file_uploader("Upload an Excel file", type=["xlsx"])
+
+ def pre_processing(data_frame):
+     # Lowercase all characters
+     data_frame['Claim Description'] = data_frame['Claim Description'].apply(lambda x: x.lower())
+
+     # Expand common English contractions
+     data_frame['Claim Description'] = data_frame['Claim Description'].apply(lambda x: re.sub(r"won't", "will not", x))
+     data_frame['Claim Description'] = data_frame['Claim Description'].apply(lambda x: re.sub(r"can't", "can not", x))
+     data_frame['Claim Description'] = data_frame['Claim Description'].apply(lambda x: re.sub(r"n't", " not", x))
+     data_frame['Claim Description'] = data_frame['Claim Description'].apply(lambda x: re.sub(r"'re", " are", x))
+     data_frame['Claim Description'] = data_frame['Claim Description'].apply(lambda x: re.sub(r"'s", " is", x))
+     data_frame['Claim Description'] = data_frame['Claim Description'].apply(lambda x: re.sub(r"'d", " would", x))
+     data_frame['Claim Description'] = data_frame['Claim Description'].apply(lambda x: re.sub(r"'ll", " will", x))
+     data_frame['Claim Description'] = data_frame['Claim Description'].apply(lambda x: re.sub(r"'t", " not", x))
+     data_frame['Claim Description'] = data_frame['Claim Description'].apply(lambda x: re.sub(r"'ve", " have", x))
+     data_frame['Claim Description'] = data_frame['Claim Description'].apply(lambda x: re.sub(r"'m", " am", x))
+
+     # Remove any remaining quotes
+     data_frame['Claim Description'] = data_frame['Claim Description'].apply(lambda x: re.sub("'", '', x))
+
+     # Remove all special characters (punctuation)
+     exclude = set(string.punctuation)
+     data_frame['Claim Description'] = data_frame['Claim Description'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
+
+     # Remove all digits from text
+     remove_digits = str.maketrans('', '', digits)
+     data_frame['Claim Description'] = data_frame['Claim Description'].apply(lambda x: x.translate(remove_digits))
+
+     # Remove leftover separator characters
+     data_frame['Claim Description'] = data_frame['Claim Description'].apply(lambda x: re.sub(r'[-_.:;\[\]\|,]', '', x))
+
+     # Trim and collapse extra whitespace
+     data_frame['Claim Description'] = data_frame['Claim Description'].apply(lambda x: x.strip())
+     data_frame['Claim Description'] = data_frame['Claim Description'].apply(lambda x: re.sub(" +", " ", x))
+
+     return data_frame
+
+ # Load the two-step coverage-code models, their count vectorizers, and the
+ # accident-source classifier trained on sentence-transformer embeddings
+ step_1_model_path = "output/lr_step_1.pickle"
+ step_2_model_path = "output/lr_basemodel_step_2.pickle"
+
+ step_1_model = pickle.load(open(step_1_model_path, 'rb'))
+ step_2_model = pickle.load(open(step_2_model_path, 'rb'))
+ count_vector_step_1 = joblib.load("output/count_vector_step_1.pkl")
+ count_vector_step_2 = joblib.load("output/count_vector_step_2.pkl")
+ fewer_class_dict = joblib.load("output/fewer_class_dictionary.pkl")
+ acc_src_model = joblib.load("output/bert_acc_src.pickle")
+ model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+
+ def predict(model_1, model_2, final_dict, query):
+     # Step 1: coarse prediction; 'med' claims are routed to the step-2 model
+     test_1 = count_vector_step_1.transform([query])
+     y_pred = model_1.predict(test_1)
+     if y_pred[0] == 'med':
+         test_2 = count_vector_step_2.transform([query])
+         y_pred = model_2.predict(test_2)
+
+     # Exact matches in the fewer-class dictionary override the model output
+     if query in final_dict:
+         y_pred = final_dict[query]
+
+     return y_pred[0]
+
+ if uploaded_file is not None:
+     # Read the uploaded Excel file
+     excel_data = pd.read_excel(uploaded_file)
+
+     print('Preprocessing Started')
+     test_data = pre_processing(excel_data)
+     x_test = test_data['Claim Description']
+
+     print('Prediction Started')
+     final_result = []
+     for query in x_test:
+         result = predict(step_1_model, step_2_model, fewer_class_dict, query)
+         final_result.append(result)
+     excel_data['predicted_coverage_code'] = final_result
+
+     # Encode the claims and predict the accident source
+     X_bert_enc = model.encode(x_test.values, show_progress_bar=True)
+     accident_source_pred = acc_src_model.predict(X_bert_enc)
+     excel_data['predicted_accident_src'] = accident_source_pred
+
+     # Write the processed data to a new Excel file
+     output_filename = "processed_data.xlsx"
+     excel_data.to_excel(output_filename, index=False)
+
+     # Offer the processed file for download
+     with open(output_filename, "rb") as f:
+         st.download_button(
+             label="Download Processed Data",
+             data=f,
+             file_name=output_filename,
+             mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+         )
+
+     st.write("Done")
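
As a quick sanity check, the pipeline above can be exercised outside Streamlit in a session where app.py's definitions and artifacts have been loaded. A minimal sketch, assuming the pickled files under output/ are present; the sample claim texts are hypothetical:

    import pandas as pd

    # Hypothetical claims mirroring the expected 'Claim Description' column
    sample = pd.DataFrame({'Claim Description': [
        "Claimant's vehicle won't start after the collision on 01/02/2020.",
        "Slip and fall at the store entrance; claimant can't walk without pain.",
    ]})

    # Clean the text, then run the two-step coverage-code prediction per row
    cleaned = pre_processing(sample)
    for query in cleaned['Claim Description']:
        print(query, '->', predict(step_1_model, step_2_model, fewer_class_dict, query))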
output/bert_acc_src.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5dfe6bea7e8b9bee7801f0653dd191b2b030f512ef4b05624e2112011282ca60
+ size 969252
output/count_vector_step_1.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:db058f56e2185939cb35485acc242922e440365b06845dea9558dda5238585e1
+ size 1111318
output/count_vector_step_2.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0eea35f3601237ec71bf083d1bb9a548878f7ebc48649b784c87cd244c445712
+ size 136198
output/fewer_class_dictionary.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9bf4ede12a0d37cef25165d6b32de7a60057129d98c346395ee5ee8cf2220490
+ size 1959
output/lr_basemodel_step_2.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:85c831ca28039a004d57ca37e8b1a94a9b68863361ae9bfa997958e3b87922c7
+ size 2152799
output/lr_step_1.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f9e7eceb902734e3f2050789c4565b3f91be4a2d9477b444b2c58c988e9eb269
+ size 8070547
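
The six artifacts under output/ are tracked with Git LFS, so the diffs above show only the pointer files (spec version, SHA-256 oid, and byte size); the binary models themselves are fetched when the repository is cloned with LFS enabled.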
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ joblib==1.1.0
+ numpy==1.21.5
+ pandas==1.4.4
+ regex==2022.7.9
+ scikit-image==0.19.2
+ scikit-learn==1.0.2
+ scikit-learn-intelex==2021.20221004.171935
+ scipy==1.9.1
+ Scrapy==2.6.2
+ sentence-transformers==2.2.2
+ streamlit==1.28.0
+ tokenizers==0.14.1
+ tqdm==4.64.1
+ transformers==4.34.1
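
These pins are in standard pip requirements format, so the environment can be recreated with `pip install -r requirements.txt` and the app then launched with `streamlit run app.py`.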