ppsingh committed on
Commit
5c08ad9
1 Parent(s): c51fd51

Update appStore/sector.py

Browse files
Files changed (1) hide show
  1. appStore/sector.py +45 -167
appStore/sector.py CHANGED
@@ -1,167 +1,45 @@
1
- # set path
2
- import glob, os, sys;
3
- sys.path.append('../utils')
4
-
5
- #import needed libraries
6
- import seaborn as sns
7
- import matplotlib.pyplot as plt
8
- import numpy as np
9
- import pandas as pd
10
- import streamlit as st
11
- from utils.sector_classifier import load_sectorClassifier, sector_classification
12
- import logging
13
- logger = logging.getLogger(__name__)
14
- from utils.config import get_classifier_params
15
- from utils.preprocessing import paraLengthCheck
16
- from io import BytesIO
17
- import xlsxwriter
18
- import plotly.express as px
19
-
20
-
21
- # Declare all the necessary variables
22
- classifier_identifier = 'sector'
23
- params = get_classifier_params(classifier_identifier)
24
-
25
- def app():
26
-
27
- ### Main app code ###
28
- with st.container():
29
-
30
- if 'key1' in st.session_state:
31
- df = st.session_state.key1
32
- classifier = load_sectorClassifier(classifier_name=params['model_name'])
33
- st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
34
-
35
- if sum(df['Target Label'] == 'TARGET') > 100:
36
- warning_msg = ": This might take sometime, please sit back and relax."
37
- else:
38
- warning_msg = ""
39
-
40
- df = sector_classification(haystack_doc=df,
41
- threshold= params['threshold'])
42
-
43
- st.session_state.key1 = df
44
-
45
- # @st.cache_data
46
- # def to_excel(df,sectorlist):
47
- # len_df = len(df)
48
- # output = BytesIO()
49
- # writer = pd.ExcelWriter(output, engine='xlsxwriter')
50
- # df.to_excel(writer, index=False, sheet_name='Sheet1')
51
- # workbook = writer.book
52
- # worksheet = writer.sheets['Sheet1']
53
- # worksheet.data_validation('S2:S{}'.format(len_df),
54
- # {'validate': 'list',
55
- # 'source': ['No', 'Yes', 'Discard']})
56
- # worksheet.data_validation('X2:X{}'.format(len_df),
57
- # {'validate': 'list',
58
- # 'source': sectorlist + ['Blank']})
59
- # worksheet.data_validation('T2:T{}'.format(len_df),
60
- # {'validate': 'list',
61
- # 'source': sectorlist + ['Blank']})
62
- # worksheet.data_validation('U2:U{}'.format(len_df),
63
- # {'validate': 'list',
64
- # 'source': sectorlist + ['Blank']})
65
- # worksheet.data_validation('V2:V{}'.format(len_df),
66
- # {'validate': 'list',
67
- # 'source': sectorlist + ['Blank']})
68
- # worksheet.data_validation('W2:U{}'.format(len_df),
69
- # {'validate': 'list',
70
- # 'source': sectorlist + ['Blank']})
71
- # writer.save()
72
- # processed_data = output.getvalue()
73
- # return processed_data
74
-
75
- # # st.write(df)
76
- # threshold= params['threshold']
77
- # truth_df = df.drop(['text'],axis=1)
78
- # truth_df = truth_df.astype(float) >= threshold
79
- # truth_df = truth_df.astype(str)
80
- # categories = list(truth_df.columns)
81
-
82
- # placeholder = {}
83
- # for val in categories:
84
- # placeholder[val] = dict(truth_df[val].value_counts())
85
- # count_df = pd.DataFrame.from_dict(placeholder)
86
- # count_df = count_df.T
87
- # count_df = count_df.reset_index()
88
- # # st.write(count_df)
89
- # placeholder = []
90
- # for i in range(len(count_df)):
91
- # placeholder.append([count_df.iloc[i]['index'],count_df['True'][i],'Yes'])
92
- # placeholder.append([count_df.iloc[i]['index'],count_df['False'][i],'No'])
93
- # count_df = pd.DataFrame(placeholder, columns = ['category','count','truth_value'])
94
- # # st.write("Total Paragraphs: {}".format(len(df)))
95
- # fig = px.bar(count_df, x='category', y='count',
96
- # color='truth_value')
97
- # # c1, c2 = st.columns([1,1])
98
- # # with c1:
99
- # st.plotly_chart(fig,use_container_width= True)
100
-
101
- # truth_df['labels'] = truth_df.apply(lambda x: {i if x[i]=='True' else None for i in categories}, axis=1)
102
- # truth_df['labels'] = truth_df.apply(lambda x: list(x['labels'] -{None}),axis=1)
103
- # # st.write(truth_df)
104
- # df = pd.concat([df,truth_df['labels']],axis=1)
105
- # df['Validation'] = 'No'
106
- # df['Sector1'] = 'Blank'
107
- # df['Sector2'] = 'Blank'
108
- # df['Sector3'] = 'Blank'
109
- # df['Sector4'] = 'Blank'
110
- # df['Sector5'] = 'Blank'
111
- # df_xlsx = to_excel(df,categories)
112
- # st.download_button(label='📥 Download Current Result',
113
- # data=df_xlsx ,
114
- # # file_name= 'file_sector.xlsx')
115
- # else:
116
- # st.info("🤔 No document found, please try to upload it at the sidebar!")
117
- # logging.warning("Terminated as no document provided")
118
-
119
- # # Creating truth value dataframe
120
- # if 'key' in st.session_state:
121
- # if st.session_state.key is not None:
122
- # df = st.session_state.key
123
- # st.markdown("###### Select the threshold for classifier ######")
124
- # c4, c5 = st.columns([1,1])
125
-
126
- # with c4:
127
- # threshold = st.slider("Threshold", min_value=0.00, max_value=1.0,
128
- # step=0.01, value=0.5,
129
- # help = "Keep High Value if want refined result, low if dont want to miss anything" )
130
- # sectors =set(df.columns)
131
- # removecols = {'Validation','Sector1','Sector2','Sector3','Sector4',
132
- # 'Sector5','text'}
133
- # sectors = list(sectors - removecols)
134
-
135
- # placeholder = {}
136
- # for val in sectors:
137
- # temp = df[val].astype(float) > threshold
138
- # temp = temp.astype(str)
139
- # placeholder[val] = dict(temp.value_counts())
140
-
141
- # count_df = pd.DataFrame.from_dict(placeholder)
142
- # count_df = count_df.T
143
- # count_df = count_df.reset_index()
144
- # placeholder = []
145
- # for i in range(len(count_df)):
146
- # placeholder.append([count_df.iloc[i]['index'],count_df['False'][i],'False'])
147
- # placeholder.append([count_df.iloc[i]['index'],count_df['True'][i],'True'])
148
-
149
- # count_df = pd.DataFrame(placeholder, columns = ['sector','count','truth_value'])
150
- # fig = px.bar(count_df, x='sector', y='count',
151
- # color='truth_value',
152
- # height=400)
153
- # st.write("")
154
- # st.plotly_chart(fig)
155
-
156
- # df['Validation'] = 'No'
157
- # df['Sector1'] = 'Blank'
158
- # df['Sector2'] = 'Blank'
159
- # df['Sector3'] = 'Blank'
160
- # df['Sector4'] = 'Blank'
161
- # df['Sector5'] = 'Blank'
162
- # df_xlsx = to_excel(df,sectors)
163
- # st.download_button(label='📥 Download Current Result',
164
- # data=df_xlsx ,
165
- # file_name= 'file_sector.xlsx')
166
-
167
-
 
1
+ # set path
2
+ import glob, os, sys;
3
+ sys.path.append('../utils')
4
+
5
+ #import needed libraries
6
+ import seaborn as sns
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import pandas as pd
10
+ import streamlit as st
11
+ from utils.sector_classifier import load_sectorClassifier, sector_classification
12
+ import logging
13
+ logger = logging.getLogger(__name__)
14
+ from utils.config import get_classifier_params
15
+ from utils.preprocessing import paraLengthCheck
16
+ from io import BytesIO
17
+ import xlsxwriter
18
+ import plotly.express as px
19
+
20
+
21
+ # Declare all the necessary variables
22
+ classifier_identifier = 'sector'
23
+ params = get_classifier_params(classifier_identifier)
24
+
25
+ def app():
26
+
27
+ ### Main app code ###
28
+ with st.container():
29
+
30
+ if 'key1' in st.session_state:
31
+ df = st.session_state.key1
32
+ classifier = load_sectorClassifier(classifier_name=params['model_name'])
33
+ st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
34
+
35
+ if sum(df['Target Label'] == 'TARGET') > 100:
36
+ warning_msg = ": This might take sometime, please sit back and relax."
37
+ else:
38
+ warning_msg = ""
39
+
40
+ df = sector_classification(haystack_doc=df,
41
+ threshold= params['threshold'])
42
+
43
+ st.session_state.key1 = df
44
+
45
+