leavoigt committed
Commit
079c7c0
1 Parent(s): 2923360

Upload 3 files

appStore/adapmit.py ADDED
@@ -0,0 +1,174 @@
+ # set path
+ import glob, os, sys
+ sys.path.append('../utils')
+
+ # import needed libraries
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ import streamlit as st
+ from utils.adapmit_classifier import load_adapmitClassifier, adapmit_classification
+ # from utils.keyword_extraction import textrank
+ import logging
+ logger = logging.getLogger(__name__)
+ from utils.config import get_classifier_params
+ from utils.preprocessing import paraLengthCheck
+ from io import BytesIO
+ import xlsxwriter
+ import plotly.express as px
+
+ # Declare all the necessary variables
+ classifier_identifier = 'adapmit'
+ params = get_classifier_params(classifier_identifier)
+
+ @st.cache_data
+ def to_excel(df):
+     len_df = len(df)
+     output = BytesIO()
+     writer = pd.ExcelWriter(output, engine='xlsxwriter')
+     df.to_excel(writer, index=False, sheet_name='Sheet1')
+     workbook = writer.book
+     worksheet = writer.sheets['Sheet1']
+     # data occupies rows 2..len_df+1 (row 1 is the header), so the
+     # validation ranges must end at len_df + 1
+     worksheet.data_validation('E2:E{}'.format(len_df + 1),
+                               {'validate': 'list',
+                                'source': ['No', 'Yes', 'Discard']})
+     worksheet.data_validation('F2:F{}'.format(len_df + 1),
+                               {'validate': 'list',
+                                'source': ['No', 'Yes', 'Discard']})
+     worksheet.data_validation('G2:G{}'.format(len_df + 1),
+                               {'validate': 'list',
+                                'source': ['No', 'Yes', 'Discard']})
+     writer.close()  # writer.save() was deprecated and removed in pandas 2.0
+     processed_data = output.getvalue()
+     return processed_data
+
+ def app():
+
+     ### Main app code ###
+     with st.container():
+
+         if 'key1' in st.session_state:
+             df = st.session_state.key1
+
+             classifier = load_adapmitClassifier(classifier_name=params['model_name'])
+             st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
+             # suffix for a progress notice (composed here but not yet displayed)
+             if sum(df['Target Label'] == 'TARGET') > 100:
+                 warning_msg = ": This might take some time, please sit back and relax."
+             else:
+                 warning_msg = ""
+
+             df = adapmit_classification(haystack_doc=df,
+                                         threshold=params['threshold'])
+
+             st.session_state.key1 = df
+
+             # threshold = params['threshold']
+             # truth_df = df.drop(['text'], axis=1)
+             # truth_df = truth_df.astype(float) >= threshold
+             # truth_df = truth_df.astype(str)
+             # categories = list(truth_df.columns)
+
+             # placeholder = {}
+             # for val in categories:
+             #     placeholder[val] = dict(truth_df[val].value_counts())
+             # count_df = pd.DataFrame.from_dict(placeholder)
+             # count_df = count_df.T
+             # count_df = count_df.reset_index()
+             # # st.write(count_df)
+             # placeholder = []
+             # for i in range(len(count_df)):
+             #     placeholder.append([count_df.iloc[i]['index'], count_df['True'][i], 'Yes'])
+             #     placeholder.append([count_df.iloc[i]['index'], count_df['False'][i], 'No'])
+             # count_df = pd.DataFrame(placeholder, columns=['category', 'count', 'truth_value'])
+             # # st.write("Total Paragraphs: {}".format(len(df)))
+             # fig = px.bar(count_df, y='category', x='count',
+             #              color='truth_value', orientation='h', height=200)
+             # c1, c2 = st.columns([1, 1])
+             # with c1:
+             #     st.plotly_chart(fig, use_container_width=True)
+
+             # truth_df['labels'] = truth_df.apply(lambda x: {i if x[i] == 'True' else None for i in categories}, axis=1)
+             # truth_df['labels'] = truth_df.apply(lambda x: list(x['labels'] - {None}), axis=1)
+             # # st.write(truth_df)
+             # df = pd.concat([df, truth_df['labels']], axis=1)
+             # st.markdown("###### Top few 'Mitigation' related paragraph/text ######")
+             # df = df.sort_values(by=['Mitigation'], ascending=False)
+             # for i in range(3):
+             #     if df.iloc[i]['Mitigation'] >= 0.50:
+             #         st.write('**Result {}** (Relevancy Score: {:.2f})'.format(i + 1, df.iloc[i]['Mitigation']))
+             #         st.write("\t Text: \t{}".format(df.iloc[i]['text'].replace("\n", " ")))
+
+             # st.markdown("###### Top few 'Adaptation' related paragraph/text ######")
+             # df = df.sort_values(by=['Adaptation'], ascending=False)
+             # for i in range(3):
+             #     if df.iloc[i]['Adaptation'] > 0.5:
+             #         st.write('**Result {}** (Relevancy Score: {:.2f})'.format(i + 1, df.iloc[i]['Adaptation']))
+             #         st.write("\t Text: \t{}".format(df.iloc[i]['text'].replace("\n", " ")))
+             # # st.write(df[['text','labels']])
+             # df['Validation'] = 'No'
+             # df['Val-Mitigation'] = 'No'
+             # df['Val-Adaptation'] = 'No'
+             # df_xlsx = to_excel(df)
+             # st.download_button(label='📥 Download Current Result',
+             #                    data=df_xlsx,
+             #                    file_name='file_adaptation-mitigation.xlsx')
+             # # st.session_state.key4 =
+
+             # # category = set(df.columns)
+             # # removecols = {'Validation', 'Val-Adaptation', 'Val-Mitigation', 'text'}
+             # # category = list(category - removecols)
+
+         # else:
+         #     st.info("🤔 No document found, please try to upload it at the sidebar!")
+         #     logging.warning("Terminated as no document provided")
+
+         # # Creating truth value dataframe
+         # if 'key4' in st.session_state:
+         #     if st.session_state.key4 is not None:
+         #         df = st.session_state.key4
+         #         st.markdown("###### Select the threshold for classifier ######")
+         #         c4, c5 = st.columns([1, 1])
+
+         #         with c4:
+         #             threshold = st.slider("Threshold", min_value=0.00, max_value=1.0,
+         #                                   step=0.01, value=0.5,
+         #                                   help="Keep a high value for a refined result, a low value to avoid missing anything")
+         #         category = set(df.columns)
+         #         removecols = {'Validation', 'Val-Adaptation', 'Val-Mitigation', 'text'}
+         #         category = list(category - removecols)
+
+         #         placeholder = {}
+         #         for val in category:
+         #             temp = df[val].astype(float) > threshold
+         #             temp = temp.astype(str)
+         #             placeholder[val] = dict(temp.value_counts())
+
+         #         count_df = pd.DataFrame.from_dict(placeholder)
+         #         count_df = count_df.T
+         #         count_df = count_df.reset_index()
+         #         placeholder = []
+         #         for i in range(len(count_df)):
+         #             placeholder.append([count_df.iloc[i]['index'], count_df['False'][i], 'False'])
+         #             placeholder.append([count_df.iloc[i]['index'], count_df['True'][i], 'True'])
+
+         #         count_df = pd.DataFrame(placeholder, columns=['category', 'count', 'truth_value'])
+         #         fig = px.bar(count_df, x='category', y='count',
+         #                      color='truth_value',
+         #                      height=400)
+         #         st.write("")
+         #         st.plotly_chart(fig)
+
+         #         df['Validation'] = 'No'
+         #         df['Val-Mitigation'] = 'No'
+         #         df['Val-Adaptation'] = 'No'
+         #         df_xlsx = to_excel(df)
+         #         st.download_button(label='📥 Download Current Result',
+         #                            data=df_xlsx,
+         #                            file_name='file_adaptation-mitigation.xlsx')
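For reference, the commented-out download path above pairs to_excel with st.download_button. A minimal sketch of that pairing, assuming df is the classified DataFrame from st.session_state.key1; note the E–G dropdown ranges only line up if the Validation / Val-Mitigation / Val-Adaptation columns actually land in spreadsheet columns E–G of the exported sheet:

    # df is assumed to be the classified DataFrame held in st.session_state.key1
    df['Validation'] = 'No'
    df['Val-Mitigation'] = 'No'
    df['Val-Adaptation'] = 'No'
    df_xlsx = to_excel(df)  # bytes of an .xlsx with dropdowns for manual review
    st.download_button(label='📥 Download Current Result',
                       data=df_xlsx,
                       file_name='file_adaptation-mitigation.xlsx')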
appStore/doc_processing (1).py ADDED
@@ -0,0 +1,77 @@
+ # set path
+ import glob, os, sys
+ sys.path.append('../utils')
+ from typing import List, Tuple
+ from typing_extensions import Literal
+ from haystack.schema import Document
+ from utils.config import get_classifier_params
+ from utils.preprocessing import processingpipeline, paraLengthCheck
+ import streamlit as st
+ import logging
+ import pandas as pd
+ params = get_classifier_params("preprocessing")
+
+ @st.cache_data
+ def runPreprocessingPipeline(file_name: str, file_path: str,
+                              split_by: Literal["sentence", "word"] = 'sentence',
+                              split_length: int = 2, split_respect_sentence_boundary: bool = False,
+                              split_overlap: int = 0, remove_punc: bool = False) -> List[Document]:
+     """
+     Creates and runs the preprocessing pipeline; the pipeline params are
+     fetched from paramconfig.
+
+     Params
+     ------------
+     file_name: filename; in a Streamlit application use
+         st.session_state['filename']
+     file_path: filepath; in a Streamlit application use st.session_state['filepath']
+     split_by: document splitting strategy, either 'word' or 'sentence'
+     split_length: when synthetically creating paragraphs from the document,
+         defines the length of a paragraph.
+     split_respect_sentence_boundary: used with the 'word' splitting strategy.
+     split_overlap: number of words or sentences that overlap between
+         consecutive paragraphs. One sentence or a few words often only make
+         sense when read together with their neighbours, hence the overlap.
+     remove_punc: whether to remove all punctuation, including ',' and '.'
+
+     Return
+     --------------
+     The preprocessing pipeline returns a dictionary with four objects. For
+     the Haystack implementation of SDG classification we need the list of
+     Haystack Documents, available under key = 'documents' in the output.
+     """
+
+     processing_pipeline = processingpipeline()
+
+     output_pre = processing_pipeline.run(file_paths=file_path,
+                                          params={"FileConverter": {"file_path": file_path,
+                                                                    "file_name": file_name},
+                                                  "UdfPreProcessor": {"remove_punc": remove_punc,
+                                                                      "split_by": split_by,
+                                                                      "split_length": split_length,
+                                                                      "split_overlap": split_overlap,
+                                                                      "split_respect_sentence_boundary": split_respect_sentence_boundary}})
+
+     return output_pre
+
+
+ def app():
+     with st.container():
+         if 'filepath' in st.session_state:
+             file_name = st.session_state['filename']
+             file_path = st.session_state['filepath']
+
+             all_documents = runPreprocessingPipeline(file_name=file_name,
+                                                      file_path=file_path, split_by=params['split_by'],
+                                                      split_length=params['split_length'],
+                                                      split_respect_sentence_boundary=params['split_respect_sentence_boundary'],
+                                                      split_overlap=params['split_overlap'], remove_punc=params['remove_punc'])
+             paralist = paraLengthCheck(all_documents['documents'], 100)
+             df = pd.DataFrame(paralist, columns=['text', 'page'])
+             # saving the dataframe to session state
+             st.session_state['key0'] = df
+
+         else:
+             st.info("🤔 No document found, please try to upload it at the sidebar!")
+             logging.warning("Terminated as no document provided")
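A minimal sketch of driving this pipeline outside the app() wrapper, mirroring the session-state flow above; the file name and path here are hypothetical stand-ins:

    output = runPreprocessingPipeline(file_name='report.pdf',        # hypothetical
                                      file_path='/tmp/report.pdf',   # hypothetical
                                      split_by='sentence', split_length=2,
                                      split_respect_sentence_boundary=False,
                                      split_overlap=0, remove_punc=False)
    docs = output['documents']             # List[haystack.schema.Document]
    paralist = paraLengthCheck(docs, 100)  # same length cap (100) as in app()
    df = pd.DataFrame(paralist, columns=['text', 'page'])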
appStore/indicator.py ADDED
@@ -0,0 +1,166 @@
+ # set path
+ import glob, os, sys
+ sys.path.append('../utils')
+
+ # import needed libraries
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ import streamlit as st
+ from utils.indicator_classifier import load_indicatorClassifier, indicator_classification
+ import logging
+ logger = logging.getLogger(__name__)
+ from utils.config import get_classifier_params
+ from utils.preprocessing import paraLengthCheck
+ from io import BytesIO
+ import xlsxwriter
+ import plotly.express as px
+
+
+ # Declare all the necessary variables
+ classifier_identifier = 'indicator'
+ params = get_classifier_params(classifier_identifier)
+
+ @st.cache_data
+ def to_excel(df, sectorlist):
+     len_df = len(df)
+     output = BytesIO()
+     writer = pd.ExcelWriter(output, engine='xlsxwriter')
+     df.to_excel(writer, index=False, sheet_name='Sheet1')
+     workbook = writer.book
+     worksheet = writer.sheets['Sheet1']
+     # data occupies rows 2..len_df+1 (row 1 is the header), so the
+     # validation ranges must end at len_df + 1
+     worksheet.data_validation('S2:S{}'.format(len_df + 1),
+                               {'validate': 'list',
+                                'source': ['No', 'Yes', 'Discard']})
+     worksheet.data_validation('X2:X{}'.format(len_df + 1),
+                               {'validate': 'list',
+                                'source': sectorlist + ['Blank']})
+     worksheet.data_validation('T2:T{}'.format(len_df + 1),
+                               {'validate': 'list',
+                                'source': sectorlist + ['Blank']})
+     worksheet.data_validation('U2:U{}'.format(len_df + 1),
+                               {'validate': 'list',
+                                'source': sectorlist + ['Blank']})
+     worksheet.data_validation('V2:V{}'.format(len_df + 1),
+                               {'validate': 'list',
+                                'source': sectorlist + ['Blank']})
+     # was 'W2:U{}', an inverted range mixing columns W and U; column W is intended
+     worksheet.data_validation('W2:W{}'.format(len_df + 1),
+                               {'validate': 'list',
+                                'source': sectorlist + ['Blank']})
+     writer.close()  # writer.save() was deprecated and removed in pandas 2.0
+     processed_data = output.getvalue()
+     return processed_data
+
+ def app():
+
+     ### Main app code ###
+     with st.container():
+
+         if 'key1' in st.session_state:
+             df = st.session_state.key1
+             classifier = load_indicatorClassifier(classifier_name=params['model_name'])
+             st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
+
+             # suffix for a progress notice (composed here but not yet displayed)
+             if sum(df['Target Label'] == 'TARGET') > 100:
+                 warning_msg = ": This might take some time, please sit back and relax."
+             else:
+                 warning_msg = ""
+
+             df = indicator_classification(haystack_doc=df,
+                                           threshold=params['threshold'])
+
+             st.session_state.key1 = df
+
+             # # st.write(df)
+             # threshold = params['threshold']
+             # truth_df = df.drop(['text'], axis=1)
+             # truth_df = truth_df.astype(float) >= threshold
+             # truth_df = truth_df.astype(str)
+             # categories = list(truth_df.columns)
+
+             # placeholder = {}
+             # for val in categories:
+             #     placeholder[val] = dict(truth_df[val].value_counts())
+             # count_df = pd.DataFrame.from_dict(placeholder)
+             # count_df = count_df.T
+             # count_df = count_df.reset_index()
+             # # st.write(count_df)
+             # placeholder = []
+             # for i in range(len(count_df)):
+             #     placeholder.append([count_df.iloc[i]['index'], count_df['True'][i], 'Yes'])
+             #     placeholder.append([count_df.iloc[i]['index'], count_df['False'][i], 'No'])
+             # count_df = pd.DataFrame(placeholder, columns=['category', 'count', 'truth_value'])
+             # # st.write("Total Paragraphs: {}".format(len(df)))
+             # fig = px.bar(count_df, x='category', y='count',
+             #              color='truth_value')
+             # # c1, c2 = st.columns([1, 1])
+             # # with c1:
+             # st.plotly_chart(fig, use_container_width=True)
+
+             # truth_df['labels'] = truth_df.apply(lambda x: {i if x[i] == 'True' else None for i in categories}, axis=1)
+             # truth_df['labels'] = truth_df.apply(lambda x: list(x['labels'] - {None}), axis=1)
+             # # st.write(truth_df)
+             # df = pd.concat([df, truth_df['labels']], axis=1)
+             # df['Validation'] = 'No'
+             # df['Sector1'] = 'Blank'
+             # df['Sector2'] = 'Blank'
+             # df['Sector3'] = 'Blank'
+             # df['Sector4'] = 'Blank'
+             # df['Sector5'] = 'Blank'
+             # df_xlsx = to_excel(df, categories)
+             # st.download_button(label='📥 Download Current Result',
+             #                    data=df_xlsx,
+             #                    # file_name='file_sector.xlsx')
+         # else:
+         #     st.info("🤔 No document found, please try to upload it at the sidebar!")
+         #     logging.warning("Terminated as no document provided")
+
+         # # Creating truth value dataframe
+         # if 'key' in st.session_state:
+         #     if st.session_state.key is not None:
+         #         df = st.session_state.key
+         #         st.markdown("###### Select the threshold for classifier ######")
+         #         c4, c5 = st.columns([1, 1])
+
+         #         with c4:
+         #             threshold = st.slider("Threshold", min_value=0.00, max_value=1.0,
+         #                                   step=0.01, value=0.5,
+         #                                   help="Keep a high value for a refined result, a low value to avoid missing anything")
+         #         sectors = set(df.columns)
+         #         removecols = {'Validation', 'Sector1', 'Sector2', 'Sector3', 'Sector4',
+         #                       'Sector5', 'text'}
+         #         sectors = list(sectors - removecols)
+
+         #         placeholder = {}
+         #         for val in sectors:
+         #             temp = df[val].astype(float) > threshold
+         #             temp = temp.astype(str)
+         #             placeholder[val] = dict(temp.value_counts())
+
+         #         count_df = pd.DataFrame.from_dict(placeholder)
+         #         count_df = count_df.T
+         #         count_df = count_df.reset_index()
+         #         placeholder = []
+         #         for i in range(len(count_df)):
+         #             placeholder.append([count_df.iloc[i]['index'], count_df['False'][i], 'False'])
+         #             placeholder.append([count_df.iloc[i]['index'], count_df['True'][i], 'True'])
+
+         #         count_df = pd.DataFrame(placeholder, columns=['sector', 'count', 'truth_value'])
+         #         fig = px.bar(count_df, x='sector', y='count',
+         #                      color='truth_value',
+         #                      height=400)
+         #         st.write("")
+         #         st.plotly_chart(fig)
+
+         #         df['Validation'] = 'No'
+         #         df['Sector1'] = 'Blank'
+         #         df['Sector2'] = 'Blank'
+         #         df['Sector3'] = 'Blank'
+         #         df['Sector4'] = 'Blank'
+         #         df['Sector5'] = 'Blank'
+         #         df_xlsx = to_excel(df, sectors)
+         #         st.download_button(label='📥 Download Current Result',
+         #                            data=df_xlsx,
+         #                            file_name='file_sector.xlsx')
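The hand-typed A1-style ranges in to_excel are easy to get wrong, as the original 'W2:U{}' shows. A sketch of a less error-prone alternative using xlsxwriter's range helper, assuming the same layout with review columns S..X:

    from xlsxwriter.utility import xl_range

    # xl_range takes 0-indexed rows/cols: row 1 is spreadsheet row 2 (row 0 = header)
    worksheet.data_validation(xl_range(1, 18, len_df, 18),   # column S, e.g. 'S2:S11'
                              {'validate': 'list',
                               'source': ['No', 'Yes', 'Discard']})
    for col in range(19, 24):                                # columns T..X
        worksheet.data_validation(xl_range(1, col, len_df, col),
                                  {'validate': 'list',
                                   'source': sectorlist + ['Blank']})

Computing the ranges from column indices keeps the last row consistent across all six columns and removes the chance of mismatched column letters.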