target code refactor
Browse files
- appStore/adapmit.py +21 -23
- appStore/ghg.py +27 -28
- appStore/netzero.py +29 -29
- appStore/sector.py +45 -46
- utils/adapmit_classifier.py +7 -9
- utils/ghg_classifier.py +6 -5
- utils/netzero_classifier.py +5 -5
- utils/sector_classifier.py +7 -6
appStore/adapmit.py
CHANGED
@@ -22,32 +22,10 @@ import plotly.express as px
|
|
22 |
classifier_identifier = 'adapmit'
|
23 |
params = get_classifier_params(classifier_identifier)
|
24 |
|
25 |
-
@st.cache_data
|
26 |
-
def to_excel(df):
|
27 |
-
len_df = len(df)
|
28 |
-
output = BytesIO()
|
29 |
-
writer = pd.ExcelWriter(output, engine='xlsxwriter')
|
30 |
-
df.to_excel(writer, index=False, sheet_name='Sheet1')
|
31 |
-
workbook = writer.book
|
32 |
-
worksheet = writer.sheets['Sheet1']
|
33 |
-
worksheet.data_validation('E2:E{}'.format(len_df),
|
34 |
-
{'validate': 'list',
|
35 |
-
'source': ['No', 'Yes', 'Discard']})
|
36 |
-
worksheet.data_validation('F2:F{}'.format(len_df),
|
37 |
-
{'validate': 'list',
|
38 |
-
'source': ['No', 'Yes', 'Discard']})
|
39 |
-
worksheet.data_validation('G2:G{}'.format(len_df),
|
40 |
-
{'validate': 'list',
|
41 |
-
'source': ['No', 'Yes', 'Discard']})
|
42 |
-
writer.save()
|
43 |
-
processed_data = output.getvalue()
|
44 |
-
return processed_data
|
45 |
|
46 |
def app():
|
47 |
-
|
48 |
### Main app code ###
|
49 |
-
with st.container():
|
50 |
-
|
51 |
if 'key1' in st.session_state:
|
52 |
df = st.session_state.key1
|
53 |
|
@@ -63,6 +41,26 @@ def app():
|
|
63 |
|
64 |
st.session_state.key1 = df
|
65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
|
67 |
|
68 |
|
|
|
22 |
classifier_identifier = 'adapmit'
|
23 |
params = get_classifier_params(classifier_identifier)
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
def app():
|
|
|
27 |
### Main app code ###
|
28 |
+
with st.container():
|
|
|
29 |
if 'key1' in st.session_state:
|
30 |
df = st.session_state.key1
|
31 |
|
|
|
41 |
|
42 |
st.session_state.key1 = df
|
43 |
|
44 |
+
# @st.cache_data
|
45 |
+
# def to_excel(df):
|
46 |
+
# len_df = len(df)
|
47 |
+
# output = BytesIO()
|
48 |
+
# writer = pd.ExcelWriter(output, engine='xlsxwriter')
|
49 |
+
# df.to_excel(writer, index=False, sheet_name='Sheet1')
|
50 |
+
# workbook = writer.book
|
51 |
+
# worksheet = writer.sheets['Sheet1']
|
52 |
+
# worksheet.data_validation('E2:E{}'.format(len_df),
|
53 |
+
# {'validate': 'list',
|
54 |
+
# 'source': ['No', 'Yes', 'Discard']})
|
55 |
+
# worksheet.data_validation('F2:F{}'.format(len_df),
|
56 |
+
# {'validate': 'list',
|
57 |
+
# 'source': ['No', 'Yes', 'Discard']})
|
58 |
+
# worksheet.data_validation('G2:G{}'.format(len_df),
|
59 |
+
# {'validate': 'list',
|
60 |
+
# 'source': ['No', 'Yes', 'Discard']})
|
61 |
+
# writer.save()
|
62 |
+
# processed_data = output.getvalue()
|
63 |
+
# return processed_data
|
64 |
|
65 |
|
66 |
|
appStore/ghg.py
CHANGED
@@ -29,41 +29,40 @@ _lab_dict = {
|
|
29 |
}
|
30 |
|
31 |
|
32 |
-
@st.cache_data
|
33 |
-
def to_excel(df):
|
34 |
-
len_df = len(df)
|
35 |
-
output = BytesIO()
|
36 |
-
writer = pd.ExcelWriter(output, engine='xlsxwriter')
|
37 |
-
df.to_excel(writer, index=False, sheet_name='Sheet1')
|
38 |
-
workbook = writer.book
|
39 |
-
worksheet = writer.sheets['Sheet1']
|
40 |
-
worksheet.data_validation('E2:E{}'.format(len_df),
|
41 |
-
{'validate': 'list',
|
42 |
-
'source': ['No', 'Yes', 'Discard']})
|
43 |
-
writer.save()
|
44 |
-
processed_data = output.getvalue()
|
45 |
-
return processed_data
|
46 |
-
|
47 |
def app():
|
48 |
### Main app code ###
|
49 |
with st.container():
|
50 |
-
|
51 |
-
|
52 |
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
|
66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
# def netzero_display():
|
68 |
# if 'key1' in st.session_state:
|
69 |
# df = st.session_state.key2
|
|
|
29 |
}
|
30 |
|
31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
def app():
|
33 |
### Main app code ###
|
34 |
with st.container():
|
35 |
+
if 'key1' in st.session_state:
|
36 |
+
df = st.session_state.key1
|
37 |
|
38 |
+
# Load the classifier model
|
39 |
+
classifier = load_ghgClassifier(classifier_name=params['model_name'])
|
40 |
+
st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
|
41 |
|
42 |
+
if sum(df['Target Label'] == 'TARGET') > 100:
|
43 |
+
warning_msg = ": This might take sometime, please sit back and relax."
|
44 |
+
else:
|
45 |
+
warning_msg = ""
|
46 |
+
|
47 |
+
df = ghg_classification(haystack_doc=df,
|
48 |
+
threshold= params['threshold'])
|
49 |
+
st.session_state.key1 = df
|
50 |
|
51 |
|
52 |
+
# @st.cache_data
|
53 |
+
# def to_excel(df):
|
54 |
+
# len_df = len(df)
|
55 |
+
# output = BytesIO()
|
56 |
+
# writer = pd.ExcelWriter(output, engine='xlsxwriter')
|
57 |
+
# df.to_excel(writer, index=False, sheet_name='Sheet1')
|
58 |
+
# workbook = writer.book
|
59 |
+
# worksheet = writer.sheets['Sheet1']
|
60 |
+
# worksheet.data_validation('E2:E{}'.format(len_df),
|
61 |
+
# {'validate': 'list',
|
62 |
+
# 'source': ['No', 'Yes', 'Discard']})
|
63 |
+
# writer.save()
|
64 |
+
# processed_data = output.getvalue()
|
65 |
+
# return processed_data
|
66 |
# def netzero_display():
|
67 |
# if 'key1' in st.session_state:
|
68 |
# df = st.session_state.key2
|
appStore/netzero.py
CHANGED
@@ -28,41 +28,41 @@ _lab_dict = {
|
|
28 |
'NETZERO':'NETZERO TARGET',
|
29 |
}
|
30 |
|
31 |
-
|
32 |
-
@st.cache_data
|
33 |
-
def to_excel(df):
|
34 |
-
len_df = len(df)
|
35 |
-
output = BytesIO()
|
36 |
-
writer = pd.ExcelWriter(output, engine='xlsxwriter')
|
37 |
-
df.to_excel(writer, index=False, sheet_name='Sheet1')
|
38 |
-
workbook = writer.book
|
39 |
-
worksheet = writer.sheets['Sheet1']
|
40 |
-
worksheet.data_validation('E2:E{}'.format(len_df),
|
41 |
-
{'validate': 'list',
|
42 |
-
'source': ['No', 'Yes', 'Discard']})
|
43 |
-
writer.save()
|
44 |
-
processed_data = output.getvalue()
|
45 |
-
return processed_data
|
46 |
-
|
47 |
def app():
|
48 |
### Main app code ###
|
49 |
with st.container():
|
50 |
-
|
51 |
-
|
|
|
|
|
|
|
|
|
52 |
|
53 |
-
|
54 |
-
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
56 |
|
57 |
-
|
58 |
-
warning_msg = ": This might take sometime, please sit back and relax."
|
59 |
-
else:
|
60 |
-
warning_msg = ""
|
61 |
-
|
62 |
-
df = netzero_classification(haystack_doc=df,
|
63 |
-
threshold= params['threshold'])
|
64 |
-
st.session_state.key1 = df
|
65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
|
67 |
# def netzero_display():
|
68 |
# if 'key1' in st.session_state:
|
|
|
28 |
'NETZERO':'NETZERO TARGET',
|
29 |
}
|
30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
def app():
|
32 |
### Main app code ###
|
33 |
with st.container():
|
34 |
+
if 'key1' in st.session_state:
|
35 |
+
df = st.session_state.key1
|
36 |
+
|
37 |
+
# Load the classifier model
|
38 |
+
classifier = load_netzeroClassifier(classifier_name=params['model_name'])
|
39 |
+
st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
|
40 |
|
41 |
+
if sum(df['Target Label'] == 'TARGET') > 100:
|
42 |
+
warning_msg = ": This might take sometime, please sit back and relax."
|
43 |
+
else:
|
44 |
+
warning_msg = ""
|
45 |
+
|
46 |
+
df = netzero_classification(haystack_doc=df,
|
47 |
+
threshold= params['threshold'])
|
48 |
+
st.session_state.key1 = df
|
49 |
|
50 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
|
52 |
+
# @st.cache_data
|
53 |
+
# def to_excel(df):
|
54 |
+
# len_df = len(df)
|
55 |
+
# output = BytesIO()
|
56 |
+
# writer = pd.ExcelWriter(output, engine='xlsxwriter')
|
57 |
+
# df.to_excel(writer, index=False, sheet_name='Sheet1')
|
58 |
+
# workbook = writer.book
|
59 |
+
# worksheet = writer.sheets['Sheet1']
|
60 |
+
# worksheet.data_validation('E2:E{}'.format(len_df),
|
61 |
+
# {'validate': 'list',
|
62 |
+
# 'source': ['No', 'Yes', 'Discard']})
|
63 |
+
# writer.save()
|
64 |
+
# processed_data = output.getvalue()
|
65 |
+
# return processed_data
|
66 |
|
67 |
# def netzero_display():
|
68 |
# if 'key1' in st.session_state:
|
appStore/sector.py
CHANGED
@@ -22,56 +22,55 @@ import plotly.express as px
|
|
22 |
classifier_identifier = 'sector'
|
23 |
params = get_classifier_params(classifier_identifier)
|
24 |
|
25 |
-
@st.cache_data
|
26 |
-
def to_excel(df,sectorlist):
|
27 |
-
len_df = len(df)
|
28 |
-
output = BytesIO()
|
29 |
-
writer = pd.ExcelWriter(output, engine='xlsxwriter')
|
30 |
-
df.to_excel(writer, index=False, sheet_name='Sheet1')
|
31 |
-
workbook = writer.book
|
32 |
-
worksheet = writer.sheets['Sheet1']
|
33 |
-
worksheet.data_validation('S2:S{}'.format(len_df),
|
34 |
-
{'validate': 'list',
|
35 |
-
'source': ['No', 'Yes', 'Discard']})
|
36 |
-
worksheet.data_validation('X2:X{}'.format(len_df),
|
37 |
-
{'validate': 'list',
|
38 |
-
'source': sectorlist + ['Blank']})
|
39 |
-
worksheet.data_validation('T2:T{}'.format(len_df),
|
40 |
-
{'validate': 'list',
|
41 |
-
'source': sectorlist + ['Blank']})
|
42 |
-
worksheet.data_validation('U2:U{}'.format(len_df),
|
43 |
-
{'validate': 'list',
|
44 |
-
'source': sectorlist + ['Blank']})
|
45 |
-
worksheet.data_validation('V2:V{}'.format(len_df),
|
46 |
-
{'validate': 'list',
|
47 |
-
'source': sectorlist + ['Blank']})
|
48 |
-
worksheet.data_validation('W2:U{}'.format(len_df),
|
49 |
-
{'validate': 'list',
|
50 |
-
'source': sectorlist + ['Blank']})
|
51 |
-
writer.save()
|
52 |
-
processed_data = output.getvalue()
|
53 |
-
return processed_data
|
54 |
-
|
55 |
def app():
|
56 |
|
57 |
### Main app code ###
|
58 |
with st.container():
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
|
76 |
# # st.write(df)
|
77 |
# threshold= params['threshold']
|
|
|
22 |
classifier_identifier = 'sector'
|
23 |
params = get_classifier_params(classifier_identifier)
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
def app():
|
26 |
|
27 |
### Main app code ###
|
28 |
with st.container():
|
29 |
+
|
30 |
+
if 'key1' in st.session_state:
|
31 |
+
df = st.session_state.key1
|
32 |
+
classifier = load_sectorClassifier(classifier_name=params['model_name'])
|
33 |
+
st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
|
34 |
+
|
35 |
+
if sum(df['Target Label'] == 'TARGET') > 100:
|
36 |
+
warning_msg = ": This might take sometime, please sit back and relax."
|
37 |
+
else:
|
38 |
+
warning_msg = ""
|
39 |
+
|
40 |
+
df = sector_classification(haystack_doc=df,
|
41 |
+
threshold= params['threshold'])
|
42 |
+
|
43 |
+
st.session_state.key1 = df
|
44 |
+
|
45 |
+
# @st.cache_data
|
46 |
+
# def to_excel(df,sectorlist):
|
47 |
+
# len_df = len(df)
|
48 |
+
# output = BytesIO()
|
49 |
+
# writer = pd.ExcelWriter(output, engine='xlsxwriter')
|
50 |
+
# df.to_excel(writer, index=False, sheet_name='Sheet1')
|
51 |
+
# workbook = writer.book
|
52 |
+
# worksheet = writer.sheets['Sheet1']
|
53 |
+
# worksheet.data_validation('S2:S{}'.format(len_df),
|
54 |
+
# {'validate': 'list',
|
55 |
+
# 'source': ['No', 'Yes', 'Discard']})
|
56 |
+
# worksheet.data_validation('X2:X{}'.format(len_df),
|
57 |
+
# {'validate': 'list',
|
58 |
+
# 'source': sectorlist + ['Blank']})
|
59 |
+
# worksheet.data_validation('T2:T{}'.format(len_df),
|
60 |
+
# {'validate': 'list',
|
61 |
+
# 'source': sectorlist + ['Blank']})
|
62 |
+
# worksheet.data_validation('U2:U{}'.format(len_df),
|
63 |
+
# {'validate': 'list',
|
64 |
+
# 'source': sectorlist + ['Blank']})
|
65 |
+
# worksheet.data_validation('V2:V{}'.format(len_df),
|
66 |
+
# {'validate': 'list',
|
67 |
+
# 'source': sectorlist + ['Blank']})
|
68 |
+
# worksheet.data_validation('W2:U{}'.format(len_df),
|
69 |
+
# {'validate': 'list',
|
70 |
+
# 'source': sectorlist + ['Blank']})
|
71 |
+
# writer.save()
|
72 |
+
# processed_data = output.getvalue()
|
73 |
+
# return processed_data
|
74 |
|
75 |
# # st.write(df)
|
76 |
# threshold= params['threshold']
|
utils/adapmit_classifier.py
CHANGED
@@ -35,9 +35,7 @@ def load_adapmitClassifier(config_file:str = None, classifier_name:str = None):
|
|
35 |
doc_classifier = pipeline("text-classification",
|
36 |
model=classifier_name,
|
37 |
return_all_scores=True,
|
38 |
-
function_to_apply= "sigmoid")
|
39 |
-
|
40 |
-
|
41 |
return doc_classifier
|
42 |
|
43 |
|
@@ -61,14 +59,10 @@ def adapmit_classification(haystack_doc:pd.DataFrame,
|
|
61 |
In case of streamlit avoid passing the model directly.
|
62 |
Returns
|
63 |
----------
|
64 |
-
df: Dataframe
|
65 |
-
x: Series object with the unique SDG covered in the document uploaded and
|
66 |
-
the number of times it is covered/discussed/count_of_paragraphs.
|
67 |
"""
|
68 |
logging.info("Working on Adaptation-Mitigation Identification")
|
69 |
haystack_doc['Adapt-Mitig Label'] = 'NA'
|
70 |
-
# df1 = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
|
71 |
-
# df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
|
72 |
|
73 |
if not classifier_model:
|
74 |
classifier_model = st.session_state['adapmit_classifier']
|
@@ -86,14 +80,18 @@ def adapmit_classification(haystack_doc:pd.DataFrame,
|
|
86 |
labels_ = [{**list_[l]} for l in range(len(predictions))]
|
87 |
truth_df = DataFrame.from_dict(labels_)
|
88 |
truth_df = truth_df.round(2)
|
|
|
89 |
truth_df = truth_df.astype(float) >= threshold
|
90 |
truth_df = truth_df.astype(str)
|
|
|
91 |
categories = list(truth_df.columns)
|
|
|
|
|
92 |
truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x: {i if x[i]=='True'
|
93 |
else None for i in categories}, axis=1)
|
94 |
truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x:
|
95 |
list(x['Adapt-Mitig Label'] -{None}),axis=1)
|
|
|
96 |
haystack_doc['Adapt-Mitig Label'] = list(truth_df['Adapt-Mitig Label'])
|
97 |
-
#df = pd.concat([df,df1])
|
98 |
|
99 |
return haystack_doc
|
|
|
35 |
doc_classifier = pipeline("text-classification",
|
36 |
model=classifier_name,
|
37 |
return_all_scores=True,
|
38 |
+
function_to_apply= "sigmoid")
|
|
|
|
|
39 |
return doc_classifier
|
40 |
|
41 |
|
|
|
59 |
In case of streamlit avoid passing the model directly.
|
60 |
Returns
|
61 |
----------
|
62 |
+
df: Dataframe
|
|
|
|
|
63 |
"""
|
64 |
logging.info("Working on Adaptation-Mitigation Identification")
|
65 |
haystack_doc['Adapt-Mitig Label'] = 'NA'
|
|
|
|
|
66 |
|
67 |
if not classifier_model:
|
68 |
classifier_model = st.session_state['adapmit_classifier']
|
|
|
80 |
labels_ = [{**list_[l]} for l in range(len(predictions))]
|
81 |
truth_df = DataFrame.from_dict(labels_)
|
82 |
truth_df = truth_df.round(2)
|
83 |
+
# convert the labels score into boolean based on threshold value
|
84 |
truth_df = truth_df.astype(float) >= threshold
|
85 |
truth_df = truth_df.astype(str)
|
86 |
+
# list of labels
|
87 |
categories = list(truth_df.columns)
|
88 |
+
|
89 |
+
# collect the labels whose score passed the threshold; None is only a placeholder for the comprehension's required else-branch
|
90 |
truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x: {i if x[i]=='True'
|
91 |
else None for i in categories}, axis=1)
|
92 |
truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x:
|
93 |
list(x['Adapt-Mitig Label'] -{None}),axis=1)
|
94 |
+
# adding Adaptation-Mitigation label
|
95 |
haystack_doc['Adapt-Mitig Label'] = list(truth_df['Adapt-Mitig Label'])
|
|
|
96 |
|
97 |
return haystack_doc
|
utils/ghg_classifier.py
CHANGED
@@ -55,8 +55,8 @@ def ghg_classification(haystack_doc:pd.DataFrame,
|
|
55 |
)->Tuple[DataFrame,Series]:
|
56 |
"""
|
57 |
Text-Classification on the list of texts provided. Classifier provides the
|
58 |
-
most appropriate label for each text.
|
59 |
-
|
60 |
Params
|
61 |
---------
|
62 |
haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
|
@@ -68,13 +68,12 @@ def ghg_classification(haystack_doc:pd.DataFrame,
|
|
68 |
In case of streamlit avoid passing the model directly.
|
69 |
Returns
|
70 |
----------
|
71 |
-
df: Dataframe
|
72 |
-
x: Series object with the unique SDG covered in the document uploaded and
|
73 |
-
the number of times it is covered/discussed/count_of_paragraphs.
|
74 |
"""
|
75 |
logging.info("Working on GHG Extraction")
|
76 |
haystack_doc['GHG Label'] = 'NA'
|
77 |
haystack_doc['GHG Score'] = 'NA'
|
|
|
78 |
temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
|
79 |
temp = temp.reset_index(drop=True)
|
80 |
df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
|
@@ -86,6 +85,8 @@ def ghg_classification(haystack_doc:pd.DataFrame,
|
|
86 |
results = classifier_model(list(temp.text))
|
87 |
labels_= [(l[0]['label'],l[0]['score']) for l in results]
|
88 |
temp['GHG Label'],temp['GHG Score'] = zip(*labels_)
|
|
|
|
|
89 |
df = pd.concat([df,temp])
|
90 |
df['GHG Label'] = df['GHG Label'].apply(lambda i: _lab_dict[i])
|
91 |
df = df.reset_index(drop =True)
|
|
|
55 |
)->Tuple[DataFrame,Series]:
|
56 |
"""
|
57 |
Text-Classification on the list of texts provided. Classifier provides the
|
58 |
+
most appropriate label for each text. It identifies if text contains 'GHG'
|
59 |
+
related information or not.
|
60 |
Params
|
61 |
---------
|
62 |
haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
|
|
|
68 |
In case of streamlit avoid passing the model directly.
|
69 |
Returns
|
70 |
----------
|
71 |
+
df: Dataframe
|
|
|
|
|
72 |
"""
|
73 |
logging.info("Working on GHG Extraction")
|
74 |
haystack_doc['GHG Label'] = 'NA'
|
75 |
haystack_doc['GHG Score'] = 'NA'
|
76 |
+
# applying GHG Identifier to only 'Target' paragraphs.
|
77 |
temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
|
78 |
temp = temp.reset_index(drop=True)
|
79 |
df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
|
|
|
85 |
results = classifier_model(list(temp.text))
|
86 |
labels_= [(l[0]['label'],l[0]['score']) for l in results]
|
87 |
temp['GHG Label'],temp['GHG Score'] = zip(*labels_)
|
88 |
+
|
89 |
+
# merge back Target and non-Target dataframe
|
90 |
df = pd.concat([df,temp])
|
91 |
df['GHG Label'] = df['GHG Label'].apply(lambda i: _lab_dict[i])
|
92 |
df = df.reset_index(drop =True)
|
utils/netzero_classifier.py
CHANGED
@@ -52,8 +52,8 @@ def netzero_classification(haystack_doc:pd.DataFrame,
|
|
52 |
)->Tuple[DataFrame,Series]:
|
53 |
"""
|
54 |
Text-Classification on the list of texts provided. Classifier provides the
|
55 |
-
most appropriate label for each text.
|
56 |
-
|
57 |
Params
|
58 |
---------
|
59 |
haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
|
@@ -65,13 +65,12 @@ def netzero_classification(haystack_doc:pd.DataFrame,
|
|
65 |
In case of streamlit avoid passing the model directly.
|
66 |
Returns
|
67 |
----------
|
68 |
-
df: Dataframe
|
69 |
-
x: Series object with the unique SDG covered in the document uploaded and
|
70 |
-
the number of times it is covered/discussed/count_of_paragraphs.
|
71 |
"""
|
72 |
logging.info("Working on Netzero Extraction")
|
73 |
haystack_doc['Netzero Label'] = 'NA'
|
74 |
haystack_doc['Netzero Score'] = 'NA'
|
|
|
75 |
temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
|
76 |
temp = temp.reset_index(drop=True)
|
77 |
df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
|
@@ -83,6 +82,7 @@ def netzero_classification(haystack_doc:pd.DataFrame,
|
|
83 |
results = classifier_model(list(temp.text))
|
84 |
labels_= [(l[0]['label'],l[0]['score']) for l in results]
|
85 |
temp['Netzero Label'],temp['Netzero Score'] = zip(*labels_)
|
|
|
86 |
df = pd.concat([df,temp])
|
87 |
df = df.reset_index(drop =True)
|
88 |
df.index += 1
|
|
|
52 |
)->Tuple[DataFrame,Series]:
|
53 |
"""
|
54 |
Text-Classification on the list of texts provided. Classifier provides the
|
55 |
+
most appropriate label for each text. It informs if paragraph contains any
|
56 |
+
netzero information or not.
|
57 |
Params
|
58 |
---------
|
59 |
haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
|
|
|
65 |
In case of streamlit avoid passing the model directly.
|
66 |
Returns
|
67 |
----------
|
68 |
+
df: Dataframe
|
|
|
|
|
69 |
"""
|
70 |
logging.info("Working on Netzero Extraction")
|
71 |
haystack_doc['Netzero Label'] = 'NA'
|
72 |
haystack_doc['Netzero Score'] = 'NA'
|
73 |
+
# we apply Netzero to only paragraphs which are classified as 'Target' related
|
74 |
temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
|
75 |
temp = temp.reset_index(drop=True)
|
76 |
df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
|
|
|
82 |
results = classifier_model(list(temp.text))
|
83 |
labels_= [(l[0]['label'],l[0]['score']) for l in results]
|
84 |
temp['Netzero Label'],temp['Netzero Score'] = zip(*labels_)
|
85 |
+
# merging Target with Non Target dataframe
|
86 |
df = pd.concat([df,temp])
|
87 |
df = df.reset_index(drop =True)
|
88 |
df.index += 1
|
utils/sector_classifier.py
CHANGED
@@ -70,19 +70,16 @@ def sector_classification(haystack_doc:pd.DataFrame,
|
|
70 |
In case of streamlit avoid passing the model directly.
|
71 |
Returns
|
72 |
----------
|
73 |
-
df: Dataframe
|
74 |
-
x: Series object with the unique SDG covered in the document uploaded and
|
75 |
-
the number of times it is covered/discussed/count_of_paragraphs.
|
76 |
"""
|
77 |
logging.info("Working on Sector Identification")
|
78 |
haystack_doc['Sector Label'] = 'NA'
|
79 |
-
# df1 = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
|
80 |
-
# df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
|
81 |
if not classifier_model:
|
82 |
classifier_model = st.session_state['sector_classifier']
|
83 |
|
84 |
predictions = classifier_model(list(haystack_doc.text))
|
85 |
|
|
|
86 |
list_ = []
|
87 |
for i in range(len(predictions)):
|
88 |
|
@@ -94,13 +91,17 @@ def sector_classification(haystack_doc:pd.DataFrame,
|
|
94 |
labels_ = [{**list_[l]} for l in range(len(predictions))]
|
95 |
truth_df = DataFrame.from_dict(labels_)
|
96 |
truth_df = truth_df.round(2)
|
|
|
97 |
truth_df = truth_df.astype(float) >= threshold
|
98 |
truth_df = truth_df.astype(str)
|
|
|
99 |
categories = list(truth_df.columns)
|
|
|
|
|
100 |
truth_df['Sector Label'] = truth_df.apply(lambda x: {i if x[i]=='True' else
|
101 |
None for i in categories}, axis=1)
|
|
|
102 |
truth_df['Sector Label'] = truth_df.apply(lambda x: list(x['Sector Label']
|
103 |
-{None}),axis=1)
|
104 |
haystack_doc['Sector Label'] = list(truth_df['Sector Label'])
|
105 |
-
# df = pd.concat([df,df1])
|
106 |
return haystack_doc
|
|
|
70 |
In case of streamlit avoid passing the model directly.
|
71 |
Returns
|
72 |
----------
|
73 |
+
df: Dataframe
|
|
|
|
|
74 |
"""
|
75 |
logging.info("Working on Sector Identification")
|
76 |
haystack_doc['Sector Label'] = 'NA'
|
|
|
|
|
77 |
if not classifier_model:
|
78 |
classifier_model = st.session_state['sector_classifier']
|
79 |
|
80 |
predictions = classifier_model(list(haystack_doc.text))
|
81 |
|
82 |
+
# getting the sector label and scores
|
83 |
list_ = []
|
84 |
for i in range(len(predictions)):
|
85 |
|
|
|
91 |
labels_ = [{**list_[l]} for l in range(len(predictions))]
|
92 |
truth_df = DataFrame.from_dict(labels_)
|
93 |
truth_df = truth_df.round(2)
|
94 |
+
# based on threshold value, we convert each sector score into boolean
|
95 |
truth_df = truth_df.astype(float) >= threshold
|
96 |
truth_df = truth_df.astype(str)
|
97 |
+
# collecting list of Sector Labels
|
98 |
categories = list(truth_df.columns)
|
99 |
+
# we collect the Sector Labels as a set; None is a placeholder for every label
|
100 |
+
# in the list of Sector Labels whose score did not pass the threshold.
|
101 |
truth_df['Sector Label'] = truth_df.apply(lambda x: {i if x[i]=='True' else
|
102 |
None for i in categories}, axis=1)
|
103 |
+
# we keep all Sector label except None
|
104 |
truth_df['Sector Label'] = truth_df.apply(lambda x: list(x['Sector Label']
|
105 |
-{None}),axis=1)
|
106 |
haystack_doc['Sector Label'] = list(truth_df['Sector Label'])
|
|
|
107 |
return haystack_doc
|