ppsingh commited on
Commit
44da430
1 Parent(s): 5ac96cb

target code refactor

Browse files
appStore/adapmit.py CHANGED
@@ -22,32 +22,10 @@ import plotly.express as px
22
  classifier_identifier = 'adapmit'
23
  params = get_classifier_params(classifier_identifier)
24
 
25
- @st.cache_data
26
- def to_excel(df):
27
- len_df = len(df)
28
- output = BytesIO()
29
- writer = pd.ExcelWriter(output, engine='xlsxwriter')
30
- df.to_excel(writer, index=False, sheet_name='Sheet1')
31
- workbook = writer.book
32
- worksheet = writer.sheets['Sheet1']
33
- worksheet.data_validation('E2:E{}'.format(len_df),
34
- {'validate': 'list',
35
- 'source': ['No', 'Yes', 'Discard']})
36
- worksheet.data_validation('F2:F{}'.format(len_df),
37
- {'validate': 'list',
38
- 'source': ['No', 'Yes', 'Discard']})
39
- worksheet.data_validation('G2:G{}'.format(len_df),
40
- {'validate': 'list',
41
- 'source': ['No', 'Yes', 'Discard']})
42
- writer.save()
43
- processed_data = output.getvalue()
44
- return processed_data
45
 
46
  def app():
47
-
48
  ### Main app code ###
49
- with st.container():
50
-
51
  if 'key1' in st.session_state:
52
  df = st.session_state.key1
53
 
@@ -63,6 +41,26 @@ def app():
63
 
64
  st.session_state.key1 = df
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
 
68
 
 
22
  classifier_identifier = 'adapmit'
23
  params = get_classifier_params(classifier_identifier)
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  def app():
 
27
  ### Main app code ###
28
+ with st.container():
 
29
  if 'key1' in st.session_state:
30
  df = st.session_state.key1
31
 
 
41
 
42
  st.session_state.key1 = df
43
 
44
+ # @st.cache_data
45
+ # def to_excel(df):
46
+ # len_df = len(df)
47
+ # output = BytesIO()
48
+ # writer = pd.ExcelWriter(output, engine='xlsxwriter')
49
+ # df.to_excel(writer, index=False, sheet_name='Sheet1')
50
+ # workbook = writer.book
51
+ # worksheet = writer.sheets['Sheet1']
52
+ # worksheet.data_validation('E2:E{}'.format(len_df),
53
+ # {'validate': 'list',
54
+ # 'source': ['No', 'Yes', 'Discard']})
55
+ # worksheet.data_validation('F2:F{}'.format(len_df),
56
+ # {'validate': 'list',
57
+ # 'source': ['No', 'Yes', 'Discard']})
58
+ # worksheet.data_validation('G2:G{}'.format(len_df),
59
+ # {'validate': 'list',
60
+ # 'source': ['No', 'Yes', 'Discard']})
61
+ # writer.save()
62
+ # processed_data = output.getvalue()
63
+ # return processed_data
64
 
65
 
66
 
appStore/ghg.py CHANGED
@@ -29,41 +29,40 @@ _lab_dict = {
29
  }
30
 
31
 
32
- @st.cache_data
33
- def to_excel(df):
34
- len_df = len(df)
35
- output = BytesIO()
36
- writer = pd.ExcelWriter(output, engine='xlsxwriter')
37
- df.to_excel(writer, index=False, sheet_name='Sheet1')
38
- workbook = writer.book
39
- worksheet = writer.sheets['Sheet1']
40
- worksheet.data_validation('E2:E{}'.format(len_df),
41
- {'validate': 'list',
42
- 'source': ['No', 'Yes', 'Discard']})
43
- writer.save()
44
- processed_data = output.getvalue()
45
- return processed_data
46
-
47
  def app():
48
  ### Main app code ###
49
  with st.container():
50
- if 'key1' in st.session_state:
51
- df = st.session_state.key1
52
 
53
- # Load the classifier model
54
- classifier = load_ghgClassifier(classifier_name=params['model_name'])
55
- st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
56
 
57
- if sum(df['Target Label'] == 'TARGET') > 100:
58
- warning_msg = ": This might take sometime, please sit back and relax."
59
- else:
60
- warning_msg = ""
61
-
62
- df = ghg_classification(haystack_doc=df,
63
- threshold= params['threshold'])
64
- st.session_state.key1 = df
65
 
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  # def netzero_display():
68
  # if 'key1' in st.session_state:
69
  # df = st.session_state.key2
 
29
  }
30
 
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  def app():
33
  ### Main app code ###
34
  with st.container():
35
+ if 'key1' in st.session_state:
36
+ df = st.session_state.key1
37
 
38
+ # Load the classifier model
39
+ classifier = load_ghgClassifier(classifier_name=params['model_name'])
40
+ st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
41
 
42
+ if sum(df['Target Label'] == 'TARGET') > 100:
43
+ warning_msg = ": This might take sometime, please sit back and relax."
44
+ else:
45
+ warning_msg = ""
46
+
47
+ df = ghg_classification(haystack_doc=df,
48
+ threshold= params['threshold'])
49
+ st.session_state.key1 = df
50
 
51
 
52
+ # @st.cache_data
53
+ # def to_excel(df):
54
+ # len_df = len(df)
55
+ # output = BytesIO()
56
+ # writer = pd.ExcelWriter(output, engine='xlsxwriter')
57
+ # df.to_excel(writer, index=False, sheet_name='Sheet1')
58
+ # workbook = writer.book
59
+ # worksheet = writer.sheets['Sheet1']
60
+ # worksheet.data_validation('E2:E{}'.format(len_df),
61
+ # {'validate': 'list',
62
+ # 'source': ['No', 'Yes', 'Discard']})
63
+ # writer.save()
64
+ # processed_data = output.getvalue()
65
+ # return processed_data
66
  # def netzero_display():
67
  # if 'key1' in st.session_state:
68
  # df = st.session_state.key2
appStore/netzero.py CHANGED
@@ -28,41 +28,41 @@ _lab_dict = {
28
  'NETZERO':'NETZERO TARGET',
29
  }
30
 
31
-
32
- @st.cache_data
33
- def to_excel(df):
34
- len_df = len(df)
35
- output = BytesIO()
36
- writer = pd.ExcelWriter(output, engine='xlsxwriter')
37
- df.to_excel(writer, index=False, sheet_name='Sheet1')
38
- workbook = writer.book
39
- worksheet = writer.sheets['Sheet1']
40
- worksheet.data_validation('E2:E{}'.format(len_df),
41
- {'validate': 'list',
42
- 'source': ['No', 'Yes', 'Discard']})
43
- writer.save()
44
- processed_data = output.getvalue()
45
- return processed_data
46
-
47
  def app():
48
  ### Main app code ###
49
  with st.container():
50
- if 'key1' in st.session_state:
51
- df = st.session_state.key1
 
 
 
 
52
 
53
- # Load the classifier model
54
- classifier = load_netzeroClassifier(classifier_name=params['model_name'])
55
- st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
 
 
 
 
 
56
 
57
- if sum(df['Target Label'] == 'TARGET') > 100:
58
- warning_msg = ": This might take sometime, please sit back and relax."
59
- else:
60
- warning_msg = ""
61
-
62
- df = netzero_classification(haystack_doc=df,
63
- threshold= params['threshold'])
64
- st.session_state.key1 = df
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
  # def netzero_display():
68
  # if 'key1' in st.session_state:
 
28
  'NETZERO':'NETZERO TARGET',
29
  }
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  def app():
32
  ### Main app code ###
33
  with st.container():
34
+ if 'key1' in st.session_state:
35
+ df = st.session_state.key1
36
+
37
+ # Load the classifier model
38
+ classifier = load_netzeroClassifier(classifier_name=params['model_name'])
39
+ st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
40
 
41
+ if sum(df['Target Label'] == 'TARGET') > 100:
42
+ warning_msg = ": This might take sometime, please sit back and relax."
43
+ else:
44
+ warning_msg = ""
45
+
46
+ df = netzero_classification(haystack_doc=df,
47
+ threshold= params['threshold'])
48
+ st.session_state.key1 = df
49
 
50
+
 
 
 
 
 
 
 
51
 
52
+ # @st.cache_data
53
+ # def to_excel(df):
54
+ # len_df = len(df)
55
+ # output = BytesIO()
56
+ # writer = pd.ExcelWriter(output, engine='xlsxwriter')
57
+ # df.to_excel(writer, index=False, sheet_name='Sheet1')
58
+ # workbook = writer.book
59
+ # worksheet = writer.sheets['Sheet1']
60
+ # worksheet.data_validation('E2:E{}'.format(len_df),
61
+ # {'validate': 'list',
62
+ # 'source': ['No', 'Yes', 'Discard']})
63
+ # writer.save()
64
+ # processed_data = output.getvalue()
65
+ # return processed_data
66
 
67
  # def netzero_display():
68
  # if 'key1' in st.session_state:
appStore/sector.py CHANGED
@@ -22,56 +22,55 @@ import plotly.express as px
22
  classifier_identifier = 'sector'
23
  params = get_classifier_params(classifier_identifier)
24
 
25
- @st.cache_data
26
- def to_excel(df,sectorlist):
27
- len_df = len(df)
28
- output = BytesIO()
29
- writer = pd.ExcelWriter(output, engine='xlsxwriter')
30
- df.to_excel(writer, index=False, sheet_name='Sheet1')
31
- workbook = writer.book
32
- worksheet = writer.sheets['Sheet1']
33
- worksheet.data_validation('S2:S{}'.format(len_df),
34
- {'validate': 'list',
35
- 'source': ['No', 'Yes', 'Discard']})
36
- worksheet.data_validation('X2:X{}'.format(len_df),
37
- {'validate': 'list',
38
- 'source': sectorlist + ['Blank']})
39
- worksheet.data_validation('T2:T{}'.format(len_df),
40
- {'validate': 'list',
41
- 'source': sectorlist + ['Blank']})
42
- worksheet.data_validation('U2:U{}'.format(len_df),
43
- {'validate': 'list',
44
- 'source': sectorlist + ['Blank']})
45
- worksheet.data_validation('V2:V{}'.format(len_df),
46
- {'validate': 'list',
47
- 'source': sectorlist + ['Blank']})
48
- worksheet.data_validation('W2:U{}'.format(len_df),
49
- {'validate': 'list',
50
- 'source': sectorlist + ['Blank']})
51
- writer.save()
52
- processed_data = output.getvalue()
53
- return processed_data
54
-
55
  def app():
56
 
57
  ### Main app code ###
58
  with st.container():
59
-
60
- if 'key1' in st.session_state:
61
- df = st.session_state.key1
62
- classifier = load_sectorClassifier(classifier_name=params['model_name'])
63
- st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
64
-
65
- if sum(df['Target Label'] == 'TARGET') > 100:
66
- warning_msg = ": This might take sometime, please sit back and relax."
67
- else:
68
- warning_msg = ""
69
-
70
- df = sector_classification(haystack_doc=df,
71
- threshold= params['threshold'])
72
-
73
- st.session_state.key1 = df
74
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
  # # st.write(df)
77
  # threshold= params['threshold']
 
22
  classifier_identifier = 'sector'
23
  params = get_classifier_params(classifier_identifier)
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  def app():
26
 
27
  ### Main app code ###
28
  with st.container():
29
+
30
+ if 'key1' in st.session_state:
31
+ df = st.session_state.key1
32
+ classifier = load_sectorClassifier(classifier_name=params['model_name'])
33
+ st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
34
+
35
+ if sum(df['Target Label'] == 'TARGET') > 100:
36
+ warning_msg = ": This might take sometime, please sit back and relax."
37
+ else:
38
+ warning_msg = ""
39
+
40
+ df = sector_classification(haystack_doc=df,
41
+ threshold= params['threshold'])
42
+
43
+ st.session_state.key1 = df
44
+
45
+ # @st.cache_data
46
+ # def to_excel(df,sectorlist):
47
+ # len_df = len(df)
48
+ # output = BytesIO()
49
+ # writer = pd.ExcelWriter(output, engine='xlsxwriter')
50
+ # df.to_excel(writer, index=False, sheet_name='Sheet1')
51
+ # workbook = writer.book
52
+ # worksheet = writer.sheets['Sheet1']
53
+ # worksheet.data_validation('S2:S{}'.format(len_df),
54
+ # {'validate': 'list',
55
+ # 'source': ['No', 'Yes', 'Discard']})
56
+ # worksheet.data_validation('X2:X{}'.format(len_df),
57
+ # {'validate': 'list',
58
+ # 'source': sectorlist + ['Blank']})
59
+ # worksheet.data_validation('T2:T{}'.format(len_df),
60
+ # {'validate': 'list',
61
+ # 'source': sectorlist + ['Blank']})
62
+ # worksheet.data_validation('U2:U{}'.format(len_df),
63
+ # {'validate': 'list',
64
+ # 'source': sectorlist + ['Blank']})
65
+ # worksheet.data_validation('V2:V{}'.format(len_df),
66
+ # {'validate': 'list',
67
+ # 'source': sectorlist + ['Blank']})
68
+ # worksheet.data_validation('W2:U{}'.format(len_df),
69
+ # {'validate': 'list',
70
+ # 'source': sectorlist + ['Blank']})
71
+ # writer.save()
72
+ # processed_data = output.getvalue()
73
+ # return processed_data
74
 
75
  # # st.write(df)
76
  # threshold= params['threshold']
utils/adapmit_classifier.py CHANGED
@@ -35,9 +35,7 @@ def load_adapmitClassifier(config_file:str = None, classifier_name:str = None):
35
  doc_classifier = pipeline("text-classification",
36
  model=classifier_name,
37
  return_all_scores=True,
38
- function_to_apply= "sigmoid")
39
-
40
-
41
  return doc_classifier
42
 
43
 
@@ -61,14 +59,10 @@ def adapmit_classification(haystack_doc:pd.DataFrame,
61
  In case of streamlit avoid passing the model directly.
62
  Returns
63
  ----------
64
- df: Dataframe with two columns['SDG:int', 'text']
65
- x: Series object with the unique SDG covered in the document uploaded and
66
- the number of times it is covered/discussed/count_of_paragraphs.
67
  """
68
  logging.info("Working on Adaptation-Mitigation Identification")
69
  haystack_doc['Adapt-Mitig Label'] = 'NA'
70
- # df1 = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
71
- # df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
72
 
73
  if not classifier_model:
74
  classifier_model = st.session_state['adapmit_classifier']
@@ -86,14 +80,18 @@ def adapmit_classification(haystack_doc:pd.DataFrame,
86
  labels_ = [{**list_[l]} for l in range(len(predictions))]
87
  truth_df = DataFrame.from_dict(labels_)
88
  truth_df = truth_df.round(2)
 
89
  truth_df = truth_df.astype(float) >= threshold
90
  truth_df = truth_df.astype(str)
 
91
  categories = list(truth_df.columns)
 
 
92
  truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x: {i if x[i]=='True'
93
  else None for i in categories}, axis=1)
94
  truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x:
95
  list(x['Adapt-Mitig Label'] -{None}),axis=1)
 
96
  haystack_doc['Adapt-Mitig Label'] = list(truth_df['Adapt-Mitig Label'])
97
- #df = pd.concat([df,df1])
98
 
99
  return haystack_doc
 
35
  doc_classifier = pipeline("text-classification",
36
  model=classifier_name,
37
  return_all_scores=True,
38
+ function_to_apply= "sigmoid")
 
 
39
  return doc_classifier
40
 
41
 
 
59
  In case of streamlit avoid passing the model directly.
60
  Returns
61
  ----------
62
+ df: Dataframe
 
 
63
  """
64
  logging.info("Working on Adaptation-Mitigation Identification")
65
  haystack_doc['Adapt-Mitig Label'] = 'NA'
 
 
66
 
67
  if not classifier_model:
68
  classifier_model = st.session_state['adapmit_classifier']
 
80
  labels_ = [{**list_[l]} for l in range(len(predictions))]
81
  truth_df = DataFrame.from_dict(labels_)
82
  truth_df = truth_df.round(2)
83
+ # convert the labels score into boolean based on threshold value
84
  truth_df = truth_df.astype(float) >= threshold
85
  truth_df = truth_df.astype(str)
86
+ # list of labels
87
  categories = list(truth_df.columns)
88
+
89
+ # collecting the labels, None is passed to overcome comprehension syntax
90
  truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x: {i if x[i]=='True'
91
  else None for i in categories}, axis=1)
92
  truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x:
93
  list(x['Adapt-Mitig Label'] -{None}),axis=1)
94
+ # adding Adaptation-Mitigation label
95
  haystack_doc['Adapt-Mitig Label'] = list(truth_df['Adapt-Mitig Label'])
 
96
 
97
  return haystack_doc
utils/ghg_classifier.py CHANGED
@@ -55,8 +55,8 @@ def ghg_classification(haystack_doc:pd.DataFrame,
55
  )->Tuple[DataFrame,Series]:
56
  """
57
  Text-Classification on the list of texts provided. Classifier provides the
58
- most appropriate label for each text. these labels are in terms of if text
59
- belongs to which particular Sustainable Devleopment Goal (SDG).
60
  Params
61
  ---------
62
  haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
@@ -68,13 +68,12 @@ def ghg_classification(haystack_doc:pd.DataFrame,
68
  In case of streamlit avoid passing the model directly.
69
  Returns
70
  ----------
71
- df: Dataframe with two columns['SDG:int', 'text']
72
- x: Series object with the unique SDG covered in the document uploaded and
73
- the number of times it is covered/discussed/count_of_paragraphs.
74
  """
75
  logging.info("Working on GHG Extraction")
76
  haystack_doc['GHG Label'] = 'NA'
77
  haystack_doc['GHG Score'] = 'NA'
 
78
  temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
79
  temp = temp.reset_index(drop=True)
80
  df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
@@ -86,6 +85,8 @@ def ghg_classification(haystack_doc:pd.DataFrame,
86
  results = classifier_model(list(temp.text))
87
  labels_= [(l[0]['label'],l[0]['score']) for l in results]
88
  temp['GHG Label'],temp['GHG Score'] = zip(*labels_)
 
 
89
  df = pd.concat([df,temp])
90
  df['GHG Label'] = df['GHG Label'].apply(lambda i: _lab_dict[i])
91
  df = df.reset_index(drop =True)
 
55
  )->Tuple[DataFrame,Series]:
56
  """
57
  Text-Classification on the list of texts provided. Classifier provides the
58
+ most appropriate label for each text. It identifies if text contains 'GHG'
59
+ related information or not.
60
  Params
61
  ---------
62
  haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
 
68
  In case of streamlit avoid passing the model directly.
69
  Returns
70
  ----------
71
+ df: Dataframe
 
 
72
  """
73
  logging.info("Working on GHG Extraction")
74
  haystack_doc['GHG Label'] = 'NA'
75
  haystack_doc['GHG Score'] = 'NA'
76
+ # applying GHG Identifier to only 'Target' paragraphs.
77
  temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
78
  temp = temp.reset_index(drop=True)
79
  df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
 
85
  results = classifier_model(list(temp.text))
86
  labels_= [(l[0]['label'],l[0]['score']) for l in results]
87
  temp['GHG Label'],temp['GHG Score'] = zip(*labels_)
88
+
89
+ # merge back Target and non-Target dataframe
90
  df = pd.concat([df,temp])
91
  df['GHG Label'] = df['GHG Label'].apply(lambda i: _lab_dict[i])
92
  df = df.reset_index(drop =True)
utils/netzero_classifier.py CHANGED
@@ -52,8 +52,8 @@ def netzero_classification(haystack_doc:pd.DataFrame,
52
  )->Tuple[DataFrame,Series]:
53
  """
54
  Text-Classification on the list of texts provided. Classifier provides the
55
- most appropriate label for each text. these labels are in terms of if text
56
- belongs to which particular Sustainable Devleopment Goal (SDG).
57
  Params
58
  ---------
59
  haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
@@ -65,13 +65,12 @@ def netzero_classification(haystack_doc:pd.DataFrame,
65
  In case of streamlit avoid passing the model directly.
66
  Returns
67
  ----------
68
- df: Dataframe with two columns['SDG:int', 'text']
69
- x: Series object with the unique SDG covered in the document uploaded and
70
- the number of times it is covered/discussed/count_of_paragraphs.
71
  """
72
  logging.info("Working on Netzero Extraction")
73
  haystack_doc['Netzero Label'] = 'NA'
74
  haystack_doc['Netzero Score'] = 'NA'
 
75
  temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
76
  temp = temp.reset_index(drop=True)
77
  df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
@@ -83,6 +82,7 @@ def netzero_classification(haystack_doc:pd.DataFrame,
83
  results = classifier_model(list(temp.text))
84
  labels_= [(l[0]['label'],l[0]['score']) for l in results]
85
  temp['Netzero Label'],temp['Netzero Score'] = zip(*labels_)
 
86
  df = pd.concat([df,temp])
87
  df = df.reset_index(drop =True)
88
  df.index += 1
 
52
  )->Tuple[DataFrame,Series]:
53
  """
54
  Text-Classification on the list of texts provided. Classifier provides the
55
+ most appropriate label for each text. It informs if paragraph contains any
56
+ netzero information or not.
57
  Params
58
  ---------
59
  haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
 
65
  In case of streamlit avoid passing the model directly.
66
  Returns
67
  ----------
68
+ df: Dataframe
 
 
69
  """
70
  logging.info("Working on Netzero Extraction")
71
  haystack_doc['Netzero Label'] = 'NA'
72
  haystack_doc['Netzero Score'] = 'NA'
73
+ # we apply Netzero to only paragraphs which are classified as 'Target' related
74
  temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
75
  temp = temp.reset_index(drop=True)
76
  df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
 
82
  results = classifier_model(list(temp.text))
83
  labels_= [(l[0]['label'],l[0]['score']) for l in results]
84
  temp['Netzero Label'],temp['Netzero Score'] = zip(*labels_)
85
+ # merging Target with Non Target dataframe
86
  df = pd.concat([df,temp])
87
  df = df.reset_index(drop =True)
88
  df.index += 1
utils/sector_classifier.py CHANGED
@@ -70,19 +70,16 @@ def sector_classification(haystack_doc:pd.DataFrame,
70
  In case of streamlit avoid passing the model directly.
71
  Returns
72
  ----------
73
- df: Dataframe with two columns['SDG:int', 'text']
74
- x: Series object with the unique SDG covered in the document uploaded and
75
- the number of times it is covered/discussed/count_of_paragraphs.
76
  """
77
  logging.info("Working on Sector Identification")
78
  haystack_doc['Sector Label'] = 'NA'
79
- # df1 = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
80
- # df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
81
  if not classifier_model:
82
  classifier_model = st.session_state['sector_classifier']
83
 
84
  predictions = classifier_model(list(haystack_doc.text))
85
 
 
86
  list_ = []
87
  for i in range(len(predictions)):
88
 
@@ -94,13 +91,17 @@ def sector_classification(haystack_doc:pd.DataFrame,
94
  labels_ = [{**list_[l]} for l in range(len(predictions))]
95
  truth_df = DataFrame.from_dict(labels_)
96
  truth_df = truth_df.round(2)
 
97
  truth_df = truth_df.astype(float) >= threshold
98
  truth_df = truth_df.astype(str)
 
99
  categories = list(truth_df.columns)
 
 
100
  truth_df['Sector Label'] = truth_df.apply(lambda x: {i if x[i]=='True' else
101
  None for i in categories}, axis=1)
 
102
  truth_df['Sector Label'] = truth_df.apply(lambda x: list(x['Sector Label']
103
  -{None}),axis=1)
104
  haystack_doc['Sector Label'] = list(truth_df['Sector Label'])
105
- # df = pd.concat([df,df1])
106
  return haystack_doc
 
70
  In case of streamlit avoid passing the model directly.
71
  Returns
72
  ----------
73
+ df: Dataframe
 
 
74
  """
75
  logging.info("Working on Sector Identification")
76
  haystack_doc['Sector Label'] = 'NA'
 
 
77
  if not classifier_model:
78
  classifier_model = st.session_state['sector_classifier']
79
 
80
  predictions = classifier_model(list(haystack_doc.text))
81
 
82
+ # getting the sector label and scores
83
  list_ = []
84
  for i in range(len(predictions)):
85
 
 
91
  labels_ = [{**list_[l]} for l in range(len(predictions))]
92
  truth_df = DataFrame.from_dict(labels_)
93
  truth_df = truth_df.round(2)
94
+ # based on threshold value, we convert each sector score into boolean
95
  truth_df = truth_df.astype(float) >= threshold
96
  truth_df = truth_df.astype(str)
97
+ # collecting list of Sector Labels
98
  categories = list(truth_df.columns)
99
+ # we collect the Sector Labels as set, None represent the value at the index
100
+ # in the list of Sector Labels.
101
  truth_df['Sector Label'] = truth_df.apply(lambda x: {i if x[i]=='True' else
102
  None for i in categories}, axis=1)
103
+ # we keep all Sector label except None
104
  truth_df['Sector Label'] = truth_df.apply(lambda x: list(x['Sector Label']
105
  -{None}),axis=1)
106
  haystack_doc['Sector Label'] = list(truth_df['Sector Label'])
 
107
  return haystack_doc