leavoigt committed on
Commit
9cbcff4
1 Parent(s): 48bf795

Update utils/sdg_classifier.py

Browse files
Files changed (1) hide show
  1. utils/sdg_classifier.py +13 -13
utils/sdg_classifier.py CHANGED
@@ -95,7 +95,7 @@ def classification(haystack_doc:List[Document],
95
  the number of times it is covered/discussed/count_of_paragraphs.
96
 
97
  """
98
- logging.info("Working on Vulnerability Classification")
99
  if not classifier_model:
100
  if check_streamlit():
101
  classifier_model = st.session_state['vulnerability_classifier']
@@ -109,27 +109,27 @@ def classification(haystack_doc:List[Document],
109
  labels_= [(l.meta['classification']['label'],
110
  l.meta['classification']['score'],l.content,) for l in results]
111
 
112
- df = DataFrame(labels_, columns=["Vulnerability","Relevancy","text"])
113
 
114
  df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
115
  df.index += 1
116
  df =df[df['Relevancy']>threshold]
117
 
118
  # creating the dataframe for value counts of SDG, along with 'title' of SDGs
119
- x = df['Vulnerability'].value_counts()
120
  x = x.rename('count')
121
- x = x.rename_axis('Vulnerability').reset_index()
122
- x["Vulnerability"] = pd.to_numeric(x["Vulnerability"])
123
  x = x.sort_values(by=['count'], ascending=False)
124
- x['SDG_name'] = x['Vulnerability'].apply(lambda x: _lab_dict[x])
125
- x['SDG_Num'] = x['Vulnerability'].apply(lambda x: "Vulnerability "+str(x))
126
 
127
- df['Vulnerability'] = pd.to_numeric(df['Vulnerability'])
128
- df = df.sort_values('Vulnerability')
129
 
130
  return df, x
131
 
132
- def runSDGPreprocessingPipeline(file_name:str, file_path:str,
133
  split_by: Literal["sentence", "word"] = 'sentence',
134
  split_length:int = 2, split_respect_sentence_boundary:bool = False,
135
  split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
@@ -163,9 +163,9 @@ def runSDGPreprocessingPipeline(file_name:str, file_path:str,
163
 
164
  """
165
 
166
- sdg_processing_pipeline = processingpipeline()
167
 
168
- output_sdg_pre = sdg_processing_pipeline.run(file_paths = file_path,
169
  params= {"FileConverter": {"file_path": file_path, \
170
  "file_name": file_name},
171
  "UdfPreProcessor": {"remove_punc": remove_punc, \
@@ -174,4 +174,4 @@ def runSDGPreprocessingPipeline(file_name:str, file_path:str,
174
  "split_overlap": split_overlap, \
175
  "split_respect_sentence_boundary":split_respect_sentence_boundary}})
176
 
177
- return output_sdg_pre
 
95
  the number of times it is covered/discussed/count_of_paragraphs.
96
 
97
  """
98
+ logging.info("Working on vulnerability Classification")
99
  if not classifier_model:
100
  if check_streamlit():
101
  classifier_model = st.session_state['vulnerability_classifier']
 
109
  labels_= [(l.meta['classification']['label'],
110
  l.meta['classification']['score'],l.content,) for l in results]
111
 
112
+ df = DataFrame(labels_, columns=["vulnerability","Relevancy","text"])
113
 
114
  df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
115
  df.index += 1
116
  df =df[df['Relevancy']>threshold]
117
 
118
  # creating the dataframe for value counts of SDG, along with 'title' of SDGs
119
+ x = df['vulnerability'].value_counts()
120
  x = x.rename('count')
121
+ x = x.rename_axis('vulnerability').reset_index()
122
+ x["Vulnerability"] = pd.to_numeric(x["vulnerability"])
123
  x = x.sort_values(by=['count'], ascending=False)
124
+ x['vulnerability_name'] = x['vulnerability'].apply(lambda x: _lab_dict[x])
125
+ x['vulnerability_Num'] = x['vulnerability'].apply(lambda x: "vulnerability "+str(x))
126
 
127
+ df['vulnerability'] = pd.to_numeric(df['vulnerability'])
128
+ df = df.sort_values('vulnerability')
129
 
130
  return df, x
131
 
132
+ def runPreprocessingPipeline(file_name:str, file_path:str,
133
  split_by: Literal["sentence", "word"] = 'sentence',
134
  split_length:int = 2, split_respect_sentence_boundary:bool = False,
135
  split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
 
163
 
164
  """
165
 
166
+ processing_pipeline = processingpipeline()
167
 
168
+ output_pre = processing_pipeline.run(file_paths = file_path,
169
  params= {"FileConverter": {"file_path": file_path, \
170
  "file_name": file_name},
171
  "UdfPreProcessor": {"remove_punc": remove_punc, \
 
174
  "split_overlap": split_overlap, \
175
  "split_respect_sentence_boundary":split_respect_sentence_boundary}})
176
 
177
+ return output_pre