leavoigt committed on
Commit 1a4c853
1 parent: 31b5a19

Delete utils

utils/__init__ DELETED
File without changes
utils/checkconfig.py DELETED
@@ -1,15 +0,0 @@
- import configparser
- import logging
-
- def getconfig(configfile_path: str):
-     """
-     configfile_path: file path of the .cfg file
-     """
-
-     config = configparser.ConfigParser()
-
-     try:
-         config.read_file(open(configfile_path))
-         return config
-     except FileNotFoundError:
-         logging.warning("config file not found")
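For context, the deleted helper was typically called as in the minimal sketch below; the config path `paramconfig.cfg` is a hypothetical placeholder, while the `vulnerability`/`MODEL` keys mirror the lookup in `utils/vulnerability_classifier.py`.

```python
# Minimal usage sketch of the deleted getconfig helper.
# "paramconfig.cfg" is a hypothetical path; the section/option names
# mirror utils/vulnerability_classifier.py.
from utils.checkconfig import getconfig

config = getconfig("paramconfig.cfg")
if config is not None:
    model_name = config.get("vulnerability", "MODEL")
    print(model_name)
```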
 
utils/preprocessing.py DELETED
@@ -1,260 +0,0 @@
- from haystack.nodes.base import BaseComponent
- from haystack.schema import Document
- from haystack.nodes import PDFToTextOCRConverter, PDFToTextConverter
- from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
- from typing import Callable, Dict, List, Optional, Text, Tuple, Union
- from typing_extensions import Literal
- import pandas as pd
- import logging
- import re
- import string
- from haystack.pipelines import Pipeline
-
- def useOCR(file_path: str) -> Text:
-     """
-     Converts image PDFs into text using farm-haystack[ocr].
-
-     Params
-     ----------
-     file_path: file path of the uploaded file, returned by the add_upload
-     function in uploadAndExample.py
-
-     Returns the text of the file as a string.
-     """
-
-     converter = PDFToTextOCRConverter(remove_numeric_tables=True,
-                                       valid_languages=["eng"])
-     docs = converter.convert(file_path=file_path, meta=None)
-     return docs[0].content
-
-
- class FileConverter(BaseComponent):
-     """
-     Wrapper class to convert an uploaded document into text by calling the
-     appropriate converter class; internally uses haystack PDFToTextOCR for
-     image PDFs. The haystack FileClassifier cannot be used here because it
-     has no label/output class for images.
-
-     1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
-     2. https://docs.haystack.deepset.ai/docs/file_converters
-     3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
-     4. https://docs.haystack.deepset.ai/reference/file-converters-api
-     """
-
-     outgoing_edges = 1
-
-     def run(self, file_name: str, file_path: str, encoding: Optional[str] = None,
-             id_hash_keys: Optional[List[str]] = None,
-             ) -> Tuple[dict, str]:
-         """
-         Required method to invoke the component in the pipeline implementation.
-
-         Params
-         ----------
-         file_name: name of the file
-         file_path: file path of the uploaded file, returned by the add_upload
-         function in uploadAndExample.py
-
-         See the links in the class docstring for the other params.
-
-         Return
-         ---------
-         output: dictionary, with a key as identifier and a value holding whatever
-         needs to be returned; here it is the list of haystack Documents.
-
-         output_1: as there is only one outgoing edge, the string 'output_1' is passed.
-         """
-         try:
-             if file_name.endswith('.pdf'):
-                 converter = PDFToTextConverter(remove_numeric_tables=True)
-             if file_name.endswith('.txt'):
-                 converter = TextConverter(remove_numeric_tables=True)
-             if file_name.endswith('.docx'):
-                 converter = DocxToTextConverter()
-         except Exception as e:
-             logging.error(e)
-             return
-
-         documents = []
-
-         document = converter.convert(
-             file_path=file_path, meta=None,
-             encoding=encoding, id_hash_keys=id_hash_keys
-         )[0]
-
-         text = document.content
-
-         # if the file is an image PDF it will have {'content': "\x0c\x0c\x0c\x0c"};
-         # substitute this substring with '' and check later whether content is empty.
-         text = re.sub(r'\x0c', '', text)
-         documents.append(Document(content=text,
-                                   meta={"name": file_name},
-                                   id_hash_keys=id_hash_keys))
-
-         # check if the text is empty and apply the PDF OCR converter.
-         for i in documents:
-             if i.content == "":
-                 logging.info("Using OCR")
-                 i.content = useOCR(file_path)
-
-         logging.info('file conversion successful')
-         output = {'documents': documents}
-         return output, 'output_1'
-
-     def run_batch(self):
-         """
-         There is no requirement to process multiple files in one go, so this does
-         nothing; however, the custom node needs this method to exist.
-         """
-         return
-
-
- def basic(s: str, remove_punc: bool = False):
-     """
-     Performs basic cleaning of text.
-
-     Params
-     ----------
-     s: string to be processed
-     remove_punc: whether to remove all punctuation, including ',' and '.'
-
-     Returns: processed string; see comments in the source code for more info
-     """
-
-     # Remove URLs
-     s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
-     s = re.sub(r"http\S+", " ", s)
-
-     # Remove new line characters
-     s = re.sub('\n', ' ', s)
-
-     # Remove punctuation
-     if remove_punc:
-         translator = str.maketrans(' ', ' ', string.punctuation)
-         s = s.translate(translator)
-     # Remove distracting single quotes and dotted patterns
-     s = re.sub("\'", " ", s)
-     s = s.replace("..", "")
-
-     return s.strip()
-
-
- class UdfPreProcessor(BaseComponent):
-     """
-     Class to preprocess the documents returned by FileConverter. It checks the
-     splitting strategy, splits the document by word or sentence and then
-     synthetically creates the paragraphs.
-
-     1. https://docs.haystack.deepset.ai/docs/preprocessor
-     2. https://docs.haystack.deepset.ai/reference/preprocessor-api
-     3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
-     """
-     outgoing_edges = 1
-
-     def run(self, documents: List[Document], remove_punc: bool = False,
-             split_by: Literal["sentence", "word"] = 'sentence',
-             split_length: int = 2, split_respect_sentence_boundary: bool = False,
-             split_overlap: int = 0):
-
-         """
-         Required method to invoke the component in the pipeline implementation.
-
-         Params
-         ----------
-         documents: documents from the output dictionary returned by FileConverter
-         remove_punc: whether to remove all punctuation, including ',' and '.'
-         split_by: document splitting strategy, either word or sentence
-         split_length: when synthetically creating paragraphs from the document,
-         defines the length of a paragraph.
-         split_respect_sentence_boundary: used with the 'word' splitting strategy.
-         split_overlap: number of words or sentences that overlap when creating
-         the paragraphs. This is done because one sentence or 'some words' only make
-         sense when read together with their neighbours, hence the overlap.
-
-         Return
-         ---------
-         output: dictionary, with a key as identifier and a value holding whatever
-         needs to be returned; here the output contains four objects: the list of
-         paragraph texts, the haystack Documents, a DataFrame and one raw text string.
-
-         output_1: as there is only one outgoing edge, the string 'output_1' is passed.
-         """
-
-         if split_by == 'sentence':
-             split_respect_sentence_boundary = False
-         else:
-             split_respect_sentence_boundary = split_respect_sentence_boundary
-
-         preprocessor = PreProcessor(
-             clean_empty_lines=True,
-             clean_whitespace=True,
-             clean_header_footer=True,
-             split_by=split_by,
-             split_length=split_length,
-             split_respect_sentence_boundary=split_respect_sentence_boundary,
-             split_overlap=split_overlap,
-
-             # adds page numbers only for PDFs, not for text/docx files.
-             add_page_number=True
-         )
-
-         for i in documents:
-             # # basic cleaning before passing it to the preprocessor.
-             # i = basic(i)
-             docs_processed = preprocessor.process([i])
-             for item in docs_processed:
-                 item.content = basic(item.content, remove_punc=remove_punc)
-
-         df = pd.DataFrame(docs_processed)
-         all_text = " ".join(df.content.to_list())
-         para_list = df.content.to_list()
-         logging.info('document split into {} paragraphs'.format(len(para_list)))
-         output = {'documents': docs_processed,
-                   'dataframe': df,
-                   'text': all_text,
-                   'paraList': para_list
-                   }
-         return output, "output_1"
-
-     def run_batch(self):
-         """
-         There is no requirement to process multiple files in one go, so this does
-         nothing; however, the custom node needs this method to exist.
-         """
-         return
-
-
- def processingpipeline():
-     """
-     Returns the preprocessing pipeline, using FileConverter and UdfPreProcessor
-     from utils.preprocessing.
-     """
-
-     preprocessing_pipeline = Pipeline()
-     file_converter = FileConverter()
-     custom_preprocessor = UdfPreProcessor()
-
-     preprocessing_pipeline.add_node(component=file_converter,
-                                     name="FileConverter", inputs=["File"])
-     preprocessing_pipeline.add_node(component=custom_preprocessor,
-                                     name='UdfPreProcessor', inputs=["FileConverter"])
-
-     return preprocessing_pipeline
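For context, the deleted pipeline was typically run as in the sketch below; the sample file path and name are hypothetical placeholders, while the node names and params mirror `runPreprocessingPipeline` in `utils/vulnerability_classifier.py`.

```python
# Minimal sketch of running the deleted preprocessing pipeline.
# "docStore/sample/example.pdf" / "example.pdf" are hypothetical placeholders.
from utils.preprocessing import processingpipeline

pipeline = processingpipeline()
output = pipeline.run(file_paths="docStore/sample/example.pdf",
                      params={"FileConverter": {"file_path": "docStore/sample/example.pdf",
                                                "file_name": "example.pdf"},
                              "UdfPreProcessor": {"remove_punc": False,
                                                  "split_by": "sentence",
                                                  "split_length": 2}})
paragraphs = output["paraList"]    # list of paragraph strings
documents = output["documents"]    # haystack Documents for downstream nodes
```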
 
utils/streamlitcheck.py DELETED
@@ -1,42 +0,0 @@
- import logging
- try:
-     import streamlit as st
- except ImportError:
-     logging.info("Streamlit not installed")
-
-
- def check_streamlit():
-     """
-     Checks whether the Python code is being run within streamlit.
-
-     Returns
-     -------
-     use_streamlit : boolean
-         True if the code is run within streamlit, else False
-     """
-     try:
-         from streamlit.scriptrunner.script_run_context import get_script_run_ctx
-         if not get_script_run_ctx():
-             use_streamlit = False
-         else:
-             use_streamlit = True
-     except ModuleNotFoundError:
-         use_streamlit = False
-     return use_streamlit
-
- def disable_other_checkboxes(*other_checkboxes_keys):
-     for checkbox_key in other_checkboxes_keys:
-         st.session_state[checkbox_key] = False
-
- def checkbox_without_preselect(keylist):
-     dict_ = {}
-     for i, key_val in enumerate(keylist):
-         dict_[i] = st.checkbox(key_val, key=key_val,
-                                on_change=disable_other_checkboxes,
-                                args=tuple(filter(lambda x: x != key_val, keylist)))
-
-     for key, val in dict_.items():
-         if val:
-             return keylist[int(key)]
-
-     return None
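For context, a minimal usage sketch of the deleted helpers follows; it assumes streamlit is installed and running the script, and the option labels are hypothetical placeholders.

```python
# Minimal sketch: render mutually exclusive checkboxes with the deleted helper.
# The option labels are hypothetical placeholders.
import streamlit as st
from utils.streamlitcheck import check_streamlit, checkbox_without_preselect

if check_streamlit():
    choice = checkbox_without_preselect(["Upload Document", "Try Example"])
    st.write(f"Selected: {choice}")
```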
 
utils/uploadAndExample.py DELETED
@@ -1,33 +0,0 @@
- import streamlit as st
- import tempfile
- import json
-
- def add_upload(choice):
-     """
-     Provides the user with the choice to either 'Upload Document' or 'Try Example'.
-     Based on the user's choice it runs the streamlit processes and saves the path
-     and name of the 'file' to the streamlit session_state, which can be fetched later.
-     """
-
-     if choice == 'Upload Document':
-         uploaded_file = st.sidebar.file_uploader('Upload the File',
-                                                  type=['pdf', 'docx', 'txt'])
-         if uploaded_file is not None:
-             with tempfile.NamedTemporaryFile(mode="wb", delete=False) as temp:
-                 bytes_data = uploaded_file.getvalue()
-                 temp.write(bytes_data)
-                 st.session_state['filename'] = uploaded_file.name
-                 st.session_state['filepath'] = temp.name
-
-     else:
-         # listing the options
-         with open('docStore/sample/files.json', 'r') as json_file:
-             files = json.load(json_file)
-
-         option = st.sidebar.selectbox('Select the example document',
-                                       list(files.keys()))
-         file_name = file_path = files[option]
-         st.session_state['filename'] = file_name
-         st.session_state['filepath'] = file_path
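For context, the deleted function was typically wired into the app's sidebar roughly as below; the radio label is a hypothetical placeholder, while the two choices come from the docstring above.

```python
# Minimal sketch of wiring add_upload into a streamlit sidebar.
# The radio label "Document source" is a hypothetical placeholder.
import streamlit as st
from utils.uploadAndExample import add_upload

choice = st.sidebar.radio("Document source", ("Upload Document", "Try Example"))
add_upload(choice)
st.write(st.session_state.get("filename"), st.session_state.get("filepath"))
```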
 
utils/vulnerability_classifier.py DELETED
@@ -1,177 +0,0 @@
- from haystack.nodes import TransformersDocumentClassifier
- from haystack.schema import Document
- from typing import List, Tuple
- from typing_extensions import Literal
- import logging
- import pandas as pd
- from pandas import DataFrame, Series
- from utils.checkconfig import getconfig
- from utils.streamlitcheck import check_streamlit
- from utils.preprocessing import processingpipeline
- try:
-     import streamlit as st
- except ImportError:
-     logging.info("Streamlit not installed")
-
- ## Labels dictionary ###
- _lab_dict = {0: 'Agricultural communities',
-              1: 'Children',
-              2: 'Coastal communities',
-              3: 'Ethnic, racial or other minorities',
-              4: 'Fishery communities',
-              5: 'Informal sector workers',
-              6: 'Members of indigenous and local communities',
-              7: 'Migrants and displaced persons',
-              8: 'Older persons',
-              9: 'Other',
-              10: 'Persons living in poverty',
-              11: 'Persons with disabilities',
-              12: 'Persons with pre-existing health conditions',
-              13: 'Residents of drought-prone regions',
-              14: 'Rural populations',
-              15: 'Sexual minorities (LGBTQI+)',
-              16: 'Urban populations',
-              17: 'Women and other genders'}
-
- @st.cache(allow_output_mutation=True)
- def load_Classifier(config_file: str = None, classifier_name: str = None):
-     """
-     Loads the document classifier using haystack; the name/path of the model on
-     the HF hub is passed as a string to fetch the model object. Either the config
-     file or the model name must be passed.
-     1. https://docs.haystack.deepset.ai/reference/document-classifier-api
-     2. https://docs.haystack.deepset.ai/docs/document_classifier
-
-     Params
-     --------
-     config_file: config file path from which to read the model name
-     classifier_name: if a model name is passed it takes priority; if not,
-     the config file is used, otherwise a warning is logged and nothing is returned.
-
-     Return: document classifier model
-     """
-     if not classifier_name:
-         if not config_file:
-             logging.warning("Pass either model name or config file")
-             return
-         else:
-             config = getconfig(config_file)
-             classifier_name = config.get('vulnerability', 'MODEL')
-
-     logging.info("Loading classifier")
-     doc_classifier = TransformersDocumentClassifier(
-         model_name_or_path=classifier_name,
-         task="text-classification")
-
-     return doc_classifier
-
-
- @st.cache(allow_output_mutation=True)
- def vulnerability_classification(haystack_doc: List[Document],
-                                  threshold: float = 0.8,
-                                  classifier_model: TransformersDocumentClassifier = None
-                                  ) -> Tuple[DataFrame, Series]:
-     """
-     Text classification on the list of texts provided. The classifier assigns the
-     most appropriate label to each text; the labels indicate which vulnerable
-     group a paragraph refers to.
-
-     Params
-     ---------
-     haystack_doc: list of haystack Documents. The output of the preprocessing
-     pipeline contains the paragraphs in different formats; here the list of
-     haystack Documents is used.
-     threshold: threshold value above which the classifier results are kept
-     classifier_model: the classifier model can be passed directly, in which case
-     it takes priority; otherwise the model is looked up in the streamlit session.
-     When using streamlit, avoid passing the model directly.
-
-     Returns
-     ----------
-     df: DataFrame with the columns ['vulnerability', 'Relevancy', 'text']
-     x: value counts of the unique vulnerability labels covered in the uploaded
-     document, i.e. the number of paragraphs in which each label is discussed.
-     """
-     logging.info("Working on vulnerability classification")
-     if not classifier_model:
-         if check_streamlit():
-             classifier_model = st.session_state['vulnerability_classifier']
-         else:
-             logging.warning("No streamlit environment found, pass the classifier")
-             return
-
-     results = classifier_model.predict(haystack_doc)
-
-     labels_ = [(l.meta['classification']['label'],
-                 l.meta['classification']['score'], l.content,) for l in results]
-
-     df = DataFrame(labels_, columns=["vulnerability", "Relevancy", "text"])
-
-     df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
-     df.index += 1
-     df = df[df['Relevancy'] > threshold]
-
-     # creating the dataframe with value counts of the labels, along with their names
-     x = df['vulnerability'].value_counts()
-     x = x.rename('count')
-     x = x.rename_axis('vulnerability').reset_index()
-     x["vulnerability"] = pd.to_numeric(x["vulnerability"])
-     x = x.sort_values(by=['count'], ascending=False)
-     x['vulnerability_name'] = x['vulnerability'].apply(lambda x: _lab_dict[x])
-     x['vulnerability_Num'] = x['vulnerability'].apply(lambda x: "vulnerability " + str(x))
-
-     df['vulnerability'] = pd.to_numeric(df['vulnerability'])
-     df = df.sort_values('vulnerability')
-
-     return df, x
-
- def runPreprocessingPipeline(file_name: str, file_path: str,
-                              split_by: Literal["sentence", "word"] = 'sentence',
-                              split_length: int = 2,
-                              split_respect_sentence_boundary: bool = False,
-                              split_overlap: int = 0,
-                              remove_punc: bool = False) -> List[Document]:
-     """
-     Creates and runs the preprocessing pipeline; the params for the pipeline are
-     fetched from paramconfig.
-
-     Params
-     ------------
-     file_name: filename; in a streamlit application use st.session_state['filename']
-     file_path: file path; in a streamlit application use st.session_state['filepath']
-     split_by: document splitting strategy, either word or sentence
-     split_length: when synthetically creating paragraphs from the document,
-     defines the length of a paragraph.
-     split_respect_sentence_boundary: used with the 'word' splitting strategy.
-     split_overlap: number of words or sentences that overlap when creating
-     the paragraphs. This is done because one sentence or 'some words' only make
-     sense when read together with their neighbours, hence the overlap.
-     remove_punc: whether to remove all punctuation, including ',' and '.'
-
-     Return
-     --------------
-     List[Document]: when the preprocessing pipeline is run, the output dictionary
-     has four objects. For the haystack implementation of the classification we
-     need the list of haystack Documents, which can be fetched from the output
-     with key = 'documents'.
-     """
-
-     processing_pipeline = processingpipeline()
-
-     output_pre = processing_pipeline.run(file_paths=file_path,
-                                          params={"FileConverter": {"file_path": file_path,
-                                                                    "file_name": file_name},
-                                                  "UdfPreProcessor": {"remove_punc": remove_punc,
-                                                                      "split_by": split_by,
-                                                                      "split_length": split_length,
-                                                                      "split_overlap": split_overlap,
-                                                                      "split_respect_sentence_boundary": split_respect_sentence_boundary}})
-
-     return output_pre
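Finally, an end-to-end sketch of how the deleted module was typically wired together; it assumes streamlit is installed (because of the `@st.cache` decorators), and `paramconfig.cfg` plus the sample file names are hypothetical placeholders.

```python
# End-to-end sketch using the deleted module's public functions.
# "paramconfig.cfg" and the sample file names are hypothetical placeholders.
from utils.vulnerability_classifier import (load_Classifier,
                                            runPreprocessingPipeline,
                                            vulnerability_classification)

classifier = load_Classifier(config_file="paramconfig.cfg")
output = runPreprocessingPipeline(file_name="example.pdf",
                                  file_path="docStore/sample/example.pdf")
df, counts = vulnerability_classification(output["documents"],
                                          threshold=0.8,
                                          classifier_model=classifier)
print(counts[["vulnerability_name", "count"]])
```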