Spaces: GIZ

prashant committed
Commit fc3b461
1 Parent(s): 99ae6d0

tfidf models

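Each .pkl entry below is a Git LFS pointer rather than the pickle itself: the committed text file holds only the LFS spec version, the SHA-256 of the actual blob, and its size in bytes, while the binary content lives in LFS storage (fetched locally with `git lfs pull`).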
docStore/sdg1/tfidfmodel.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg1/vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg10/tfidfmodel.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg10/vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg11/tfidfmodel.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg11/vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg12/tfidfmodel.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg12/vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg13/tfidfmodel.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg13/vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg14/tfidfmodel.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg14/vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg15/tfidfmodel.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg15/vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg2/tfidfmodel.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg2/vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg3/tfidfmodel.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg3/vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg4/tfidfmodel.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg4/vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg5/tfidfmodel.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg5/vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg6/tfidfmodel.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg6/vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg7/tfidfmodel.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg7/vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg8/tfidfmodel.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg8/vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg9/tfidfmodel.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520

docStore/sdg9/vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
+ size 2520
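
The commit itself adds only the LFS pointers for these models. For context, a minimal, hypothetical sketch of how such a vectorizer.pkl / tfidfmodel.pkl pair could be produced with the same scikit-learn classes that the new utils/keyword_extraction.py imports; the train_sdg_model name and the sdg_texts corpus are assumptions, not part of this commit:

# Hypothetical training sketch (not in this commit): fit one
# CountVectorizer + TfidfTransformer per SDG corpus and pickle them
# where keyword_extraction() later loads them from.
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def train_sdg_model(sdg: int, sdg_texts):
    vectorizer = CountVectorizer(stop_words='english')
    counts = vectorizer.fit_transform(sdg_texts)   # document-term count matrix
    tfidfmodel = TfidfTransformer().fit(counts)    # learns idf weights
    model_path = "docStore/sdg{}/".format(sdg)
    with open(model_path + 'vectorizer.pkl', 'wb') as f:
        pickle.dump(vectorizer, f)
    with open(model_path + 'tfidfmodel.pkl', 'wb') as f:
        pickle.dump(tfidfmodel, f)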
utils/keyword_extraction.py ADDED
@@ -0,0 +1,42 @@
+ import pandas as pd
+ from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+ import nltk
+ nltk.download('stopwords')
+ from nltk.corpus import stopwords
+ import pickle
+
+
+ def sort_coo(coo_matrix):
+     tuples = zip(coo_matrix.col, coo_matrix.data)
+     return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
+
+ def extract_topn_from_vector(feature_names, sorted_items, topn=10):
+     """get the feature names and tf-idf score of top n items"""
+
+     # use only topn items from vector
+     sorted_items = sorted_items[:topn]
+     score_vals = []
+     feature_vals = []
+
+     # word index and corresponding tf-idf score
+     for idx, score in sorted_items:
+         # keep track of feature name and its corresponding score
+         score_vals.append(round(score, 3))
+         feature_vals.append(feature_names[idx])
+
+     # create a dict of feature: score
+     # results = zip(feature_vals, score_vals)
+     results = {}
+     for idx in range(len(feature_vals)):
+         results[feature_vals[idx]] = score_vals[idx]
+
+     return results
+
+ def keyword_extraction(sdg: int, sdgdata):
+     model_path = "docStore/sdg{}/".format(sdg)
+     vectorizer = pickle.load(open(model_path + 'vectorizer.pkl', 'rb'))
+     tfidfmodel = pickle.load(open(model_path + 'tfidfmodel.pkl', 'rb'))
+     features = vectorizer.get_feature_names_out()
+     # weight the input text with the loaded models and return the top
+     # terms (assumed completion of the otherwise truncated function)
+     tf_idf_vector = tfidfmodel.transform(vectorizer.transform(sdgdata))
+     sorted_items = sort_coo(tf_idf_vector.tocoo())
+     return extract_topn_from_vector(features, sorted_items, 10)
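
A hedged usage sketch for the new module; the input paragraph and SDG id are illustrative only, and the return value assumes the {keyword: score} dict built by extract_topn_from_vector:

# Hypothetical usage sketch (illustrative input, not from this commit).
from utils.keyword_extraction import keyword_extraction

paragraphs = ["Ending poverty in all its forms everywhere is the first goal."]
keywords = keyword_extraction(1, paragraphs)   # loads docStore/sdg1/*.pkl
print(keywords)   # dict of top terms mapped to rounded tf-idf scores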
utils/preprocessing.py CHANGED
@@ -79,7 +79,7 @@ class FileConverter(BaseComponent):
              if file_name.endswith('.txt'):
                  converter = TextConverter(remove_numeric_tables=True)
              if file_name.endswith('.docx'):
-                 converter = DocxToTextConverter(remove_numeric_tables=True)
+                 converter = DocxToTextConverter()
          except Exception as e:
              logging.error(e)
              return
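
For context, a minimal sketch of how the .docx branch is presumably exercised downstream; the file path is illustrative, and the convert call follows haystack's v1 converter interface (an assumption about the pinned version):

# Hypothetical usage sketch: apply the converter chosen above to a file.
# 'report.docx' is an illustrative path, not from this commit.
from pathlib import Path
from haystack.nodes import DocxToTextConverter

converter = DocxToTextConverter()   # constructed without remove_numeric_tables, per this change
docs = converter.convert(file_path=Path("report.docx"), meta=None)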
utils/semantic_search.py CHANGED
@@ -188,6 +188,66 @@ def semanticSearchPipeline(documents:List[Document]):
      return semanticsearch_pipeline, document_store
 
 
+ def semanticsearchAnnotator(matches: List[List[int]], document):
+     """
+     Annotates the text in the document defined by a list of [start index, end index]
+     pairs. Example: for the text "How are you today", matches = [[0,3]] gives the
+     answer "How"; if the spacy matcher was used, matches = [[0,3]] gives
+     "How are you", and spacy finding "How" yields matches = [[0,1]] for the
+     same string.
+     """
+     start = 0
+     annotated_text = ""
+     for match in matches:
+         start_idx = match[0]
+         end_idx = match[1]
+         if check_streamlit():
+             annotated_text = (annotated_text + document[start:start_idx]
+                               + str(annotation(body=document[start_idx:end_idx],
+                                     label="Context", background="#964448", color='#ffffff')))
+         else:
+             annotated_text = (annotated_text + document[start:start_idx]
+                               + colored(document[start_idx:end_idx],
+                                         "green", attrs=['bold']))
+         start = end_idx
+
+     annotated_text = annotated_text + document[end_idx:]
+
+     if check_streamlit():
+         st.write(
+             markdown(annotated_text),
+             unsafe_allow_html=True,
+         )
+     else:
+         print(annotated_text)
+
+
+ def semantic_search(query: Text, documents: List[Document]):
+     """
+     Performs semantic search on the list of Haystack documents returned by the
+     preprocessing pipeline.
+
+     Params
+     -------
+     query: Keywords that need to be searched in documents.
+     documents: List of Haystack documents returned by the preprocessing pipeline.
+     """
+     semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents)
+     results = semanticsearch_pipeline.run(query=query)
+     st.markdown("##### Top few semantic search results #####")
+     for i, answer in enumerate(results['answers']):
+         temp = answer.to_dict()
+         start_idx = temp['offsets_in_document'][0]['start']
+         end_idx = temp['offsets_in_document'][0]['end']
+         match = [[start_idx, end_idx]]
+         doc = doc_store.get_document_by_id(temp['document_id']).content
+         st.write("Result {}".format(i+1))
+         semanticsearchAnnotator(match, doc)
+
+
 
      # if 'document_store' in st.session_state:
      #     document_store = st.session_state['document_store']
@@ -264,63 +324,4 @@ def semanticSearchPipeline(documents:List[Document]):
      # semanticsearch_pipeline.add_node(component = reader, name = "FARMReader",
      #                                  inputs= ["EmbeddingRetriever"])
 
- # return semanticsearch_pipeline, document_store
-
- def semanticsearchAnnotator(matches: List[List[int]], document):
-     """
-     Annotates the text in the document defined by a list of [start index, end index]
-     pairs. Example: for the text "How are you today", matches = [[0,3]] gives the
-     answer "How"; if the spacy matcher was used, matches = [[0,3]] gives
-     "How are you", and spacy finding "How" yields matches = [[0,1]] for the
-     same string.
-     """
-     start = 0
-     annotated_text = ""
-     for match in matches:
-         start_idx = match[0]
-         end_idx = match[1]
-         if check_streamlit():
-             annotated_text = (annotated_text + document[start:start_idx]
-                               + str(annotation(body=document[start_idx:end_idx],
-                                     label="ANSWER", background="#964448", color='#ffffff')))
-         else:
-             annotated_text = (annotated_text + document[start:start_idx]
-                               + colored(document[start_idx:end_idx],
-                                         "green", attrs=['bold']))
-         start = end_idx
-
-     annotated_text = annotated_text + document[end_idx:]
-
-     if check_streamlit():
-         st.write(
-             markdown(annotated_text),
-             unsafe_allow_html=True,
-         )
-     else:
-         print(annotated_text)
-
-
- def semantic_search(query: Text, documents: List[Document]):
-     """
-     Performs semantic search on the list of Haystack documents returned by the
-     preprocessing pipeline.
-
-     Params
-     -------
-     query: Keywords that need to be searched in documents.
-     documents: List of Haystack documents returned by the preprocessing pipeline.
-     """
-     semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents)
-     results = semanticsearch_pipeline.run(query=query)
-     st.markdown("##### Top few semantic search results #####")
-     for i, answer in enumerate(results['answers']):
-         temp = answer.to_dict()
-         start_idx = temp['offsets_in_document'][0]['start']
-         end_idx = temp['offsets_in_document'][0]['end']
-         match = [[start_idx, end_idx]]
-         doc = doc_store.get_document_by_id(temp['document_id']).content
-         st.write("Result {}".format(i+1))
-         semanticsearchAnnotator(match, doc)
+ # return semanticsearch_pipeline, document_store
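
With the two functions now live at module level, a hedged sketch of how semantic_search is presumably invoked from the Streamlit app; the query string and document content are illustrative:

# Hypothetical usage sketch (illustrative query and document).
from haystack.schema import Document
from utils.semantic_search import semantic_search

docs = [Document(content="Paragraph text from an uploaded policy document.")]
semantic_search("climate adaptation measures", docs)   # renders annotated results via st.write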