Spaces: GIZ

prashant committed
Commit a4bf4e8 (1 parent: 2bccbcb)

adding semantic search

appStore/keyword_search.py CHANGED
@@ -6,6 +6,7 @@ import streamlit as st
 import json
 import logging
 from utils.search import runLexicalPreprocessingPipeline, lexical_search
+from utils.search import runSemanticPreprocessingPipeline, semantic_search
 
 def app():
 
@@ -46,11 +47,13 @@ def app():
     else:
         keywordList = None
 
-    searchtype = st.selectbox("Do you want to find exact macthes or similar meaning/context", ['Exact Matches', 'Similar context/meaning'])
+    searchtype = st.selectbox("Do you want to find exact macthes or similar meaning/context",
+                              ['Exact Matches', 'Similar context/meaning'])
 
     with st.container():
         if keywordList is not None:
-            queryList = st.text_input("You selcted the {} category we will look for these keywords in document".format(genre),
+            queryList = st.text_input("You selcted the {} category we \
+                                       will look for these keywords in document".format(genre),
                                       value="{}".format(keywordList))
         else:
             queryList = st.text_input("Please enter here your question and we will look \
@@ -67,13 +70,19 @@ def app():
            logging.warning("Terminated as no keyword provided")
        else:
            if 'filepath' in st.session_state:
-                paraList = runLexicalPreprocessingPipeline()
+
 
                if searchtype == 'Exact Matches':
-                    # queryList = list(queryList.split(","))
+                    paraList = runLexicalPreprocessingPipeline()
                    logging.info("performing lexical search")
-                    # token_list = tokenize_lexical_query(queryList)
                    with st.spinner("Performing Exact matching search (Lexical search) for you"):
                        st.markdown("##### Top few lexical search (TFIDF) hits #####")
                        lexical_search(queryList,paraList)
+                else:
+                    paraList = runSemanticPreprocessingPipeline()
+                    logging.info("starting semantic search")
+                    with st.spinner("Performing Similar/Contextual search"):
+                        st.markdown("##### Top few semantic search results #####")
+                        semantic_search(queryList,paraList,show_answers=True)
+
 
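In short, app() now runs only the preprocessing pipeline that matches the selected search type and then hands the query to the corresponding search function. A minimal sketch of the resulting control flow, condensed from the diff above (widget labels shortened, keyword/genre handling omitted):

# Condensed sketch of the new branching in appStore/keyword_search.py after this commit.
import streamlit as st
from utils.search import (runLexicalPreprocessingPipeline, lexical_search,
                          runSemanticPreprocessingPipeline, semantic_search)

searchtype = st.selectbox("Search type", ['Exact Matches', 'Similar context/meaning'])
queryList = st.text_input("Enter your keywords or question")

if 'filepath' in st.session_state:
    if searchtype == 'Exact Matches':
        paraList = runLexicalPreprocessingPipeline()   # splits per [lexical_search] config
        lexical_search(queryList, paraList)            # TFIDF hits, annotated in the page
    else:
        paraList = runSemanticPreprocessingPipeline()  # splits per [semantic_search] config
        semantic_search(queryList, paraList, show_answers=True)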
appStore/sdg_analysis.py CHANGED
@@ -47,7 +47,7 @@ def app():
     if 'filepath' in st.session_state:
         paraList = runSDGPreprocessingPipeline()
         if len(paraList) > 150:
-            warning_msg = ": This might take some, please sit back and relax."
+            warning_msg = ": This might take sometime, please sit back and relax."
         else:
             warning_msg = ""
 
paramconfig.cfg CHANGED
@@ -6,9 +6,13 @@ SPLIT_LENGTH = 3
 SPLIT_OVERLAP = 0
 
 [semantic_search]
-TOP_K = 10
+RETRIEVER_TOP_K = 10
 MAX_SEQ_LENGTH = 64
-MODEL_NAME = msmarco-distilbert-cos-v5
+RETRIEVER = msmarco-bert-base-dot-v5
+RETRIEVER_FORMAT = sentence_transformers
+RETRIEVER_EMB_LAYER = -1
+READER = deepset/tinyroberta-squad2
+READER_TOP_K = 5
 THRESHOLD = 0.1
 SPLIT_BY = sentence
 SPLIT_LENGTH = 3
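The renamed and added keys split the semantic-search settings into a retriever half and a reader half; utils/search.py reads them with configparser. A quick sketch of how the new keys are consumed (run from the repo root, where paramconfig.cfg lives; the values in comments are the ones from this commit):

import configparser

config = configparser.ConfigParser()
config.read_file(open('paramconfig.cfg'))

retriever_model  = config.get('semantic_search', 'RETRIEVER')           # msmarco-bert-base-dot-v5
retriever_format = config.get('semantic_search', 'RETRIEVER_FORMAT')    # sentence_transformers
emb_layer        = int(config.get('semantic_search', 'RETRIEVER_EMB_LAYER'))  # -1
retriever_top_k  = int(config.get('semantic_search', 'RETRIEVER_TOP_K'))      # 10
reader_model     = config.get('semantic_search', 'READER')              # deepset/tinyroberta-squad2
reader_top_k     = int(config.get('semantic_search', 'READER_TOP_K'))   # 5
print(retriever_model, reader_model, retriever_top_k, reader_top_k)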
utils/search.py CHANGED
@@ -1,4 +1,6 @@
-from haystack.nodes import TfidfRetriever
+from haystack.nodes import TfidfRetriever, TransformersQueryClassifier
+from haystack.nodes import EmbeddingRetriever, FARMReader
+from haystack.nodes.base import BaseComponent
 from haystack.document_stores import InMemoryDocumentStore
 import configparser
 import spacy
@@ -8,8 +10,9 @@ import streamlit as st
 from markdown import markdown
 from annotated_text import annotation
 from haystack.schema import Document
-from typing import List, Tuple, Text
+from typing import List, Text
 from utils.preprocessing import processingpipeline
+from haystack.pipelines import Pipeline
 
 config = configparser.ConfigParser()
 config.read_file(open('paramconfig.cfg'))
@@ -142,7 +145,7 @@ def lexical_search(query:Text,documents:List[Document]):
         # if result.content != "":
         matches, doc = runSpacyMatcher(query_tokens,result.content)
         if len(matches) != 0:
-            st.write("Result {}".format(count))
+            st.write("Result {}".format(count+1))
             searchAnnotator(matches, doc)
 
 def runLexicalPreprocessingPipeline()->List[Document]:
@@ -153,19 +156,19 @@ def runLexicalPreprocessingPipeline()->List[Document]:
     Return
     --------------
     List[Document]: When preprocessing pipeline is run, the output dictionary
-    has four objects. For the Haysatck implementation of SDG classification we,
+    has four objects. For the lexicaal search using TFIDFRetriever we
     need to use the List of Haystack Document, which can be fetched by
     key = 'documents' on output.
 
     """
     file_path = st.session_state['filepath']
     file_name = st.session_state['filename']
-    sdg_processing_pipeline = processingpipeline()
+    lexical_processing_pipeline = processingpipeline()
     split_by = config.get('lexical_search','SPLIT_BY')
    split_length = int(config.get('lexical_search','SPLIT_LENGTH'))
    split_overlap = int(config.get('lexical_search','SPLIT_OVERLAP'))
 
-    output_lexical_pre = sdg_processing_pipeline.run(file_paths = file_path,
+    output_lexical_pre = lexical_processing_pipeline.run(file_paths = file_path,
                             params= {"FileConverter": {"file_path": file_path, \
                                         "file_name": file_name},
                                      "UdfPreProcessor": {"removePunc": False, \
@@ -183,19 +186,19 @@ def runSemanticPreprocessingPipeline()->List[Document]:
     Return
     --------------
     List[Document]: When preprocessing pipeline is run, the output dictionary
-    has four objects. For the Haysatck implementation of SDG classification we,
+    has four objects. For the Haysatck implementation of semantic search we,
     need to use the List of Haystack Document, which can be fetched by
     key = 'documents' on output.
 
     """
     file_path = st.session_state['filepath']
     file_name = st.session_state['filename']
-    sdg_processing_pipeline = processingpipeline()
-    split_by = config.get('lexical_search','SPLIT_BY')
-    split_length = int(config.get('lexical_search','SPLIT_LENGTH'))
-    split_overlap = int(config.get('lexical_search','SPLIT_OVERLAP'))
+    semantic_processing_pipeline = processingpipeline()
+    split_by = config.get('semantic_search','SPLIT_BY')
+    split_length = int(config.get('semantic_search','SPLIT_LENGTH'))
+    split_overlap = int(config.get('semantic_search','SPLIT_OVERLAP'))
 
-    output_lexical_pre = sdg_processing_pipeline.run(file_paths = file_path,
+    output_semantic_pre = semantic_processing_pipeline.run(file_paths = file_path,
                             params= {"FileConverter": {"file_path": file_path, \
                                         "file_name": file_name},
                                      "UdfPreProcessor": {"removePunc": False, \
@@ -203,4 +206,108 @@ def runSemanticPreprocessingPipeline()->List[Document]:
                                      "split_length":split_length,\
                                      "split_overlap": split_overlap}})
 
-    return output_lexical_pre['documents']
+    return output_semantic_pre['documents']
+
+class QueryCheck(BaseComponent):
+
+    outgoing_edges = 1
+
+    def run(self, query):
+
+        query_classifier = TransformersQueryClassifier(model_name_or_path=
+                            "shahrukhx01/bert-mini-finetune-question-detection")
+
+        result = query_classifier.run(query=query)
+
+        if result[1] == "output_1":
+            output = {"query":query,
+                      "query_type": 'question/statement'}
+        else:
+            output = {"query": "find all issues related to {}".format(query),
+                      "query_type": 'statements/keyword'}
+
+        return output, "output_1"
+
+    def run_batch(self, query):
+        pass
+
+
+def semanticSearchPipeline(documents, show_answers = False):
+    document_store = InMemoryDocumentStore()
+    document_store.write_documents(documents)
+
+    embedding_model = config.get('semantic_search','RETRIEVER')
+    embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
+    embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
+    retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
+
+    querycheck = QueryCheck()
+    retriever = EmbeddingRetriever(
+        document_store=document_store,
+        embedding_model=embedding_model,top_k = retriever_top_k,
+        emb_extraction_layer=embedding_layer, scale_score =True,
+        model_format=embedding_model_format, use_gpu = True)
+    document_store.update_embeddings(retriever)
+
+    semanticsearch_pipeline = Pipeline()
+    semanticsearch_pipeline.add_node(component = querycheck, name = "QueryCheck",
+                                     inputs = ["Query"])
+    semanticsearch_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
+                                     inputs = ["QueryCheck.output_1"])
+    if show_answers == True:
+        reader_model = config.get('semantic_search','READER')
+        reader_top_k = retriever_top_k
+        reader = FARMReader(model_name_or_path=reader_model,
+                            top_k = reader_top_k, use_gpu=True)
+
+        semanticsearch_pipeline.add_node(component = reader, name = "FARMReader",
+                                         inputs= ["EmbeddingRetriever"])
+
+    return semanticsearch_pipeline, document_store
+
+def semantic_search(query:Text,documents:List[Document],show_answers = False):
+    """
+    Performs the Lexical search on the List of haystack documents which is
+    returned by preprocessing Pipeline.
+    """
+    threshold = 0.4
+    semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents,
+                                            show_answers=show_answers)
+    results = semanticsearch_pipeline.run(query = query)
+
+    if show_answers == False:
+        results = results['documents']
+        for i,queryhit in enumerate(results):
+
+            if queryhit.score > threshold:
+                st.write("\t {}: \t {}".format(i+1, queryhit.content.replace("\n", " ")))
+                st.markdown("---")
+
+    else:
+        matches = []
+        doc = []
+        for answer in results['answers']:
+            if answer.score >0.01:
+                temp = answer.to_dict()
+                start_idx = temp['offsets_in_document'][0]['start']
+                end_idx = temp['offsets_in_document'][0]['end']
+
+                matches.append([start_idx,end_idx])
+                doc.append(doc_store.get_document_by_id(temp['document_id']).content)
+        searchAnnotator(matches,doc)
+
+    return results
+
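Taken together, the new code routes a query through QueryCheck (which rewrites keyword-style queries using TransformersQueryClassifier), an EmbeddingRetriever over an in-memory document store, and, when show_answers is set, a FARMReader for extractive answers. A hedged usage sketch of the new helpers, meant to run inside the Streamlit app (semantic_search writes its output with st.write and searchAnnotator); the sample file path below is hypothetical, and in the app these session keys are set by the upload step:

import streamlit as st
from utils.search import runSemanticPreprocessingPipeline, semantic_search

# Assumption: normally populated by the app's file-upload page, not set by hand.
st.session_state['filepath'] = 'sample/policy_document.pdf'   # hypothetical path
st.session_state['filename'] = 'policy_document.pdf'

paraList = runSemanticPreprocessingPipeline()      # sentence-level Haystack Documents
results = semantic_search("What adaptation measures are planned?",
                          paraList, show_answers=True)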