prashant committed
Commit 7d78a3b
Parent: f59362a

semantic updates
appStore/keyword_search.py CHANGED

@@ -7,6 +7,21 @@ import json
 import logging
 from utils.lexical_search import runLexicalPreprocessingPipeline, lexical_search
 from utils.semantic_search import runSemanticPreprocessingPipeline, semantic_search
+from utils.checkconfig import getconfig
+
+# Declare all the necessary variables
+config = getconfig('paramconfig.cfg')
+split_by = config.get('semantic_search','SPLIT_BY')
+split_length = int(config.get('semantic_search','SPLIT_LENGTH'))
+split_overlap = int(config.get('semantic_search','SPLIT_OVERLAP'))
+split_respect_sentence_boundary = bool(int(config.get('semantic_search','RESPECT_SENTENCE_BOUNDARY')))
+remove_punc = bool(int(config.get('semantic_search','REMOVE_PUNC')))
+embedding_model = config.get('semantic_search','RETRIEVER')
+embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
+embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
+retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
+reader_model = config.get('semantic_search','READER')
+reader_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))

 def app():

@@ -77,19 +92,26 @@ def app():


     if searchtype == 'Exact Matches':
-        allDocuments = runLexicalPreprocessingPipeline(
-                            st.session_state['filepath'],
-                            st.session_state['filename'])
-        logging.info("performing lexical search")
-        with st.spinner("Performing Exact matching search \
-                        (Lexical search) for you"):
-            st.markdown("##### Top few lexical search (TFIDF) hits #####")
-            lexical_search(queryList,allDocuments['documents'])
+        # allDocuments = runLexicalPreprocessingPipeline(
+        #                     st.session_state['filepath'],
+        #                     st.session_state['filename'])
+        # logging.info("performing lexical search")
+        # with st.spinner("Performing Exact matching search \
+        #                 (Lexical search) for you"):
+        #     st.markdown("##### Top few lexical search (TFIDF) hits #####")
+        #     lexical_search(queryList,allDocuments['documents'])
+        pass
     else:
         allDocuments = runSemanticPreprocessingPipeline(
-                            st.session_state['filepath'],
-                            st.session_state['filename'])
+                            file_path= st.session_state['filepath'],
+                            file_name = st.session_state['filename'],
+                            split_by=split_by,
+                            split_length= split_length,
+                            split_overlap=split_overlap,
+                            removePunc= remove_punc,
+                            split_respect_sentence_boundary=split_respect_sentence_boundary)

         logging.info("starting semantic search")
         with st.spinner("Performing Similar/Contextual search"):
             semantic_search(queryList,allDocuments['documents'])
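The new module-level block reads every tunable from paramconfig.cfg once, at import time. Below is a minimal sketch of the pattern, assuming getconfig is a thin wrapper around the standard-library configparser (utils/checkconfig.py itself is not shown in this diff). Since INI values are always strings, numeric and boolean options need explicit casts; the retriever/reader values loaded here are presumably meant to be passed into the reworked semantic_search signature shown further down.

import configparser

def getconfig(path):
    # Sketch only: assumes getconfig wraps ConfigParser; the real
    # utils/checkconfig.py is not part of this diff.
    config = configparser.ConfigParser()
    config.read(path)   # yields an empty parser if the file is missing
    return config

config = getconfig('paramconfig.cfg')
# INI values are strings, hence the casts:
split_length = int(config.get('semantic_search', 'SPLIT_LENGTH'))      # "3" -> 3
# 0/1 flags must pass through int() first, since bool("0") is True:
remove_punc = bool(int(config.get('semantic_search', 'REMOVE_PUNC')))  # "0" -> False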
paramconfig.cfg CHANGED

@@ -16,6 +16,8 @@ THRESHOLD = 0.1
 SPLIT_BY = sentence
 SPLIT_LENGTH = 3
 SPLIT_OVERLAP = 0
+RESPECT_SENTENCE_BOUNDARY = 1
+REMOVE_PUNC = 0

 [sdg]
 THRESHOLD = 0.85
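The hunk above only shows the two new keys in context. For orientation, the [semantic_search] section that appStore/keyword_search.py now reads would look roughly like this; the model and layer values below are placeholders inferred from the config.get calls, not the repository's actual settings:

[semantic_search]
THRESHOLD = 0.1
SPLIT_BY = sentence
SPLIT_LENGTH = 3
SPLIT_OVERLAP = 0
RESPECT_SENTENCE_BOUNDARY = 1
REMOVE_PUNC = 0
; placeholder values -- the real ones are not visible in this diff:
RETRIEVER = sentence-transformers/msmarco-distilbert-cos-v5
RETRIEVER_FORMAT = sentence_transformers
RETRIEVER_EMB_LAYER = -1
RETRIEVER_TOP_K = 10
READER = deepset/tinyroberta-squad2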
utils/sdg_classifier.py CHANGED

@@ -2,7 +2,6 @@ from haystack.nodes import TransformersDocumentClassifier
 from haystack.schema import Document
 from typing import List, Tuple
 from typing_extensions import Literal
-import configparser
 import logging
 import pandas as pd
 from pandas import DataFrame, Series
utils/semantic_search.py CHANGED

@@ -2,11 +2,11 @@ from haystack.nodes import TransformersQueryClassifier
 from haystack.nodes import EmbeddingRetriever, FARMReader
 from haystack.nodes.base import BaseComponent
 from haystack.document_stores import InMemoryDocumentStore
-import configparser
 from markdown import markdown
 from annotated_text import annotation
 from haystack.schema import Document
 from typing import List, Text
+from typing_extensions import Literal
 from utils.preprocessing import processingpipeline
 from utils.streamlitcheck import check_streamlit
 from haystack.pipelines import Pipeline

@@ -19,16 +19,15 @@ try:
     import streamlit as st
 except ImportError:
     logging.info("Streamlit not installed")
-config = configparser.ConfigParser()
-try:
-    config.read_file(open('paramconfig.cfg'))
-except Exception:
-    logging.info("paramconfig file not found")
-    st.info("Please place the paramconfig file in the same directory as app.py")


 @st.cache(allow_output_mutation=True)
 def loadQueryClassifier():
+    """
+    returns the haystack query classifier model,
+    model = shahrukhx01/bert-mini-finetune-question-detection
+
+    """
     query_classifier = TransformersQueryClassifier(model_name_or_path=
                         "shahrukhx01/bert-mini-finetune-question-detection")
     return query_classifier
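Every loader in this file is wrapped in Streamlit's st.cache (the legacy caching API, current when this commit was made). A minimal sketch of that pattern and why its two arguments matter here; the model class below is a stand-in, not the app's code:

import streamlit as st

# hash_funcs maps types st.cache cannot hash to a surrogate value; sending
# builtins.SwigPyObject (typically pulled in through faiss/SWIG handles) to a
# constant makes Streamlit skip it instead of raising UnhashableTypeError.
# allow_output_mutation=True returns the cached object without re-hashing it,
# so a large mutable model is loaded once per session and then reused.
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
          allow_output_mutation=True)
def load_model(name: str):
    class DummyModel:                  # stand-in for a transformer model
        def __init__(self, name):
            self.name = name
    return DummyModel(name)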
@@ -63,8 +62,12 @@ class QueryCheck(BaseComponent):
     def run_batch(self, query):
         pass

-
-def runSemanticPreprocessingPipeline(file_path, file_name)->List[Document]:
+@st.cache(allow_output_mutation=True)
+def runSemanticPreprocessingPipeline(file_path, file_name,
+                    split_by: Literal["sentence", "word"] = 'sentence',
+                    split_respect_sentence_boundary = False,
+                    split_length:int = 2, split_overlap = 0,
+                    removePunc = False)->List[Document]:
     """
     creates the pipeline and runs the preprocessing pipeline,
     the params for pipeline are fetched from paramconfig

@@ -76,6 +79,12 @@ def runSemanticPreprocessingPipeline(file_path, file_name)->List[Document]:
                 st.session_state['filename']
     file_path: filepath, in case of streamlit application use
                 st.session_state['filepath']
+    removePunc: whether to remove all punctuation, including ',' and '.'
+    split_by: document splitting strategy, either 'word' or 'sentence'
+    split_length: when synthetically creating paragraphs from the document,
+                  defines the length of a paragraph.
+    split_respect_sentence_boundary: used with the 'word' splitting strategy
+                  to avoid splitting inside sentences.

     Return
     --------------

@@ -87,61 +96,90 @@ def runSemanticPreprocessingPipeline(file_path, file_name)->List[Document]:
     """

     semantic_processing_pipeline = processingpipeline()
-    split_by = config.get('semantic_search','SPLIT_BY')
-    split_length = int(config.get('semantic_search','SPLIT_LENGTH'))
-    split_overlap = int(config.get('semantic_search','SPLIT_OVERLAP'))

     output_semantic_pre = semantic_processing_pipeline.run(file_paths = file_path,
                     params= {"FileConverter": {"file_path": file_path, \
                                 "file_name": file_name},
-                             "UdfPreProcessor": {"removePunc": False, \
+                             "UdfPreProcessor": {"removePunc": removePunc, \
                                 "split_by": split_by, \
                                 "split_length":split_length,\
-                                "split_overlap": split_overlap}})
+                                "split_overlap": split_overlap,
+                                "split_respect_sentence_boundary":split_respect_sentence_boundary}})

     return output_semantic_pre


 @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
-def loadRetriever(embedding_model = None, embedding_model_format = None,
-                  embedding_layer = None, retriever_top_k = 10, document_store = None):
+def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = None,
+                  embedding_layer:int = None, retriever_top_k:int = 10,
+                  document_store:InMemoryDocumentStore = None):
+    """
+    Returns the Retriever model based on the params provided.
+    1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
+    2. https://www.sbert.net/examples/applications/semantic-search/README.html
+    3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
+
+    Params
+    ---------
+    embedding_model: Name of the model to be used for embedding. Check the links
+                     provided in documentation
+    embedding_model_format: check the github link of Haystack provided in documentation
+    embedding_layer: check the github link of Haystack provided in documentation
+    retriever_top_k: Number of top results to be returned by retriever
+    document_store: InMemoryDocumentStore; write the haystack Document list to the
+                    DocumentStore and pass it to this call. Can be done using
+                    createDocumentStore from utils.
+
+    Return
+    -------
+    retriever: embedding model
+    """
     logging.info("loading retriever")
     if document_store is None:
         logging.warning("Retriever initialization requires the DocumentStore")
         return
-
-
-    if embedding_model is None:
-        try:
-            embedding_model = config.get('semantic_search','RETRIEVER')
-            embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
-            embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
-            retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
-        except Exception as e:
-            logging.info(e)
-            st.info(e)

     retriever = EmbeddingRetriever(
         embedding_model=embedding_model,top_k = retriever_top_k,
         document_store = document_store,
         emb_extraction_layer=embedding_layer, scale_score =True,
         model_format=embedding_model_format, use_gpu = True)
-    st.session_state['retriever'] = retriever
+    if check_streamlit():
+        st.session_state['retriever'] = retriever
     return retriever

 @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
 def createDocumentStore(documents:List[Document], similarity:str = 'cosine'):
+    """
+    Creates the InMemory Document Store from the haystack list of Documents.
+    It is a mandatory component for the Retriever to work in the Haystack framework.
+
+    Params
+    -------
+    documents: List of haystack documents. If using the preprocessing pipeline,
+               they can be fetched with key = 'documents' from its output.
+    similarity: scoring function, can be either 'cosine' or 'dot_product'
+
+    Return
+    -------
+    document_store: InMemory Document Store object type.
+
+    """
     document_store = InMemoryDocumentStore(similarity = similarity)
     document_store.write_documents(documents)
-    if 'retriever' in st.session_state:
-        retriever = st.session_state['retriever']
-        document_store.update_embeddings(retriever)
+    # if check_streamlit():
+    #     if 'retriever' in st.session_state:
+    #         retriever = st.session_state['retriever']
+    #         document_store.update_embeddings(retriever)

     return document_store


 @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
-def semanticSearchPipeline(documents:List[Document]):
+def semanticSearchPipeline(documents:List[Document], embedding_model:Text = None,
+                           embedding_model_format:Text = None,
+                           embedding_layer:int = None, retriever_top_k:int = 10,
+                           reader_model:str = None, reader_top_k:int = 10):
     """
     creates the semantic search pipeline and document Store object from the
     list of haystack documents. Retriever and Reader model are read from

@@ -149,32 +187,66 @@ def semanticSearchPipeline(documents:List[Document]):
     all the results returned by Retriever are used, however the context is
     extracted by Reader for each retrieved result. The querycheck is added as
     node to process the query.
+    1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
+    2. https://www.sbert.net/examples/applications/semantic-search/README.html
+    3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
+    4. https://docs.haystack.deepset.ai/docs/reader
+


     Params
     ----------
     documents: list of Haystack Documents, returned by preprocessing pipeline.
+    embedding_model: Name of the model to be used for embedding. Check the links
+                     provided in documentation
+    embedding_model_format: check the github link of Haystack provided in documentation
+    embedding_layer: check the github link of Haystack provided in documentation
+    retriever_top_k: Number of top results to be returned by retriever
+    reader_model: Name of the model to be used for the Reader node in the haystack
+                  Pipeline. Check the links provided in documentation
+    reader_top_k: Reader will use retrieved results to further find better matches.
+                  As the purpose here is to use the reader to extract context, the
+                  value is kept the same as retriever_top_k.

     Return
     ---------
     semanticsearch_pipeline: Haystack Pipeline object, with all the necessary
                              nodes [QueryCheck, Retriever, Reader]

-    document_store: As retriever cna work only with Haystack Document Store, the
+    document_store: As retriever can work only with Haystack Document Store, the
                     list of document returned by preprocessing pipeline.

     """
     document_store = createDocumentStore(documents)
-    retriever = loadRetriever(document_store=document_store)
+    if check_streamlit() and 'retriever' in st.session_state:
+        retriever = st.session_state['retriever']
+    else:
+        if embedding_model:
+            retriever = loadRetriever(embedding_model = embedding_model,
+                            embedding_model_format=embedding_model_format,
+                            embedding_layer=embedding_layer,
+                            retriever_top_k= retriever_top_k,
+                            document_store = document_store)
+        else:
+            logging.warning("no cached retriever found and no embedding model \
+                            provided")
+            return
+
     document_store.update_embeddings(retriever)
+    retriever.document_store = document_store
     querycheck = QueryCheck()
-    if 'reader' in st.session_state:
-        reader = st.session_state['reader']
-    else:
-        reader_model = config.get('semantic_search','READER')
-        reader_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
-        reader = FARMReader(model_name_or_path=reader_model,
-                            top_k = reader_top_k, use_gpu=True)
-        st.session_state['reader'] = reader
+    if check_streamlit() and 'reader' in st.session_state:
+        reader = st.session_state['reader']
+    else:
+        reader = FARMReader(model_name_or_path=reader_model,
+                            top_k = reader_top_k, use_gpu=True)
+        if check_streamlit():
+            st.session_state['reader'] = reader

     semanticsearch_pipeline = Pipeline()

@@ -224,7 +296,10 @@ def semanticsearchAnnotator(matches: List[List[int]], document):
     print(annotated_text)


-def semantic_search(query:Text,documents:List[Document]):
+def semantic_search(query:Text,documents:List[Document],embedding_model:Text,
+                    embedding_model_format:Text,
+                    embedding_layer:int, reader_model:str,
+                    retriever_top_k:int = 10, reader_top_k:int = 10):
     """
     Performs the Semantic search on the List of haystack documents which is
     returned by preprocessing Pipeline.

@@ -235,9 +310,19 @@ def semantic_search(query:Text,documents:List[Document]):
     documents: List of Haystack documents returned by preprocessing pipeline.

     """
-    semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents)
+    semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents,
+                        embedding_model= embedding_model,
+                        embedding_layer= embedding_layer,
+                        embedding_model_format= embedding_model_format,
+                        reader_model= reader_model, retriever_top_k= retriever_top_k,
+                        reader_top_k= reader_top_k)
+
     results = semanticsearch_pipeline.run(query = query)
-    st.markdown("##### Top few semantic search results #####")
+
+    if check_streamlit():
+        st.markdown("##### Top few semantic search results #####")
+    else:
+        print("Top few semantic search results")
     for i,answer in enumerate(results['answers']):
         temp = answer.to_dict()
         start_idx = temp['offsets_in_document'][0]['start']
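Taken together, the refactor moves every paramconfig.cfg read out of utils/semantic_search.py and into the caller. Below is a hedged sketch of the resulting call flow, using illustrative placeholders for the file, query, and model names; note that semantic_search's new signature expects exactly the retriever/reader values that appStore/keyword_search.py now loads at module level.

from utils.semantic_search import (runSemanticPreprocessingPipeline,
                                   semantic_search)

# Placeholder inputs -- not taken from the repository.
allDocuments = runSemanticPreprocessingPipeline(
                    file_path='sample.pdf', file_name='sample.pdf',
                    split_by='sentence', split_length=3, split_overlap=0,
                    removePunc=False, split_respect_sentence_boundary=True)

# Retriever/Reader settings now travel explicitly instead of being read from
# paramconfig.cfg inside utils; the model names below are placeholders.
semantic_search('what are the mitigation targets?',
                allDocuments['documents'],
                embedding_model='sentence-transformers/msmarco-distilbert-cos-v5',
                embedding_model_format='sentence_transformers',
                embedding_layer=-1,
                reader_model='deepset/tinyroberta-squad2',
                retriever_top_k=10, reader_top_k=10)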