leavoigt committed on
Commit
a29c372
1 Parent(s): 9acfc38

Delete utils/semantic_search.py

Files changed (1)
  1. utils/semantic_search.py +0 -582
utils/semantic_search.py DELETED
@@ -1,582 +0,0 @@
from haystack.nodes import TransformersQueryClassifier, Docs2Answers
from haystack.nodes import EmbeddingRetriever, FARMReader
from haystack.nodes.base import BaseComponent
from haystack.document_stores import InMemoryDocumentStore
from markdown import markdown
from annotated_text import annotation
from haystack.schema import Document
from typing import List, Text, Union
from typing_extensions import Literal
from utils.preprocessing import processingpipeline
from utils.streamlitcheck import check_streamlit
from haystack.pipelines import Pipeline
import pandas as pd
import logging
try:
    from termcolor import colored
except ImportError:
    pass
try:
    import streamlit as st
except ImportError:
    logging.info("Streamlit not installed")


@st.cache(allow_output_mutation=True)
def loadQueryClassifier():
    """
    Returns the Haystack query classifier model.
    model = shahrukhx01/bert-mini-finetune-question-detection

    """
    query_classifier = TransformersQueryClassifier(model_name_or_path=
                        "shahrukhx01/bert-mini-finetune-question-detection")
    return query_classifier
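
# Usage sketch (illustrative; the example query is not from the original file).
# The classifier routes question/statement-like queries to "output_1" and
# keyword-like queries to "output_2":
#
#   classifier = loadQueryClassifier()
#   _, edge = classifier.run(query="What are the water related issues?")
#   # edge == "output_1" for questions, "output_2" for bare keywords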

class QueryCheck(BaseComponent):
    """
    Uses the Query Classifier from Haystack to process the query based on its
    type. The model's ability to detect statements is not very good, so
    statements may also get modified. Ex: "List water related issues" will be
    identified by the model as keywords, and therefore be processed into "what
    are the 'List water related issues' related issues and discussions?".
    This is a shortcoming, but it is ignored for now, as semantic search will
    not be affected much by it. If you want to pass a list of keywords and do
    batch processing, use run_batch. Example: if you want to find relevant
    passages for water, food security and poverty, set querylist = ["water",
    "food security", "poverty"] and then execute
    QueryCheck.run_batch(queries=querylist).

    1. https://docs.haystack.deepset.ai/docs/query_classifier

    """

    outgoing_edges = 1

    def run(self, query:str):
        """
        Mandatory method to use the custom node. Determines the query type;
        if the query is of type keyword/statement, it is modified to make it
        more useful for sentence transformers.

        Params
        --------
        query: query/statement/keywords in form of string

        Return
        ------
        output: dictionary, with key as identifier and value could be anything
            we need to return. In this case the output contains key = 'query'.

        output_1: as there is only one outgoing edge, we pass the 'output_1' string.

        """
        query_classifier = loadQueryClassifier()
        result = query_classifier.run(query=query)

        if result[1] == "output_1":
            output = {"query": query,
                      "query_type": 'question/statement'}
        else:
            output = {"query": "what are the {} related issues and "
                               "discussions?".format(query),
                      "query_type": 'statements/keyword'}
        logging.info(output)
        return output, "output_1"

    def run_batch(self, queries:List[str]):
        """
        Runs multiple queries in one go; however, the queries need to be
        passed as a list of strings. Example: if you want to find relevant
        passages for water, food security and poverty, set
        querylist = ["water", "food security", "poverty"] and then execute
        QueryCheck.run_batch(queries=querylist).

        Params
        --------
        queries: queries/statements/keywords in form of strings encapsulated
            within a List

        Return
        ------
        output: dictionary, with key as identifier and value could be anything
            we need to return. In this case the output contains key = 'queries'.

        output_1: as there is only one outgoing edge, we pass the 'output_1' string.
        """
        query_classifier = loadQueryClassifier()
        query_list = []
        for query in queries:
            result = query_classifier.run(query=query)
            if result[1] == "output_1":
                query_list.append(query)
            else:
                query_list.append("what are the {} related issues and "
                                  "discussions?".format(query))
        output = {'queries': query_list}
        logging.info(output)
        return output, "output_1"
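
# Usage sketch (illustrative, not part of the original file): batch-process a
# keyword list so each keyword gets rewritten into a question-style query.
#
#   querycheck = QueryCheck()
#   output, _ = querycheck.run_batch(queries=["water", "food security", "poverty"])
#   # output['queries'] holds the (possibly rewritten) queries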


@st.cache(allow_output_mutation=True)
def runSemanticPreprocessingPipeline(file_path:str, file_name:str,
                split_by: Literal["sentence", "word"] = 'sentence',
                split_length:int = 2, split_overlap:int = 0,
                split_respect_sentence_boundary:bool = False,
                remove_punc:bool = False)->List[Document]:
    """
    Creates the pipeline and runs the preprocessing pipeline.

    Params
    ------------

    file_name: filename, in case of streamlit application use
            st.session_state['filename']
    file_path: filepath, in case of streamlit application use
            st.session_state['filepath']
    split_by: document splitting strategy, either as word or sentence
    split_length: when synthetically creating the paragraphs from the
            document, defines the length of each paragraph.
    split_overlap: number of words or sentences that overlap when creating
            the paragraphs. This is done because one sentence or a few words
            often only make sense when read together with their neighbours;
            therefore the overlap is used.
    split_respect_sentence_boundary: used with the 'word' splitting strategy
            to avoid cutting through sentences.
    remove_punc: whether or not to remove all punctuation, including ',' and '.'

    Return
    --------------
    List[Document]: when the preprocessing pipeline is run, the output
        dictionary has four objects. For the Haystack implementation of
        semantic search we need the list of Haystack Documents, which can be
        fetched with key = 'documents' on the output.

    """

    semantic_processing_pipeline = processingpipeline()

    output_semantic_pre = semantic_processing_pipeline.run(file_paths = file_path,
                            params= {"FileConverter": {"file_path": file_path,
                                            "file_name": file_name},
                                     "UdfPreProcessor": {"remove_punc": remove_punc,
                                            "split_by": split_by,
                                            "split_length": split_length,
                                            "split_overlap": split_overlap,
                                            "split_respect_sentence_boundary": split_respect_sentence_boundary}})

    return output_semantic_pre
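
# Usage sketch (illustrative; the file path and name are hypothetical):
#
#   output = runSemanticPreprocessingPipeline(file_path='docs/report.pdf',
#                                             file_name='report.pdf',
#                                             split_by='sentence',
#                                             split_length=2)
#   paragraphs = output['documents']   # List[Document] for the search pipeline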


@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
          allow_output_mutation=True)
def loadRetriever(embedding_model:Text=None, embedding_model_format:Text = None,
                embedding_layer:int = None, retriever_top_k:int = 10,
                max_seq_len:int=512, document_store:InMemoryDocumentStore=None):
    """
    Returns the Retriever model based on the params provided.
    1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
    2. https://www.sbert.net/examples/applications/semantic-search/README.html
    3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py


    Params
    ---------
    embedding_model: name of the model to be used for embedding. Check the
        links provided in the documentation.
    embedding_model_format: check the Haystack GitHub link provided in the
        documentation.
    embedding_layer: check the Haystack GitHub link provided in the
        documentation.
    retriever_top_k: number of top results to be returned by the retriever.
    max_seq_len: every model has a maximum sequence length it can handle;
        check the model card. Needed to handle the edge cases.
    document_store: InMemoryDocumentStore; write the Haystack Document list
        to the DocumentStore and pass it to the function call. Can be done
        using createDocumentStore from utils.

    Return
    -------
    retriever: embedding model
    """
    logging.info("loading retriever")
    if document_store is None:
        logging.warning("Retriever initialization requires the DocumentStore")
        return

    retriever = EmbeddingRetriever(
                embedding_model=embedding_model, top_k=retriever_top_k,
                document_store=document_store,
                emb_extraction_layer=embedding_layer, scale_score=True,
                model_format=embedding_model_format, use_gpu=True,
                max_seq_len=max_seq_len)
    if check_streamlit():
        st.session_state['retriever'] = retriever
    return retriever

@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
          allow_output_mutation=True)
def createDocumentStore(documents:List[Document], similarity:str = 'dot_product',
                        embedding_dim:int = 768):
    """
    Creates the InMemory Document Store from a Haystack list of Documents.
    It is a mandatory component for the Retriever to work in the Haystack
    framework.

    Params
    -------
    documents: list of Haystack Documents. If using the preprocessing
        pipeline, can be fetched with key = 'documents' on the output of the
        preprocessing pipeline.
    similarity: scoring function, can be either 'cosine' or 'dot_product'
    embedding_dim: the Document Store has a default embedding size of 768,
        and the update_embeddings method of the DocStore cannot infer the
        embedding size of the retriever automatically, therefore set this
        value as per the model card.

    Return
    -------
    document_store: InMemory Document Store object type.

    """
    document_store = InMemoryDocumentStore(similarity=similarity,
                                           embedding_dim=embedding_dim)
    document_store.write_documents(documents)

    return document_store
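
# Usage sketch (illustrative; the model name is an example of an SBERT-style
# model with 768-dim embeddings, not mandated by this module):
#
#   store = createDocumentStore(paragraphs, similarity='dot_product')
#   retriever = loadRetriever(
#       embedding_model='sentence-transformers/msmarco-distilbert-cos-v5',
#       embedding_model_format='sentence_transformers',
#       document_store=store)
#   store.update_embeddings(retriever)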


@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
          allow_output_mutation=True)
def semanticSearchPipeline(documents:List[Document], embedding_model:Text = None,
                embedding_model_format:Text = None, embedding_layer:int = None,
                embedding_dim:int = 768, retriever_top_k:int = 10,
                reader_model:str = None, reader_top_k:int = 10,
                max_seq_len:int = 512, useQueryCheck = True,
                top_k_per_candidate:int = 1):
    """
    Creates the semantic search pipeline and Document Store object from the
    list of Haystack Documents. The top_k for the Reader and the Retriever
    are kept the same, so that all the results returned by the Retriever are
    used, and the Reader extracts the context for each retrieved result. The
    QueryCheck is added as a node to process the query. This pipeline is
    suited for keyword search, and to some extent for extractive QA. The
    purpose of the Reader is strictly to highlight the context for each
    retrieved result and not QA; however, as stated, it can work for QA too
    in a limited sense.
    There are 4 variants of pipeline it can return:
    1. QueryCheck > Retriever > Reader
    2. Retriever > Reader
    3. QueryCheck > Retriever > Docs2Answers: if the reader is None, then
       Docs2Answers is used to keep the output of the pipeline structurally
       the same.
    4. Retriever > Docs2Answers

    Links

    1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
    2. https://www.sbert.net/examples/applications/semantic-search/README.html
    3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
    4. https://docs.haystack.deepset.ai/docs/reader


    Params
    ----------
    documents: list of Haystack Documents, returned by the preprocessing
        pipeline.
    embedding_model: name of the model to be used for embedding. Check the
        links provided in the documentation.
    embedding_model_format: check the Haystack GitHub link provided in the
        documentation.
    embedding_layer: check the Haystack GitHub link provided in the
        documentation.
    embedding_dim: the Document Store has a default embedding size of 768,
        and the update_embeddings method of the DocStore cannot infer the
        embedding size of the retriever automatically, therefore set this
        value as per the model card.
    retriever_top_k: number of top results to be returned by the retriever.
    reader_model: name of the model to be used for the Reader node in the
        Haystack Pipeline. Check the links provided in the documentation.
    reader_top_k: the Reader will use the retrieved results to further find
        better matches. As the purpose here is to use the Reader to extract
        context, the value is the same as retriever_top_k.
    max_seq_len: every model has a maximum sequence length it can handle;
        check the model card. Needed to handle the edge cases.
    useQueryCheck: whether or not to use the QueryCheck, which modifies the
        query.
    top_k_per_candidate: how many answers to extract for each candidate doc
        coming from the retriever.

    Return
    ---------
    semanticsearch_pipeline: Haystack Pipeline object, with all the necessary
        nodes [QueryCheck, Retriever, Reader/Docs2Answers]. If the reader is
        None, then Docs2Answers is used to keep the output of the pipeline
        structurally the same.

    document_store: as the retriever can work only with a Haystack Document
        Store, the list of documents returned by the preprocessing pipeline
        is fed in to get an InMemoryDocumentStore object, with the retriever
        updating the embeddings of each paragraph in the document store.

    """
    document_store = createDocumentStore(documents=documents,
                                         embedding_dim=embedding_dim)
    retriever = loadRetriever(embedding_model=embedding_model,
                              embedding_model_format=embedding_model_format,
                              embedding_layer=embedding_layer,
                              retriever_top_k=retriever_top_k,
                              document_store=document_store,
                              max_seq_len=max_seq_len)
    document_store.update_embeddings(retriever)
    semantic_search_pipeline = Pipeline()
    if useQueryCheck and reader_model:
        querycheck = QueryCheck()
        reader = FARMReader(model_name_or_path=reader_model,
                            top_k=reader_top_k, use_gpu=True,
                            top_k_per_candidate=top_k_per_candidate)
        semantic_search_pipeline.add_node(component=querycheck,
                            name="QueryCheck", inputs=["Query"])
        semantic_search_pipeline.add_node(component=retriever,
                            name="EmbeddingRetriever", inputs=["QueryCheck.output_1"])
        semantic_search_pipeline.add_node(component=reader, name="FARMReader",
                            inputs=["EmbeddingRetriever"])

    elif reader_model:
        reader = FARMReader(model_name_or_path=reader_model,
                            top_k=reader_top_k, use_gpu=True,
                            top_k_per_candidate=top_k_per_candidate)
        semantic_search_pipeline.add_node(component=retriever,
                            name="EmbeddingRetriever", inputs=["Query"])
        semantic_search_pipeline.add_node(component=reader,
                            name="FARMReader", inputs=["EmbeddingRetriever"])
    elif useQueryCheck and not reader_model:
        querycheck = QueryCheck()
        docs2answers = Docs2Answers()
        semantic_search_pipeline.add_node(component=querycheck,
                            name="QueryCheck", inputs=["Query"])
        semantic_search_pipeline.add_node(component=retriever,
                            name="EmbeddingRetriever", inputs=["QueryCheck.output_1"])
        semantic_search_pipeline.add_node(component=docs2answers,
                            name="Docs2Answers", inputs=["EmbeddingRetriever"])
    elif not useQueryCheck and not reader_model:
        docs2answers = Docs2Answers()
        semantic_search_pipeline.add_node(component=retriever,
                            name="EmbeddingRetriever", inputs=["Query"])
        semantic_search_pipeline.add_node(component=docs2answers,
                            name="Docs2Answers", inputs=["EmbeddingRetriever"])

    logging.info(semantic_search_pipeline.components)
    return semantic_search_pipeline, document_store
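
# Usage sketch (illustrative; both model names are examples, not mandated by
# this module):
#
#   pipeline, store = semanticSearchPipeline(
#       documents=paragraphs,
#       embedding_model='sentence-transformers/msmarco-distilbert-cos-v5',
#       embedding_model_format='sentence_transformers',
#       reader_model='deepset/tinyroberta-squad2')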

def runSemanticPipeline(pipeline:Pipeline, queries:Union[list,str])->dict:
    """
    Uses the Haystack run or run_batch method depending on whether a single
    query is passed as a string or multiple queries as List[str].

    Params
    -------
    pipeline: Haystack pipeline, as returned by semanticSearchPipeline from
        utils.semantic_search.

    queries: either a single query or a list of queries.

    Return
    -------
    results: dict containing answers and documents as keys and their
        respective values

    """

    if isinstance(queries, list):
        results = pipeline.run_batch(queries=queries)
    elif isinstance(queries, str):
        results = pipeline.run(query=queries)
    else:
        logging.info("Please check the input type for the queries")
        return

    return results
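
# Usage sketch (illustrative):
#
#   raw = runSemanticPipeline(pipeline, "What are the water related issues?")
#   raw_batch = runSemanticPipeline(pipeline, ["water", "food security"])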

def process_query_output(results:dict)->pd.DataFrame:
    """
    Returns a dataframe with the necessary information, including
    ['query','answer','answer_offset','context_offset','context','content',
    'reader_score','retriever_score','id']. This is designed for the output
    of the semantic search pipeline with a single query and the Reader as the
    final node. The output of a pipeline having Docs2Answers as the final
    node, or multiple queries, needs to be handled separately. In those
    cases, use process_semantic_output from utils.semantic_search, which uses
    this function internally to make one combined dataframe.

    Params
    ---------
    results: this dictionary should have keys = [query, answers, documents],
        although answers is optional. In the case of Docs2Answers as the
        final node, process_semantic_output doesn't pass answers, thereby
        setting all values derived from answers to 'None'.

    Return
    --------
    df: dataframe with all the columns mentioned in the function description.

    """
    query_text = results['query']
    answer_dict = {}
    if 'answers' in results.keys():
        for answer in results['answers']:
            answer_dict[answer.document_id] = answer.to_dict()
    docs = results['documents']
    df = pd.DataFrame(columns=['query','answer','answer_offset','context_offset',
                               'context','content','reader_score','retriever_score',
                               'id'])
    for doc in docs:
        row_list = {}
        row_list['query'] = query_text
        row_list['retriever_score'] = doc.score
        row_list['id'] = doc.id
        row_list['content'] = doc.content
        if doc.id in answer_dict.keys():
            row_list['answer'] = answer_dict[doc.id]['answer']
            row_list['context'] = answer_dict[doc.id]['context']
            row_list['reader_score'] = answer_dict[doc.id]['score']
            answer_offset = answer_dict[doc.id]['offsets_in_document'][0]
            row_list['answer_offset'] = [answer_offset['start'], answer_offset['end']]
            start_idx = doc.content.find(row_list['context'])
            end_idx = start_idx + len(row_list['context'])
            row_list['context_offset'] = [start_idx, end_idx]
        else:
            row_list['answer'] = None
            row_list['context'] = None
            row_list['reader_score'] = None
            row_list['answer_offset'] = None
            row_list['context_offset'] = None
        df_dictionary = pd.DataFrame([row_list])
        df = pd.concat([df, df_dictionary], ignore_index=True)

    return df
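
# Usage sketch (illustrative): tabulate a single-query run whose final node
# is the Reader.
#
#   df = process_query_output(raw)
#   df[['answer', 'retriever_score']].head()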

def process_semantic_output(results):
    """
    Returns a dataframe with the necessary information, including
    ['query','answer','answer_offset','context_offset','context','content',
    'reader_score','retriever_score','id']. Distinguishes between single and
    multiple queries by reading the keys of the pipeline output dictionary.
    Uses process_query_output to get the dataframe for each query and creates
    one concatenated dataframe. In the case of Docs2Answers as the final
    node, it omits the answers part. See the documentation of
    process_query_output.

    Params
    ---------
    results: raw output of runSemanticPipeline.

    Return
    --------
    df: dataframe with all the columns mentioned in the function description.

    """
    output = {}
    if 'query' in results.keys():
        output['query'] = results['query']
        output['documents'] = results['documents']
        if results['node_id'] == 'Docs2Answers':
            pass
        else:
            output['answers'] = results['answers']
        df = process_query_output(output)
        return df
    if 'queries' in results.keys():
        df = pd.DataFrame(columns=['query','answer','answer_offset',
                                   'context_offset','context','content',
                                   'reader_score','retriever_score','id'])
        for query, answers, documents in zip(results['queries'],
                                results['answers'], results['documents']):
            output = {}
            output['query'] = query
            output['documents'] = documents
            if results['node_id'] == 'Docs2Answers':
                pass
            else:
                output['answers'] = answers

            temp = process_query_output(output)
            df = pd.concat([df, temp], ignore_index=True)

    return df
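
# Usage sketch (illustrative): works for both single- and multi-query output.
#
#   results_df = process_semantic_output(raw_batch)
#   results_df.groupby('query')['retriever_score'].max()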

def semanticsearchAnnotator(matches:List[List[int]], document:Text):
    """
    Annotates the text in the document defined by a list of
    [start index, end index] pairs.
    Example: for the document "How are you today", matches = [[0,3]] on the
    raw text gives answer = "How". Note that spaCy matcher indices are
    token-based rather than character-based: with the spaCy matcher,
    matches = [[0,3]] would give "How are you", and matching just "How"
    would be matches = [[0,1]] for the same string.

    """
    start = 0
    annotated_text = ""
    for match in matches:
        start_idx = match[0]
        end_idx = match[1]
        if check_streamlit():
            annotated_text = (annotated_text + document[start:start_idx]
                            + str(annotation(body=document[start_idx:end_idx],
                            label="Context", background="#964448", color='#ffffff')))
        else:
            annotated_text = (annotated_text + document[start:start_idx]
                            + colored(document[start_idx:end_idx],
                            "green", attrs=['bold']))
        start = end_idx

    annotated_text = annotated_text + document[start:]

    if check_streamlit():

        st.write(
            markdown(annotated_text),
            unsafe_allow_html=True,
        )
    else:
        print(annotated_text)
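
# Usage sketch (illustrative): highlight characters 0-3 ("How") of a short
# document.
#
#   semanticsearchAnnotator([[0, 3]], "How are you today")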


def semantic_keywordsearch(query:Text, documents:List[Document],
                embedding_model:Text,
                embedding_model_format:Text,
                embedding_layer:int, reader_model:str,
                retriever_top_k:int = 10, reader_top_k:int = 10,
                return_results:bool = False, embedding_dim:int = 768,
                max_seq_len:int = 512, top_k_per_candidate:int = 1,
                sort_by:Literal["retriever", "reader"] = 'retriever'):
    """
    Performs the semantic search on the list of Haystack Documents returned
    by the preprocessing pipeline.

    Params
    -------
    query: keywords to be searched in the documents.
    documents: list of Haystack Documents returned by the preprocessing pipeline.

    """
    semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents=documents,
                        embedding_model=embedding_model,
                        embedding_layer=embedding_layer,
                        embedding_model_format=embedding_model_format,
                        reader_model=reader_model, retriever_top_k=retriever_top_k,
                        reader_top_k=reader_top_k, embedding_dim=embedding_dim,
                        max_seq_len=max_seq_len,
                        top_k_per_candidate=top_k_per_candidate)

    raw_output = runSemanticPipeline(semanticsearch_pipeline, query)
    results_df = process_semantic_output(raw_output)
    if sort_by == 'retriever':
        results_df = results_df.sort_values(by=['retriever_score'],
                                            ascending=False).reset_index(drop=True)
    else:
        results_df = results_df.sort_values(by=['reader_score'],
                                            ascending=False).reset_index(drop=True)

    if return_results:
        return results_df
    else:
        if check_streamlit():
            st.markdown("##### Top few semantic search results #####")
        else:
            print("Top few semantic search results")
        for i in range(len(results_df)):
            if check_streamlit():
                st.write("Result {}".format(i+1))
            else:
                print("Result {}".format(i+1))
            semanticsearchAnnotator([results_df.loc[i]['context_offset']],
                                    results_df.loc[i]['content'])
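
# End-to-end usage sketch (illustrative; model names are examples, not
# mandated by this module):
#
#   semantic_keywordsearch(
#       query="water",
#       documents=paragraphs,
#       embedding_model='sentence-transformers/msmarco-distilbert-cos-v5',
#       embedding_model_format='sentence_transformers',
#       embedding_layer=None,
#       reader_model='deepset/tinyroberta-squad2')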