Spaces: GIZ

prashant committed on
Commit 9f55059
1 Parent(s): 949b596

refactoring semantic search; PEP8 edits in other modules

appStore/multiapp.py CHANGED
@@ -46,7 +46,7 @@ class MultiApp:
46
 
47
  st.sidebar.write(format_func=lambda app: app['title'])
48
  image = Image.open('docStore/img/giz_sdsn_small.jpg')
49
- st.sidebar.image(image, width =150)
50
 
51
  with st.sidebar:
52
  selected = option_menu(None, [page["title"] for page in self.apps],
 
46
 
47
  st.sidebar.write(format_func=lambda app: app['title'])
48
  image = Image.open('docStore/img/giz_sdsn_small.jpg')
49
+ st.sidebar.image(image, width =200)
50
 
51
  with st.sidebar:
52
  selected = option_menu(None, [page["title"] for page in self.apps],
appStore/sdg_analysis.py CHANGED
@@ -93,12 +93,11 @@ def app():
93
  file_path = st.session_state['filepath']
94
  classifier = load_sdgClassifier(classifier_name=model_name)
95
  st.session_state['sdg_classifier'] = classifier
96
- all_documents = runSDGPreprocessingPipeline(fileName= file_name,
97
- filePath= file_path, split_by= split_by,
98
  split_length= split_length,
99
- split_overlap= split_overlap,
100
  split_respect_sentence_boundary= split_respect_sentence_boundary,
101
- remove_punc= remove_punc)
102
 
103
  if len(all_documents['documents']) > 100:
104
  warning_msg = ": This might take sometime, please sit back and relax."
@@ -110,14 +109,14 @@ def app():
110
  df, x = sdg_classification(haystack_doc=all_documents['documents'],
111
  threshold= threshold)
112
  df = df.drop(['Relevancy'], axis = 1)
113
- sdg_labels = x.SDG.unique()[::-1]
114
  textrank_keyword_list = []
115
  for label in sdg_labels:
116
  sdgdata = " ".join(df[df.SDG == label].text.to_list())
117
  textranklist_ = textrank(textdata=sdgdata, words= top_n)
118
  if len(textranklist_) > 0:
119
  textrank_keyword_list.append({'SDG':label, 'TextRank Keywords':",".join(textranklist_)})
120
- tRkeywordsDf = pd.DataFrame(textrank_keyword_list)
121
 
122
 
123
  plt.rcParams['font.size'] = 25
@@ -145,7 +144,7 @@ def app():
145
  st.write("")
146
  st.markdown("###### What keywords are present under SDG classified text? ######")
147
 
148
- AgGrid(tRkeywordsDf, reload_data = False,
149
  update_mode="value_changed",
150
  columns_auto_size_mode = ColumnsAutoSizeMode.FIT_CONTENTS)
151
  st.write("")
 
93
  file_path = st.session_state['filepath']
94
  classifier = load_sdgClassifier(classifier_name=model_name)
95
  st.session_state['sdg_classifier'] = classifier
96
+ all_documents = runSDGPreprocessingPipeline(file_name= file_name,
97
+ file_path= file_path, split_by= split_by,
98
  split_length= split_length,
 
99
  split_respect_sentence_boundary= split_respect_sentence_boundary,
100
+ split_overlap= split_overlap, remove_punc= remove_punc)
101
 
102
  if len(all_documents['documents']) > 100:
103
  warning_msg = ": This might take sometime, please sit back and relax."
 
109
  df, x = sdg_classification(haystack_doc=all_documents['documents'],
110
  threshold= threshold)
111
  df = df.drop(['Relevancy'], axis = 1)
112
+ sdg_labels = x.SDG.unique()
113
  textrank_keyword_list = []
114
  for label in sdg_labels:
115
  sdgdata = " ".join(df[df.SDG == label].text.to_list())
116
  textranklist_ = textrank(textdata=sdgdata, words= top_n)
117
  if len(textranklist_) > 0:
118
  textrank_keyword_list.append({'SDG':label, 'TextRank Keywords':",".join(textranklist_)})
119
+ textrank_keywords_df = pd.DataFrame(textrank_keyword_list)
120
 
121
 
122
  plt.rcParams['font.size'] = 25
 
144
  st.write("")
145
  st.markdown("###### What keywords are present under SDG classified text? ######")
146
 
147
+ AgGrid(textrank_keywords_df, reload_data = False,
148
  update_mode="value_changed",
149
  columns_auto_size_mode = ColumnsAutoSizeMode.FIT_CONTENTS)
150
  st.write("")
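For illustration, the renamed entry points above called end-to-end with the new snake_case keyword arguments outside of streamlit (file names, model id and threshold are placeholders, not taken from this commit):

from utils.sdg_classifier import (load_sdgClassifier, runSDGPreprocessingPipeline,
                                  sdg_classification)

classifier = load_sdgClassifier(classifier_name="some-org/sdg-classifier")  # placeholder model id
all_documents = runSDGPreprocessingPipeline(file_name="report.pdf",
                                            file_path="docStore/report.pdf",
                                            split_by="sentence", split_length=2,
                                            split_respect_sentence_boundary=False,
                                            split_overlap=0, remove_punc=False)
# x is now sorted with ascending=False, i.e. the most frequent SDG comes first
df, x = sdg_classification(haystack_doc=all_documents["documents"],
                           threshold=0.8, classifier_model=classifier)
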
utils/checkconfig.py CHANGED
@@ -1,12 +1,15 @@
1
  import configparser
2
  import logging
3
 
4
- def getconfig(configFilePath):
 
 
 
5
 
6
  config = configparser.ConfigParser()
7
 
8
  try:
9
- config.read_file(open(configFilePath))
10
  return config
11
  except:
12
  logging.warning("config file not found")
 
1
  import configparser
2
  import logging
3
 
4
+ def getconfig(configfile_path:str):
5
+ """
6
+ configfile_path: file path of .cfg file
7
+ """
8
 
9
  config = configparser.ConfigParser()
10
 
11
  try:
12
+ config.read_file(open(configfile_path))
13
  return config
14
  except:
15
  logging.warning("config file not found")
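For illustration, a minimal usage sketch of the renamed getconfig parameter (the .cfg path and the section/option names are placeholders, not taken from this commit):

from utils.checkconfig import getconfig

config = getconfig(configfile_path="paramconfig.cfg")   # placeholder config path
if config is not None and config.has_section("preprocessor"):
    # standard configparser access; section and option names are hypothetical
    split_length = int(config.get("preprocessor", "SPLIT_LENGTH"))
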
utils/keyword_extraction.py CHANGED
@@ -58,7 +58,7 @@ def extract_topn_from_vector(feature_names, sorted_items, top_n=10):
58
  return results
59
 
60
 
61
- def tfidf_keyword(textdata, vectorizer, tfidfmodel, top_n):
62
  """
63
  TFIDF based keywords extraction
64
 
@@ -108,7 +108,7 @@ def keyword_extraction(sdg:int,sdgdata:List[Text], top_n:int=10):
108
  return keywords
109
 
110
  @st.cache(allow_output_mutation=True)
111
- def textrank(textdata:Text, ratio:float = 0.1, words = 0):
112
  """
113
  wrappper function to perform textrank, uses either ratio or wordcount to
114
  extract top keywords limited by words or ratio.
 
58
  return results
59
 
60
 
61
+ def tfidf_keyword(textdata:str, vectorizer, tfidfmodel, top_n):
62
  """
63
  TFIDF based keywords extraction
64
 
 
108
  return keywords
109
 
110
  @st.cache(allow_output_mutation=True)
111
+ def textrank(textdata:Text, ratio:float = 0.1, words:int = 0)->List[str]:
112
  """
113
  wrappper function to perform textrank, uses either ratio or wordcount to
114
  extract top keywords limited by words or ratio.
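For illustration, a short call matching the new textrank annotations (the sample text is made up):

from utils.keyword_extraction import textrank

sample = ("Access to clean water and sanitation remains a challenge in many "
          "regions, and water quality monitoring is chronically underfunded.")
# words limits the number of returned keywords; when words is 0 the ratio is used instead
keywords = textrank(textdata=sample, words=5)
print(keywords)   # a List[str] of top-ranked keywords
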
utils/lexical_search.py CHANGED
@@ -7,7 +7,7 @@ import streamlit as st
7
  from markdown import markdown
8
  from annotated_text import annotation
9
  from haystack.schema import Document
10
- from typing import List, Text
11
  from typing_extensions import Literal
12
  from utils.preprocessing import processingpipeline
13
  from utils.streamlitcheck import check_streamlit
@@ -23,10 +23,10 @@ except ImportError:
23
  logging.info("Streamlit not installed")
24
 
25
 
26
- def runLexicalPreprocessingPipeline(file_path,file_name,
27
  split_by: Literal["sentence", "word"] = 'word',
28
- split_length:int = 80, remove_punc:bool = False,
29
- split_overlap:int = 0 )->List[Document]:
30
  """
31
  creates the pipeline and runs the preprocessing pipeline,
32
  the params for pipeline are fetched from paramconfig. As lexical doesnt gets
@@ -40,11 +40,14 @@ def runLexicalPreprocessingPipeline(file_path,file_name,
40
  st.session_state['filename']
41
  file_path: filepath, in case of streamlit application use
42
  st.session_state['filepath']
43
- removePunc: to remove all Punctuation including ',' and '.' or not
44
  split_by: document splitting strategy either as word or sentence
45
  split_length: when synthetically creating the paragrpahs from document,
46
  it defines the length of paragraph.
 
 
 
47
  splititng of text.
 
48
 
49
  Return
50
  --------------
@@ -91,7 +94,8 @@ def tokenize_lexical_query(query:str)-> List[str]:
91
  if not (token.is_stop or token.is_punct)]
92
  return token_list
93
 
94
- def runSpacyMatcher(token_list:List[str], document:Text):
 
95
  """
96
  Using the spacy in backend finds the keywords in the document using the
97
  Matcher class from spacy. We can alternatively use the regex, but spacy
@@ -203,7 +207,7 @@ def spacyAnnotator(matches: List[List[int]], document:spacy.tokens.doc.Doc):
203
  else:
204
  print(annotated_text)
205
 
206
- def lexical_search(query:Text,top_k:int, documents:List[Document]):
207
  """
208
  Performs the Lexical search on the List of haystack documents which is
209
  returned by preprocessing Pipeline.
 
7
  from markdown import markdown
8
  from annotated_text import annotation
9
  from haystack.schema import Document
10
+ from typing import List, Text, Tuple
11
  from typing_extensions import Literal
12
  from utils.preprocessing import processingpipeline
13
  from utils.streamlitcheck import check_streamlit
 
23
  logging.info("Streamlit not installed")
24
 
25
 
26
+ def runLexicalPreprocessingPipeline(file_name:str,file_path:str,
27
  split_by: Literal["sentence", "word"] = 'word',
28
+ split_length:int = 80, split_overlap:int = 0,
29
+ remove_punc:bool = False,)->List[Document]:
30
  """
31
  creates the pipeline and runs the preprocessing pipeline,
32
  the params for pipeline are fetched from paramconfig. As lexical doesnt gets
 
40
  st.session_state['filename']
41
  file_path: filepath, in case of streamlit application use
42
  st.session_state['filepath']
 
43
  split_by: document splitting strategy either as word or sentence
44
  split_length: when synthetically creating the paragrpahs from document,
45
  it defines the length of paragraph.
46
+ split_overlap: Number of words or sentences that overlap when creating
47
+ the paragraphs. This is done as one sentence or 'some words' make sense
48
+ when read in together with others. Therefore the overlap is used.
49
  splititng of text.
50
+ removePunc: to remove all Punctuation including ',' and '.' or not
51
 
52
  Return
53
  --------------
 
94
  if not (token.is_stop or token.is_punct)]
95
  return token_list
96
 
97
+ def runSpacyMatcher(token_list:List[str], document:Text
98
+ )->Tuple(List[List[int]],spacy.tokens.doc.Doc):
99
  """
100
  Using the spacy in backend finds the keywords in the document using the
101
  Matcher class from spacy. We can alternatively use the regex, but spacy
 
207
  else:
208
  print(annotated_text)
209
 
210
+ def lexical_search(query:Text, documents:List[Document],top_k:int):
211
  """
212
  Performs the Lexical search on the List of haystack documents which is
213
  returned by preprocessing Pipeline.
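For illustration, a usage sketch matching the reordered signatures (file names and top_k are placeholders; fetching the Haystack Documents under the 'documents' key is assumed to work as in the SDG pipeline):

from utils.lexical_search import runLexicalPreprocessingPipeline, lexical_search

output = runLexicalPreprocessingPipeline(file_name="report.pdf",
                                         file_path="docStore/report.pdf",
                                         split_by="word", split_length=80,
                                         split_overlap=0, remove_punc=False)
# lexical_search now takes documents before top_k
lexical_search(query="water scarcity",
               documents=output["documents"], top_k=10)
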
utils/preprocessing.py CHANGED
@@ -120,7 +120,7 @@ class FileConverter(BaseComponent):
120
  return
121
 
122
 
123
- def basic(s, remove_punc:bool = False):
124
 
125
  """
126
  Performs basic cleaning of text.
@@ -164,10 +164,10 @@ class UdfPreProcessor(BaseComponent):
164
  """
165
  outgoing_edges = 1
166
 
167
- def run(self, documents:List[Document], remove_punc:bool,
168
  split_by: Literal["sentence", "word"] = 'sentence',
169
- split_respect_sentence_boundary = False,
170
- split_length:int = 2, split_overlap:int = 0):
171
 
172
  """ this is required method to invoke the component in
173
  the pipeline implementation.
@@ -175,7 +175,7 @@ class UdfPreProcessor(BaseComponent):
175
  Params
176
  ----------
177
  documents: documents from the output dictionary returned by Fileconverter
178
- removePunc: to remove all Punctuation including ',' and '.' or not
179
  split_by: document splitting strategy either as word or sentence
180
  split_length: when synthetically creating the paragrpahs from document,
181
  it defines the length of paragraph.
 
120
  return
121
 
122
 
123
+ def basic(s:str, remove_punc:bool = False):
124
 
125
  """
126
  Performs basic cleaning of text.
 
164
  """
165
  outgoing_edges = 1
166
 
167
+ def run(self, documents:List[Document], remove_punc:bool=False,
168
  split_by: Literal["sentence", "word"] = 'sentence',
169
+ split_length:int = 2, split_respect_sentence_boundary:bool = False,
170
+ split_overlap:int = 0):
171
 
172
  """ this is required method to invoke the component in
173
  the pipeline implementation.
 
175
  Params
176
  ----------
177
  documents: documents from the output dictionary returned by Fileconverter
178
+ remove_punc: to remove all Punctuation including ',' and '.' or not
179
  split_by: document splitting strategy either as word or sentence
180
  split_length: when synthetically creating the paragrpahs from document,
181
  it defines the length of paragraph.
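For illustration, a direct call to the preprocessor node with the reordered keyword arguments and their new defaults (the sample document is made up):

from haystack.schema import Document
from utils.preprocessing import UdfPreProcessor

docs = [Document(content="Clean water access is a priority. Sanitation still lags behind.")]
preprocessor = UdfPreProcessor()
# a Haystack custom node's run() conventionally returns (output_dict, edge_name)
output = preprocessor.run(documents=docs, remove_punc=False,
                          split_by="sentence", split_length=2,
                          split_respect_sentence_boundary=False,
                          split_overlap=0)
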
utils/sdg_classifier.py CHANGED
@@ -34,7 +34,7 @@ _lab_dict = {0: 'no_cat',
34
  17:'SDG 17 - Partnership for the goals',}
35
 
36
  @st.cache(allow_output_mutation=True)
37
- def load_sdgClassifier(config_file = None, classifier_name = None):
38
  """
39
  loads the document classifier using haystack, where the name/path of model
40
  in HF-hub as string is used to fetch the model object.Either configfile or
@@ -44,8 +44,8 @@ def load_sdgClassifier(config_file = None, classifier_name = None):
44
 
45
  Params
46
  --------
47
- configFile: config file from which to read the model name
48
- docClassifierModel: if modelname is passed, it takes a priority if not \
49
  found then will look for configfile, else raise error.
50
 
51
 
@@ -69,7 +69,9 @@ def load_sdgClassifier(config_file = None, classifier_name = None):
69
 
70
  @st.cache(allow_output_mutation=True)
71
  def sdg_classification(haystack_doc:List[Document],
72
- threshold:float, classifier_model= None)->Tuple[DataFrame,Series]:
 
 
73
  """
74
  Text-Classification on the list of texts provided. Classifier provides the
75
  most appropriate label for each text. these labels are in terms of if text
@@ -77,12 +79,13 @@ def sdg_classification(haystack_doc:List[Document],
77
 
78
  Params
79
  ---------
80
- haystackdoc: List of haystack Documents. The output of Preprocessing Pipeline
81
  contains the list of paragraphs in different format,here the list of
82
  Haystack Documents is used.
83
  threshold: threshold value for the model to keep the results from classifier
84
- classifiermodel: you can pass the classifier model directly, however in case of
85
- streamlit avoid it.
 
86
 
87
 
88
  Returns
@@ -117,7 +120,7 @@ def sdg_classification(haystack_doc:List[Document],
117
  x = x.rename('count')
118
  x = x.rename_axis('SDG').reset_index()
119
  x["SDG"] = pd.to_numeric(x["SDG"])
120
- x = x.sort_values(by=['count'])
121
  x['SDG_name'] = x['SDG'].apply(lambda x: _lab_dict[x])
122
  x['SDG_Num'] = x['SDG'].apply(lambda x: "SDG "+str(x))
123
 
@@ -126,11 +129,10 @@ def sdg_classification(haystack_doc:List[Document],
126
 
127
  return df, x
128
 
129
- def runSDGPreprocessingPipeline(filePath, fileName,
130
  split_by: Literal["sentence", "word"] = 'sentence',
131
- split_respect_sentence_boundary = False,
132
- split_length:int = 2, split_overlap = 0,
133
- remove_punc = False)->List[Document]:
134
  """
135
  creates the pipeline and runs the preprocessing pipeline,
136
  the params for pipeline are fetched from paramconfig
@@ -140,13 +142,16 @@ def runSDGPreprocessingPipeline(filePath, fileName,
140
 
141
  file_name: filename, in case of streamlit application use
142
  st.session_state['filename']
143
- file_path: filepath, in case of streamlit application use
144
- removePunc: to remove all Punctuation including ',' and '.' or not
145
  split_by: document splitting strategy either as word or sentence
146
  split_length: when synthetically creating the paragrpahs from document,
147
  it defines the length of paragraph.
148
  split_respect_sentence_boundary: Used when using 'word' strategy for
149
  splititng of text.
 
 
 
 
150
 
151
 
152
  Return
@@ -160,9 +165,9 @@ def runSDGPreprocessingPipeline(filePath, fileName,
160
 
161
  sdg_processing_pipeline = processingpipeline()
162
 
163
- output_sdg_pre = sdg_processing_pipeline.run(file_paths = filePath,
164
- params= {"FileConverter": {"file_path": filePath, \
165
- "file_name": fileName},
166
  "UdfPreProcessor": {"remove_punc": remove_punc, \
167
  "split_by": split_by, \
168
  "split_length":split_length,\
 
34
  17:'SDG 17 - Partnership for the goals',}
35
 
36
  @st.cache(allow_output_mutation=True)
37
+ def load_sdgClassifier(config_file:str = None, classifier_name:str = None):
38
  """
39
  loads the document classifier using haystack, where the name/path of model
40
  in HF-hub as string is used to fetch the model object.Either configfile or
 
44
 
45
  Params
46
  --------
47
+ config_file: config file path from which to read the model name
48
+ classifier_name: if modelname is passed, it takes a priority if not \
49
  found then will look for configfile, else raise error.
50
 
51
 
 
69
 
70
  @st.cache(allow_output_mutation=True)
71
  def sdg_classification(haystack_doc:List[Document],
72
+ threshold:float = 0.8,
73
+ classifier_model:TransformersDocumentClassifier= None
74
+ )->Tuple[DataFrame,Series]:
75
  """
76
  Text-Classification on the list of texts provided. Classifier provides the
77
  most appropriate label for each text. these labels are in terms of if text
 
79
 
80
  Params
81
  ---------
82
+ haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
83
  contains the list of paragraphs in different format,here the list of
84
  Haystack Documents is used.
85
  threshold: threshold value for the model to keep the results from classifier
86
+ classifiermodel: you can pass the classifier model directly,which takes priority
87
+ however if not then looks for model in streamlit session.
88
+ In case of streamlit avoid passing the model directly.
89
 
90
 
91
  Returns
 
120
  x = x.rename('count')
121
  x = x.rename_axis('SDG').reset_index()
122
  x["SDG"] = pd.to_numeric(x["SDG"])
123
+ x = x.sort_values(by=['count'], ascending=False)
124
  x['SDG_name'] = x['SDG'].apply(lambda x: _lab_dict[x])
125
  x['SDG_Num'] = x['SDG'].apply(lambda x: "SDG "+str(x))
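Illustrative sketch, not part of the diff: with ascending=False the most frequent SDG is listed first, which is why sdg_analysis.py can drop the [::-1] reversal on x.SDG.unique(). A made-up frame shows both orderings:

import pandas as pd

# made-up classification output: one row per paragraph with its predicted SDG
x = pd.DataFrame({"SDG": [6, 13, 6, 7, 6, 13]})
counts = x.groupby("SDG")["SDG"].count().rename("count").rename_axis("SDG").reset_index()

old = counts.sort_values(by=["count"])                    # previous ascending sort
new = counts.sort_values(by=["count"], ascending=False)   # sort used after this commit

print(old.SDG.unique()[::-1])   # old code reversed the order: [ 6 13  7]
print(new.SDG.unique())         # new code reads it directly:  [ 6 13  7]
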
126
 
 
129
 
130
  return df, x
131
 
132
+ def runSDGPreprocessingPipeline(file_name:str, file_path:str,
133
  split_by: Literal["sentence", "word"] = 'sentence',
134
+ split_length:int = 2, split_respect_sentence_boundary:bool = False,
135
+ split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
 
136
  """
137
  creates the pipeline and runs the preprocessing pipeline,
138
  the params for pipeline are fetched from paramconfig
 
142
 
143
  file_name: filename, in case of streamlit application use
144
  st.session_state['filename']
145
+ file_path: filepath, in case of streamlit application use st.session_state['filepath']
 
146
  split_by: document splitting strategy either as word or sentence
147
  split_length: when synthetically creating the paragrpahs from document,
148
  it defines the length of paragraph.
149
  split_respect_sentence_boundary: Used when using 'word' strategy for
150
  splititng of text.
151
+ split_overlap: Number of words or sentences that overlap when creating
152
+ the paragraphs. This is done as one sentence or 'some words' make sense
153
+ when read in together with others. Therefore the overlap is used.
154
+ remove_punc: to remove all Punctuation including ',' and '.' or not
155
 
156
 
157
  Return
 
165
 
166
  sdg_processing_pipeline = processingpipeline()
167
 
168
+ output_sdg_pre = sdg_processing_pipeline.run(file_paths = file_path,
169
+ params= {"FileConverter": {"file_path": file_path, \
170
+ "file_name": file_name},
171
  "UdfPreProcessor": {"remove_punc": remove_punc, \
172
  "split_by": split_by, \
173
  "split_length":split_length,\
utils/semantic_search.py CHANGED
@@ -1,15 +1,16 @@
1
- from haystack.nodes import TransformersQueryClassifier
2
  from haystack.nodes import EmbeddingRetriever, FARMReader
3
  from haystack.nodes.base import BaseComponent
4
  from haystack.document_stores import InMemoryDocumentStore
5
  from markdown import markdown
6
  from annotated_text import annotation
7
  from haystack.schema import Document
8
- from typing import List, Text
9
  from typing_extensions import Literal
10
  from utils.preprocessing import processingpipeline
11
  from utils.streamlitcheck import check_streamlit
12
  from haystack.pipelines import Pipeline
 
13
  import logging
14
  try:
15
  from termcolor import colored
@@ -37,9 +38,13 @@ class QueryCheck(BaseComponent):
37
  Uses Query Classifier from Haystack, process the query based on query type.
38
  Ability to determine the statements is not so good, therefore the chances
39
  statement also get modified. Ex: "List water related issues" will be
40
- identified by the model as keywords, and therefore it be processed as "what are
41
- the 'list all water related issues' related issues and discussions?". This is one shortcoming
42
- but is igonred for now, as semantic search will not get affected a lot, by this.
 
 
 
 
43
 
44
  1. https://docs.haystack.deepset.ai/docs/query_classifier
45
 
@@ -47,11 +52,22 @@ class QueryCheck(BaseComponent):
47
 
48
  outgoing_edges = 1
49
 
50
- def run(self, query):
51
  """
52
- mandatory method to use the cusotm node. Determines the query type, if
53
  if the query is of type keyword/statement will modify it to make it more
54
  useful for sentence transoformers.
55
 
56
  """
57
  query_classifier = loadQueryClassifier()
@@ -61,20 +77,51 @@ class QueryCheck(BaseComponent):
61
  output = {"query":query,
62
  "query_type": 'question/statement'}
63
  else:
64
- output = {"query": "what are the {} related issues and discussions?".format(query),
 
65
  "query_type": 'statements/keyword'}
66
  logging.info(output)
67
  return output, "output_1"
68
 
69
- def run_batch(self, query):
70
- pass
71
 
72
  @st.cache(allow_output_mutation=True)
73
- def runSemanticPreprocessingPipeline(file_path, file_name,
74
  split_by: Literal["sentence", "word"] = 'sentence',
75
- split_respect_sentence_boundary = False,
76
- split_length:int = 2, split_overlap = 0,
77
- remove_punc = False)->List[Document]:
78
  """
79
  creates the pipeline and runs the preprocessing pipeline.
80
 
@@ -82,22 +129,25 @@ def runSemanticPreprocessingPipeline(file_path, file_name,
82
  ------------
83
 
84
  file_name: filename, in case of streamlit application use
85
- st.session_state['filename']
86
  file_path: filepath, in case of streamlit application use
87
- st.session_state['filepath']
88
- removePunc: to remove all Punctuation including ',' and '.' or not
89
  split_by: document splitting strategy either as word or sentence
90
  split_length: when synthetically creating the paragrpahs from document,
91
- it defines the length of paragraph.
 
 
 
92
  split_respect_sentence_boundary: Used when using 'word' strategy for
93
- splititng of text.
 
94
 
95
  Return
96
  --------------
97
  List[Document]: When preprocessing pipeline is run, the output dictionary
98
- has four objects. For the Haysatck implementation of semantic search we,
99
- need to use the List of Haystack Document, which can be fetched by
100
- key = 'documents' on output.
101
 
102
  """
103
 
@@ -106,7 +156,7 @@ def runSemanticPreprocessingPipeline(file_path, file_name,
106
  output_semantic_pre = semantic_processing_pipeline.run(file_paths = file_path,
107
  params= {"FileConverter": {"file_path": file_path, \
108
  "file_name": file_name},
109
- "UdfPreProcessor": {"remove_punc": remove_punc, \
110
  "split_by": split_by, \
111
  "split_length":split_length,\
112
  "split_overlap": split_overlap,
@@ -115,10 +165,11 @@ def runSemanticPreprocessingPipeline(file_path, file_name,
115
  return output_semantic_pre
116
 
117
 
118
- @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
119
- def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = None,
 
120
  embedding_layer:int = None, retriever_top_k:int = 10,
121
- max_seq_len:int = 512, document_store:InMemoryDocumentStore = None):
122
  """
123
  Returns the Retriever model based on params provided.
124
  1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
@@ -129,14 +180,16 @@ def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = No
129
  Params
130
  ---------
131
  embedding_model: Name of the model to be used for embedding. Check the links
132
- provided in documentation
133
- embedding_model_format: check the github link of Haystack provided in documentation
134
- embedding_layer: check the github link of Haystack provided in documentation
135
- retriever_top_k: Number of Top results to be returned by retriever
136
- max_seq_len: everymodel has max seq len it can handle, check in model card.
137
- Needed to hanlde the edge cases.
138
- document_store: InMemoryDocumentStore, write haystack Document list to DocumentStore
139
- and pass the same to function call. Can be done using createDocumentStore from utils.
 
 
140
 
141
  Return
142
  -------
@@ -157,7 +210,8 @@ def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = No
157
  st.session_state['retriever'] = retriever
158
  return retriever
159
 
160
- @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
 
161
  def createDocumentStore(documents:List[Document], similarity:str = 'dot_product',
162
  embedding_dim:int = 768):
163
  """
@@ -167,11 +221,11 @@ def createDocumentStore(documents:List[Document], similarity:str = 'dot_product'
167
  Params
168
  -------
169
  documents: List of haystack document. If using the preprocessing pipeline,
170
- can be fetched key = 'documents; on output of preprocessing pipeline.
171
  similarity: scoring function, can be either 'cosine' or 'dot_product'
172
  embedding_dim: Document store has default value of embedding size = 768, and
173
- update_embeddings method of Docstore cannot infer the embedding size of
174
- retiever automaticallu, therefore set this value as per the model card.
175
 
176
  Return
177
  -------
@@ -185,13 +239,13 @@ def createDocumentStore(documents:List[Document], similarity:str = 'dot_product'
185
  return document_store
186
 
187
 
188
- @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
 
189
  def semanticSearchPipeline(documents:List[Document], embedding_model:Text = None,
190
- useQueryCheck = True, embedding_model_format:Text = None,
191
- max_seq_len:int =512,embedding_dim:int = 768,
192
- embedding_layer:int = None, retriever_top_k:int = 10,
193
- reader_model:str = None, reader_top_k:int = 10
194
- ):
195
  """
196
  creates the semantic search pipeline and document Store object from the
197
  list of haystack documents. The top_k for the Reader and Retirever are kept
@@ -201,6 +255,14 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
201
  and to some extent extractive QA purpose. The purpose of Reader is strictly to
202
  highlight the context for retrieved result and not for QA, however as stated
203
  it can work for QA too in limited sense.
204
 
205
  1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
206
  2. https://www.sbert.net/examples/applications/semantic-search/README.html
@@ -208,37 +270,39 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
208
  4. https://docs.haystack.deepset.ai/docs/reader
209
 
210
 
211
-
212
  Params
213
  ----------
214
  documents: list of Haystack Documents, returned by preprocessig pipeline.
215
  embedding_model: Name of the model to be used for embedding. Check the links
216
- provided in documentation
217
- embedding_model_format: check the github link of Haystack provided in documentation
 
218
  embedding_layer: check the github link of Haystack provided in documentation
 
 
 
219
  retriever_top_k: Number of Top results to be returned by retriever
220
  reader_model: Name of the model to be used for Reader node in hasyatck
221
- Pipeline. Check the links provided in documentation
222
  reader_top_k: Reader will use retrieved results to further find better matches.
223
- As purpose here is to use reader to extract context, the value is
224
- same as retriever_top_k.
225
- useQueryCheck: Whether to use the querycheck which modifies the query or not.
226
- embedding_dim: Document store has default value of embedding size = 768, and
227
- update_embeddings method of Docstore cannot infer the embedding size of
228
- retiever automaticallu, therefore set this value as per the model card.
229
  max_seq_len:everymodel has max seq len it can handle, check in model card.
230
- Needed to hanlde the edge cases
 
231
 
232
 
233
  Return
234
  ---------
235
  semanticsearch_pipeline: Haystack Pipeline object, with all the necessary
236
- nodes [QueryCheck, Retriever, Reader]
 
 
237
 
238
  document_store: As retriever can work only with Haystack Document Store, the
239
- list of document returned by preprocessing pipeline are fed into to get
240
- InMemmoryDocumentStore object type, with retriever updating the embedding
241
- embeddings of each paragraph in document store.
242
 
243
  """
244
  document_store = createDocumentStore(documents=documents,
@@ -248,34 +312,187 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
248
  embedding_layer=embedding_layer,
249
  retriever_top_k= retriever_top_k,
250
  document_store = document_store,
251
- max_seq_len=max_seq_len)
252
-
253
  document_store.update_embeddings(retriever)
254
- reader = FARMReader(model_name_or_path=reader_model,
255
- top_k = reader_top_k, use_gpu=True)
256
  semantic_search_pipeline = Pipeline()
257
  if useQueryCheck and reader_model:
258
  querycheck = QueryCheck()
259
- semantic_search_pipeline.add_node(component = querycheck, name = "QueryCheck",
260
- inputs = ["Query"])
261
- semantic_search_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
262
- inputs = ["QueryCheck.output_1"])
 
 
263
  semantic_search_pipeline.add_node(component = reader, name = "FARMReader",
264
  inputs= ["EmbeddingRetriever"])
 
265
  elif reader_model :
266
- semantic_search_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
267
- inputs = ["Query"])
268
- semantic_search_pipeline.add_node(component = reader, name = "FARMReader",
269
- inputs= ["EmbeddingRetriever"])
270
  else:
271
- semantic_search_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
272
- inputs = ["Query"])
273
 
 
274
 
275
- return semantic_search_pipeline, document_store
278
- def semanticsearchAnnotator(matches: List[List[int]], document):
279
  """
280
  Annotates the text in the document defined by list of [start index, end index]
281
  Example: "How are you today", if document type is text, matches = [[0,3]]
@@ -311,12 +528,14 @@ def semanticsearchAnnotator(matches: List[List[int]], document):
311
  print(annotated_text)
312
 
313
 
314
- def semantic_keywordsearch(query:Text,documents:List[Document],embedding_model:Text,
 
315
  embedding_model_format:Text,
316
- embedding_layer:int, reader_model:str,
317
- retriever_top_k:int = 10, reader_top_k:int = 10,
318
- return_results:bool = False, embedding_dim:int = 768,
319
- max_seq_len:int = 512):
 
320
  """
321
  Performs the Semantic search on the List of haystack documents which is
322
  returned by preprocessing Pipeline.
@@ -327,7 +546,7 @@ def semantic_keywordsearch(query:Text,documents:List[Document],embedding_model:T
327
  documents: List fo Haystack documents returned by preprocessing pipeline.
328
 
329
  """
330
- semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents,
331
  embedding_model= embedding_model,
332
  embedding_layer= embedding_layer,
333
  embedding_model_format= embedding_model_format,
@@ -335,22 +554,24 @@ def semantic_keywordsearch(query:Text,documents:List[Document],embedding_model:T
335
  reader_top_k= reader_top_k, embedding_dim=embedding_dim,
336
  max_seq_len=max_seq_len)
337
 
338
- results = semanticsearch_pipeline.run(query = query)
339
  if return_results:
340
- return results
341
  else:
342
  if check_streamlit:
343
  st.markdown("##### Top few semantic search results #####")
344
  else:
345
  print("Top few semantic search results")
346
- for i,answer in enumerate(results['answers']):
347
- temp = answer.to_dict()
348
- doc = doc_store.get_document_by_id(temp['document_id']).content
349
- start_idx = doc.find(temp['context'])
350
- end_idx = start_idx + len(temp['context'])
351
- match = [[start_idx,end_idx]]
352
  if check_streamlit:
353
  st.write("Result {}".format(i+1))
354
  else:
355
  print("Result {}".format(i+1))
356
- semanticsearchAnnotator(match, doc)
 
 
1
+ from haystack.nodes import TransformersQueryClassifier, Docs2Answers
2
  from haystack.nodes import EmbeddingRetriever, FARMReader
3
  from haystack.nodes.base import BaseComponent
4
  from haystack.document_stores import InMemoryDocumentStore
5
  from markdown import markdown
6
  from annotated_text import annotation
7
  from haystack.schema import Document
8
+ from typing import List, Text, Union
9
  from typing_extensions import Literal
10
  from utils.preprocessing import processingpipeline
11
  from utils.streamlitcheck import check_streamlit
12
  from haystack.pipelines import Pipeline
13
+ import pandas as pd
14
  import logging
15
  try:
16
  from termcolor import colored
 
38
  Uses Query Classifier from Haystack, process the query based on query type.
39
  Ability to determine the statements is not so good, therefore the chances
40
  statement also get modified. Ex: "List water related issues" will be
41
+ identified by the model as keywords, and therefore it be processed as "what
42
+ are the 'list all water related issues' related issues and discussions?".
43
+ This is one shortcoming but is igonred for now, as semantic search will not
44
+ get affected a lot, by this. If you want to pass keywords list and want to
45
+ do batch processing use. run_batch. Example: if you want to find relevant
46
+ passages for water, food security, poverty then querylist = ["water", "food
47
+ security","poverty"] and then execute QueryCheck.run_batch(queries = querylist)
48
 
49
  1. https://docs.haystack.deepset.ai/docs/query_classifier
50
 
 
52
 
53
  outgoing_edges = 1
54
 
55
+ def run(self, query:str):
56
  """
57
+ mandatory method to use the custom node. Determines the query type, if
58
  if the query is of type keyword/statement will modify it to make it more
59
  useful for sentence transoformers.
60
+
61
+ Params
62
+ --------
63
+ query: query/statement/keywords in form of string
64
+
65
+ Return
66
+ ------
67
+ output: dictionary, with key as identifier and value could be anything
68
+ we need to return. In this case the output contain key = 'query'.
69
+
70
+ output_1: As there is only one outgoing edge, we pass 'output_1' string
71
 
72
  """
73
  query_classifier = loadQueryClassifier()
 
77
  output = {"query":query,
78
  "query_type": 'question/statement'}
79
  else:
80
+ output = {"query": "what are the {} related issues and \
81
+ discussions?".format(query),
82
  "query_type": 'statements/keyword'}
83
  logging.info(output)
84
  return output, "output_1"
85
 
86
+ def run_batch(self, queries:List[str]):
87
+ """
88
+ running multiple queries in one go, howeevr need the queries to be passed
89
+ as list of string. Example: if you want to find relevant passages for
90
+ water, food security, poverty then querylist = ["water", "food security",
91
+ "poverty"] and then execute QueryCheck.run_batch(queries = querylist)
92
+
93
+ Params
94
+ --------
95
+ queries: queries/statements/keywords in form of string encapsulated
96
+ within List
97
+
98
+ Return
99
+ ------
100
+ output: dictionary, with key as identifier and value could be anything
101
+ we need to return. In this case the output contain key = 'queries'.
102
+
103
+ output_1: As there is only one outgoing edge, we pass 'output_1' string
104
+ """
105
+ query_classifier = loadQueryClassifier()
106
+ query_list = []
107
+ for query in queries:
108
+ result = query_classifier.run(query=query)
109
+ if result[1] == "output_1":
110
+ query_list.append(query)
111
+ else:
112
+ query_list.append("what are the {} related issues and \
113
+ discussions?".format(query))
114
+ output = {'queries':query_list}
115
+ logging.info(output)
116
+ return output, "output_1"
117
+
118
 
119
  @st.cache(allow_output_mutation=True)
120
+ def runSemanticPreprocessingPipeline(file_path:str, file_name:str,
121
  split_by: Literal["sentence", "word"] = 'sentence',
122
+ split_length:int = 2, split_overlap:int = 0,
123
+ split_respect_sentence_boundary:bool = False,
124
+ remove_punc:bool = False)->List[Document]:
125
  """
126
  creates the pipeline and runs the preprocessing pipeline.
127
 
 
129
  ------------
130
 
131
  file_name: filename, in case of streamlit application use
132
+ st.session_state['filename']
133
  file_path: filepath, in case of streamlit application use
134
+ st.session_state['filepath']
 
135
  split_by: document splitting strategy either as word or sentence
136
  split_length: when synthetically creating the paragrpahs from document,
137
+ it defines the length of paragraph.
138
+ split_overlap: Number of words or sentences that overlap when creating the
139
+ paragraphs. This is done as one sentence or 'some words' make sense
140
+ when read in together with others. Therefore the overlap is used.
141
  split_respect_sentence_boundary: Used when using 'word' strategy for
142
+ splititng of text.
143
+ remove_punc: to remove all Punctuation including ',' and '.' or not
144
 
145
  Return
146
  --------------
147
  List[Document]: When preprocessing pipeline is run, the output dictionary
148
+ has four objects. For the Haysatck implementation of semantic search we,
149
+ need to use the List of Haystack Document, which can be fetched by
150
+ key = 'documents' on output.
151
 
152
  """
153
 
 
156
  output_semantic_pre = semantic_processing_pipeline.run(file_paths = file_path,
157
  params= {"FileConverter": {"file_path": file_path, \
158
  "file_name": file_name},
159
+ "UdfPreProcessor": {"remove_punc": remove_punc, \
160
  "split_by": split_by, \
161
  "split_length":split_length,\
162
  "split_overlap": split_overlap,
 
165
  return output_semantic_pre
166
 
167
 
168
+ @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
169
+ allow_output_mutation=True)
170
+ def loadRetriever(embedding_model:Text=None, embedding_model_format:Text = None,
171
  embedding_layer:int = None, retriever_top_k:int = 10,
172
+ max_seq_len:int=512, document_store:InMemoryDocumentStore=None):
173
  """
174
  Returns the Retriever model based on params provided.
175
  1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
 
180
  Params
181
  ---------
182
  embedding_model: Name of the model to be used for embedding. Check the links
183
+ provided in documentation
184
+ embedding_model_format: check the github link of Haystack provided in
185
+ documentation embedding_layer: check the github link of Haystack
186
+ provided in documentation retriever_top_k: Number of Top results to
187
+ be returned by
188
+ retriever max_seq_len: everymodel has max seq len it can handle, check in
189
+ model card. Needed to hanlde the edge cases.
190
+ document_store: InMemoryDocumentStore, write haystack Document list to
191
+ DocumentStore and pass the same to function call. Can be done using
192
+ createDocumentStore from utils.
193
 
194
  Return
195
  -------
 
210
  st.session_state['retriever'] = retriever
211
  return retriever
212
 
213
+ @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
214
+ allow_output_mutation=True)
215
  def createDocumentStore(documents:List[Document], similarity:str = 'dot_product',
216
  embedding_dim:int = 768):
217
  """
 
221
  Params
222
  -------
223
  documents: List of haystack document. If using the preprocessing pipeline,
224
+ can be fetched key = 'documents; on output of preprocessing pipeline.
225
  similarity: scoring function, can be either 'cosine' or 'dot_product'
226
  embedding_dim: Document store has default value of embedding size = 768, and
227
+ update_embeddings method of Docstore cannot infer the embedding size of
228
+ retiever automatically, therefore set this value as per the model card.
229
 
230
  Return
231
  -------
 
239
  return document_store
240
 
241
 
242
+ @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
243
+ allow_output_mutation=True)
244
  def semanticSearchPipeline(documents:List[Document], embedding_model:Text = None,
245
+ embedding_model_format:Text = None,embedding_layer:int = None,
246
+ embedding_dim:int = 768,retriever_top_k:int = 10,
247
+ reader_model:str = None, reader_top_k:int = 10,
248
+ max_seq_len:int =512,useQueryCheck = True, ):
 
249
  """
250
  creates the semantic search pipeline and document Store object from the
251
  list of haystack documents. The top_k for the Reader and Retirever are kept
 
255
  and to some extent extractive QA purpose. The purpose of Reader is strictly to
256
  highlight the context for retrieved result and not for QA, however as stated
257
  it can work for QA too in limited sense.
258
+ There are 4 variants of pipeline it can return
259
+ 1.QueryCheck > Retriever > Reader
260
+ 2.Retriever > Reader
261
+ 3.QueryCheck > Retriever > Docs2Answers : If reader is None,
262
+ then Doc2answer is used to keep the output of pipeline structurally same.
263
+ 4.Retriever > Docs2Answers
264
+
265
+ Links
266
 
267
  1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
268
  2. https://www.sbert.net/examples/applications/semantic-search/README.html
 
270
  4. https://docs.haystack.deepset.ai/docs/reader
271
 
272
 
 
273
  Params
274
  ----------
275
  documents: list of Haystack Documents, returned by preprocessig pipeline.
276
  embedding_model: Name of the model to be used for embedding. Check the links
277
+ provided in documentation
278
+ embedding_model_format: check the github link of Haystack provided in
279
+ documentation
280
  embedding_layer: check the github link of Haystack provided in documentation
281
+ embedding_dim: Document store has default value of embedding size = 768, and
282
+ update_embeddings method of Docstore cannot infer the embedding size of
283
+ retiever automatically, therefore set this value as per the model card.
284
  retriever_top_k: Number of Top results to be returned by retriever
285
  reader_model: Name of the model to be used for Reader node in hasyatck
286
+ Pipeline. Check the links provided in documentation
287
  reader_top_k: Reader will use retrieved results to further find better matches.
288
+ As purpose here is to use reader to extract context, the value is
289
+ same as retriever_top_k.
 
 
 
 
290
  max_seq_len:everymodel has max seq len it can handle, check in model card.
291
+ Needed to hanlde the edge cases
292
+ useQueryCheck: Whether to use the querycheck which modifies the query or not.
293
 
294
 
295
  Return
296
  ---------
297
  semanticsearch_pipeline: Haystack Pipeline object, with all the necessary
298
+ nodes [QueryCheck, Retriever, Reader/Docs2Answer]. If reader is None,
299
+ then Doc2answer is used to keep the output of pipeline structurally
300
+ same.
301
 
302
  document_store: As retriever can work only with Haystack Document Store, the
303
+ list of document returned by preprocessing pipeline are fed into to
304
+ get InMemmoryDocumentStore object type, with retriever updating the
305
+ embeddings of each paragraph in document store.
306
 
307
  """
308
  document_store = createDocumentStore(documents=documents,
 
312
  embedding_layer=embedding_layer,
313
  retriever_top_k= retriever_top_k,
314
  document_store = document_store,
315
+ max_seq_len=max_seq_len)
 
316
  document_store.update_embeddings(retriever)
 
 
317
  semantic_search_pipeline = Pipeline()
318
  if useQueryCheck and reader_model:
319
  querycheck = QueryCheck()
320
+ reader = FARMReader(model_name_or_path=reader_model,
321
+ top_k = reader_top_k, use_gpu=True)
322
+ semantic_search_pipeline.add_node(component = querycheck,
323
+ name = "QueryCheck",inputs = ["Query"])
324
+ semantic_search_pipeline.add_node(component = retriever,
325
+ name = "EmbeddingRetriever",inputs = ["QueryCheck.output_1"])
326
  semantic_search_pipeline.add_node(component = reader, name = "FARMReader",
327
  inputs= ["EmbeddingRetriever"])
328
+
329
  elif reader_model :
330
+ reader = FARMReader(model_name_or_path=reader_model,
331
+ top_k = reader_top_k, use_gpu=True)
332
+ semantic_search_pipeline.add_node(component = retriever,
333
+ name = "EmbeddingRetriever",inputs = ["Query"])
334
+ semantic_search_pipeline.add_node(component = reader,
335
+ name = "FARMReader",inputs= ["EmbeddingRetriever"])
336
+ elif useQueryCheck and not reader_model:
337
+ querycheck = QueryCheck()
338
+ docs2answers = Docs2Answers()
339
+ semantic_search_pipeline.add_node(component = querycheck,
340
+ name = "QueryCheck",inputs = ["Query"])
341
+ semantic_search_pipeline.add_node(component = retriever,
342
+ name = "EmbeddingRetriever",inputs = ["QueryCheck.output_1"])
343
+ semantic_search_pipeline.add_node(component = docs2answers,
344
+ name = "Docs2Answers",inputs= ["EmbeddingRetriever"])
345
+ elif not useQueryCheck and not reader_model:
346
+ docs2answers = Docs2Answers()
347
+ semantic_search_pipeline.add_node(component = retriever,
348
+ name = "EmbeddingRetriever",inputs = ["Query"])
349
+ semantic_search_pipeline.add_node(component = docs2answers,
350
+ name = "Docs2Answers",inputs= ["EmbeddingRetriever"])
351
+
352
+ logging.info(semantic_search_pipeline.components)
353
+ return semantic_search_pipeline, document_store
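Illustrative sketch, not part of the diff: how the four pipeline variants listed in the docstring are selected through reader_model and useQueryCheck (model names are placeholders; embedding_dim must match the embedding model, here 384 for the MiniLM placeholder):

from haystack.schema import Document
from utils.semantic_search import semanticSearchPipeline

documents = [Document(content="Access to clean water is addressed in chapter 3.")]
common = dict(documents=documents,
              embedding_model="sentence-transformers/all-MiniLM-L6-v2",  # placeholder, 384-dim
              embedding_model_format="sentence_transformers",
              embedding_dim=384, retriever_top_k=10, max_seq_len=512)

# variant 1: QueryCheck > Retriever > Reader
pipe, store = semanticSearchPipeline(reader_model="deepset/tinyroberta-squad2",  # placeholder reader
                                     useQueryCheck=True, **common)
# variant 4: Retriever > Docs2Answers (no reader, query used as-is)
pipe, store = semanticSearchPipeline(reader_model=None, useQueryCheck=False, **common)
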
354
+
355
+ def runSemanticPipeline(pipeline:Pipeline, queries:Union[list,str])->dict:
356
+ """
357
+ will use the haystack run or run_batch based on if single query is passed
358
+ as string or multiple queries as List[str]
359
+
360
+ Params
361
+ -------
362
+ pipeline: haystack pipeline, this is same as returned by semanticSearchPipeline
363
+ from utils.semanticsearch
364
+
365
+ queries: Either a single query or list of queries.
366
+
367
+ Return
368
+ -------
369
+ results: Dict containing answers and documents as key and their respective
370
+ values
371
+
372
+ """
373
+
374
+ if type(queries) == list:
375
+ results = pipeline.run_batch(queries=queries)
376
+ elif type(queries) == str:
377
+ results = pipeline.run(query=queries)
378
  else:
379
+ logging.info("Please check the input type for the queries")
380
+ return
381
 
382
+ return results
383
 
384
+ def process_query_output(results:dict)->pd.DataFrame:
385
+ """
386
+ Returns the dataframe with necessary information like including
387
+ ['query','answer','answer_offset','context_offset','context','content',
388
+ 'reader_score','retriever_score','id',]. This is designed for output given
389
+ by semantic search pipeline with single query and final node as reader.
390
+ The output of pipeline having Docs2Answers as final node or multiple queries
391
+ need to be handled separately. In these other cases, use process_semantic_output
392
+ from utils.semantic_search which uses this function internally to make one
393
+ combined dataframe.
394
+
395
+ Params
396
+ ---------
397
+ results: this dictionary should have key,values with
398
+ keys = [query,answers,documents], however answers is optional.
399
+ in case of [Doc2Answers as final node], process_semantic_output
400
+ doesnt return answers thereby setting all values contained in
401
+ answers to 'None'
402
+
403
+ Return
404
+ --------
405
+ df: dataframe with all the columns mentioned in function description.
406
+
407
+ """
408
+ query_text = results['query']
409
+ if 'answers' in results.keys():
410
+ answer_dict = {}
411
 
412
+ for answer in results['answers']:
413
+ answer_dict[answer.document_id] = answer.to_dict()
414
+ else:
415
+ answer_dict = {}
416
+ docs = results['documents']
417
+ df = pd.DataFrame(columns=['query','answer','answer_offset','context_offset',
418
+ 'context','content','reader_score','retriever_score',
419
+ 'id'])
420
+ for doc in docs:
421
+ row_list = {}
422
+ row_list['query'] = query_text
423
+ row_list['retriever_score'] = doc.score
424
+ row_list['id'] = doc.id
425
+ row_list['content'] = doc.content
426
+ if doc.id in answer_dict.keys():
427
+ row_list['answer'] = answer_dict[doc.id]['answer']
428
+ row_list['context'] = answer_dict[doc.id]['context']
429
+ row_list['reader_score'] = answer_dict[doc.id]['score']
430
+ answer_offset = answer_dict[doc.id]['offsets_in_document'][0]
431
+ row_list['answer_offset'] = [answer_offset['start'],answer_offset['end']]
432
+ start_idx = doc.content.find(row_list['context'])
433
+ end_idx = start_idx + len(row_list['context'])
434
+ row_list['context_offset'] = [start_idx, end_idx]
435
+ else:
436
+ row_list['answer'] = None
437
+ row_list['context'] = None
438
+ row_list['reader_score'] = None
439
+ row_list['answer_offset'] = None
440
+ row_list['context_offset'] = None
441
+ df_dictionary = pd.DataFrame([row_list])
442
+ df = pd.concat([df, df_dictionary], ignore_index=True)
443
+
444
+ return df
445
+
446
+ def process_semantic_output(results):
447
+ """
448
+ Returns the dataframe with necessary information like including
449
+ ['query','answer','answer_offset','context_offset','context','content',
450
+ 'reader_score','retriever_score','id',]. Distingushes if its single query or
451
+ multi queries by reading the pipeline output dictionary keys.
452
+ Uses the process_query_output to get the dataframe for each query and create
453
+ one concataneted dataframe. In case f Docs2Answers as final node, deletes
454
+ the answers part. See documentations of process_query_output.
455
+
456
+ Params
457
+ ---------
458
+ results: raw output of runSemanticPipeline.
459
+
460
+ Return
461
+ --------
462
+ df: dataframe with all the columns mentioned in function description.
463
+
464
+ """
465
+ output = {}
466
+ if 'query' in results.keys():
467
+ output['query'] = results['query']
468
+ output['documents'] = results['documents']
469
+ if results['node_id'] == 'Docs2Answers':
470
+ pass
471
+ else:
472
+ output['answers'] = results['answers']
473
+ df = process_query_output(output)
474
+ return df
475
+ if 'queries' in results.keys():
476
+ df = pd.DataFrame(columns=['query','answer','answer_offset',
477
+ 'context_offset','context','content',
478
+ 'reader_score','retriever_score','id'])
479
+ for query,answers,documents in zip(results['queries'],
480
+ results['answers'],results['documents']):
481
+ output = {}
482
+ output['query'] = query
483
+ output['documents'] = documents
484
+ if results['node_id'] == 'Docs2Answers':
485
+ pass
486
+ else:
487
+ output['answers'] = answers
488
+
489
+ temp = process_query_output(output)
490
+ df = pd.concat([df, temp], ignore_index=True)
491
+
492
+
493
+ return df
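Illustrative sketch, not part of the diff: feeding a pipeline built with semanticSearchPipeline (the pipe object from the sketch above) through the two new helpers; the query strings are placeholders:

# a single query string goes to pipeline.run, a list of queries to pipeline.run_batch
raw = runSemanticPipeline(pipeline=pipe, queries=["water scarcity", "food security"])
results_df = process_semantic_output(raw)
# columns: query, answer, answer_offset, context_offset, context, content,
# reader_score, retriever_score, id
top = results_df.sort_values(by=["retriever_score"], ascending=False).head(10)
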
494
 
495
+ def semanticsearchAnnotator(matches:List[List[int]], document:Text):
496
  """
497
  Annotates the text in the document defined by list of [start index, end index]
498
  Example: "How are you today", if document type is text, matches = [[0,3]]
 
528
  print(annotated_text)
529
 
530
 
531
+ def semantic_keywordsearch(query:Text,documents:List[Document],
532
+ embedding_model:Text,
533
  embedding_model_format:Text,
534
+ embedding_layer:int, reader_model:str,
535
+ retriever_top_k:int = 10, reader_top_k:int = 10,
536
+ return_results:bool = False, embedding_dim:int = 768,
537
+ max_seq_len:int = 512,
538
+ sort_by:Literal["retriever", "reader"] = 'retriever'):
539
  """
540
  Performs the Semantic search on the List of haystack documents which is
541
  returned by preprocessing Pipeline.
 
546
  documents: List fo Haystack documents returned by preprocessing pipeline.
547
 
548
  """
549
+ semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents = documents,
550
  embedding_model= embedding_model,
551
  embedding_layer= embedding_layer,
552
  embedding_model_format= embedding_model_format,
 
554
  reader_top_k= reader_top_k, embedding_dim=embedding_dim,
555
  max_seq_len=max_seq_len)
556
 
557
+ raw_output = runSemanticPipeline(semanticsearch_pipeline,query)
558
+ results_df = process_semantic_output(raw_output)
559
+ if sort_by == 'retriever':
560
+ results_df = results_df.sort_values(by=['retriever_score'], ascending=False)
561
+ else:
562
+ results_df = results_df.sort_values(by=['reader_score'], ascending=False)
563
+
564
  if return_results:
565
+ return results_df
566
  else:
567
  if check_streamlit:
568
  st.markdown("##### Top few semantic search results #####")
569
  else:
570
  print("Top few semantic search results")
571
+ for i in range(len(results_df)):
572
  if check_streamlit:
573
  st.write("Result {}".format(i+1))
574
  else:
575
  print("Result {}".format(i+1))
576
+ semanticsearchAnnotator(results_df.loc[i]['context_offset'],
577
+ results_df.loc[i]['content'] )
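
For illustration, a top-level call exercising the new sort_by option (the document, model names and query are placeholders, not taken from this commit):

from haystack.schema import Document
from utils.semantic_search import semantic_keywordsearch

documents = [Document(content="Access to clean water is addressed in chapter 3.")]
semantic_keywordsearch(query="water scarcity",
                       documents=documents,
                       embedding_model="sentence-transformers/all-MiniLM-L6-v2",  # placeholder, 384-dim
                       embedding_model_format="sentence_transformers",
                       embedding_layer=None,
                       reader_model="deepset/tinyroberta-squad2",  # placeholder reader
                       retriever_top_k=10, reader_top_k=10,
                       embedding_dim=384, max_seq_len=512,
                       sort_by="retriever")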