Commit d7ce857, committed by prashant
1 Parent(s): 7af394d

update

Files changed:
- appStore/keyword_search.py +10 -9
- appStore/sdg_analysis.py +7 -7
- utils/keyword_extraction.py +3 -3
- utils/lexical_search.py +14 -3
- utils/preprocessing.py +8 -8
- utils/sdg_classifier.py +10 -10
- utils/semantic_search.py +38 -25
appStore/keyword_search.py CHANGED

@@ -6,7 +6,7 @@ import streamlit as st
 import json
 import logging
 from utils.lexical_search import runLexicalPreprocessingPipeline, lexical_search
-from utils.semantic_search import runSemanticPreprocessingPipeline, …
+from utils.semantic_search import runSemanticPreprocessingPipeline, semantic_keywordsearch
 from utils.checkconfig import getconfig

 # Declare all the necessary variables
@@ -21,6 +21,7 @@ embedding_model = config.get('semantic_search','RETRIEVER')
 embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
 embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
 embedding_dim = int(config.get('semantic_search','EMBEDDING_DIM'))
+max_seq_len = int(config.get('semantic_search','MAX_SEQ_LENGTH'))
 retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
 reader_model = config.get('semantic_search','READER')
 reader_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
@@ -100,7 +101,7 @@ def app():
     if 'filepath' in st.session_state:

         if searchtype:
-            …
+            all_documents = runLexicalPreprocessingPipeline(
                                 file_name=st.session_state['filename'],
                                 file_path=st.session_state['filepath'],
                                 split_by=lexical_split_by,
@@ -110,13 +111,12 @@ def app():
             logging.info("performing lexical search")
             with st.spinner("Performing Exact matching search \
                             (Lexical search) for you"):
-                st.markdown("##### Top few lexical search (TFIDF) hits #####")
                 lexical_search(
                     query=queryList,
-                    documents = …
+                    documents = all_documents['documents'],
                     top_k = lexical_top_k )
         else:
-            …
+            all_documents = runSemanticPreprocessingPipeline(
                                 file_path= st.session_state['filepath'],
                                 file_name = st.session_state['filename'],
                                 split_by=split_by,
@@ -124,20 +124,21 @@ def app():
                                 split_overlap=split_overlap,
                                 removePunc= remove_punc,
                                 split_respect_sentence_boundary=split_respect_sentence_boundary)
-            if len(…
+            if len(all_documents['documents']) > 100:
                 warning_msg = ": This might take sometime, please sit back and relax."
             else:
                 warning_msg = ""

             logging.info("starting semantic search")
             with st.spinner("Performing Similar/Contextual search{}".format(warning_msg)):
-                …
-                    documents = …
+                semantic_keywordsearch(query = queryList,
+                    documents = all_documents['documents'],
                     embedding_model=embedding_model,
                     embedding_layer=embedding_layer,
                     embedding_model_format=embedding_model_format,
                     reader_model=reader_model,reader_top_k=reader_top_k,
-                    retriever_top_k=retriever_top_k, embedding_dim=embedding_dim…
+                    retriever_top_k=retriever_top_k, embedding_dim=embedding_dim,
+                    max_seq_len=max_seq_len)

     else:
         st.info("🤔 No document found, please try to upload it at the sidebar!")
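Taken together, the keyword_search.py changes thread the preprocessing output (now a dict with a 'documents' key) into both search paths and pass the new MAX_SEQ_LENGTH config value through to the retriever. A minimal sketch of the new call flow; the model names and numbers below are illustrative stand-ins for the app's actual paramconfig entries, not the committed values:

```python
from utils.semantic_search import (runSemanticPreprocessingPipeline,
                                   semantic_keywordsearch)

# Stand-in values; the app reads these from paramconfig's
# [semantic_search] section.
embedding_model = "sentence-transformers/all-mpnet-base-v2"   # assumption
embedding_model_format = "sentence_transformers"              # assumption
embedding_layer = None
embedding_dim = 768
max_seq_len = 384
retriever_top_k = 10
reader_model = "deepset/tinyroberta-squad2"                   # assumption
reader_top_k = 10

# Preprocessing now returns a dict; the haystack Documents live
# under the 'documents' key.
all_documents = runSemanticPreprocessingPipeline(
    file_path="sample.pdf", file_name="sample.pdf",
    split_by="sentence", split_length=2, remove_punc=False)

semantic_keywordsearch(
    query="water scarcity",
    documents=all_documents['documents'],
    embedding_model=embedding_model,
    embedding_layer=embedding_layer,
    embedding_model_format=embedding_model_format,
    reader_model=reader_model, reader_top_k=reader_top_k,
    retriever_top_k=retriever_top_k,
    embedding_dim=embedding_dim,
    max_seq_len=max_seq_len)
```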
appStore/sdg_analysis.py CHANGED

@@ -93,31 +93,31 @@ def app():
             file_path = st.session_state['filepath']
             classifier = load_sdgClassifier(classifier_name=model_name)
             st.session_state['sdg_classifier'] = classifier
-            …
+            all_documents = runSDGPreprocessingPipeline(fileName= file_name,
                             filePath= file_path, split_by= split_by,
                             split_length= split_length,
                             split_overlap= split_overlap,
                             split_respect_sentence_boundary= split_respect_sentence_boundary,
-                            …
+                            remove_punc= remove_punc)

-            if len(…
+            if len(all_documents['documents']) > 100:
                 warning_msg = ": This might take sometime, please sit back and relax."
             else:
                 warning_msg = ""

             with st.spinner("Running SDG Classification{}".format(warning_msg)):

-                df, x = sdg_classification(…
+                df, x = sdg_classification(haystack_doc=all_documents['documents'],
                                            threshold= threshold)
                 df = df.drop(['Relevancy'], axis = 1)
                 sdg_labels = x.SDG.unique()[::-1]
-                …
+                textrank_keyword_list = []
                 for label in sdg_labels:
                     sdgdata = " ".join(df[df.SDG == label].text.to_list())
                     textranklist_ = textrank(textdata=sdgdata, words= top_n)
                     if len(textranklist_) > 0:
-                        …
-                        tRkeywordsDf = pd.DataFrame(…
+                        textrank_keyword_list.append({'SDG':label, 'TextRank Keywords':",".join(textranklist_)})
+                tRkeywordsDf = pd.DataFrame(textrank_keyword_list)


                 plt.rcParams['font.size'] = 25
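The sdg_analysis.py hunk replaces per-iteration DataFrame construction with an accumulate-then-build pattern: TextRank keywords for each SDG label are collected as dicts and turned into a DataFrame once. A self-contained sketch of that pattern; the labels and keywords here are illustrative, not app output:

```python
import pandas as pd

# Accumulate one dict per SDG label, then build the frame once.
textrank_keyword_list = []
for label, keywords in [('SDG 6 - Clean water and sanitation', ['water', 'sanitation']),
                        ('SDG 13 - Climate action', ['climate', 'emissions'])]:
    textrank_keyword_list.append({'SDG': label,
                                  'TextRank Keywords': ",".join(keywords)})

tRkeywordsDf = pd.DataFrame(textrank_keyword_list)
print(tRkeywordsDf)
```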
utils/keyword_extraction.py CHANGED

@@ -58,7 +58,7 @@ def extract_topn_from_vector(feature_names, sorted_items, top_n=10):
     return results


-def tfidfKeyword(textdata, vectorizer, tfidfmodel, top_n):
+def tfidf_keyword(textdata, vectorizer, tfidfmodel, top_n):
     """
     TFIDF based keywords extraction

@@ -81,7 +81,7 @@ def tfidfKeyword(textdata, vectorizer, tfidfmodel, top_n):
     keywords = [keyword for keyword in results]
     return keywords

-def keywordExtraction(sdg:int,sdgdata:List[Text]):
+def keyword_extraction(sdg:int,sdgdata:List[Text], top_n:int=10):
     """
     TFIDF based keywords extraction

@@ -102,7 +102,7 @@ def keywordExtraction(sdg:int,sdgdata:List[Text]):
     features = vectorizer.get_feature_names_out()
     tf_idf_vector=tfidfmodel.transform(vectorizer.transform(sdgdata))
     sorted_items=sort_coo(tf_idf_vector.tocoo())
-    top_n = …
+    top_n = top_n
     results=extract_topn_from_vector(features,sorted_items,top_n)
     keywords = [keyword for keyword in results]
     return keywords
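The keyword_extraction.py changes are renames to snake_case (tfidfKeyword to tfidf_keyword, keywordExtraction to keyword_extraction) plus an explicit top_n parameter; note the `top_n = top_n` line the commit leaves behind is a no-op self-assignment. For orientation, a standalone sketch of the TF-IDF top-n idea these functions implement, written against plain scikit-learn with an illustrative corpus:

```python
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["Access to clean water and sanitation for all",
          "Urgent action to combat climate change and its impacts"]

vectorizer = TfidfVectorizer(stop_words="english")
tfidf = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names_out()

top_n = 3
for row in tfidf:                      # one sparse row per input text
    coo = row.tocoo()                  # (column index, score) pairs
    ranked = sorted(zip(coo.col, coo.data), key=lambda x: x[1], reverse=True)
    print([features[idx] for idx, _ in ranked[:top_n]])
```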
utils/lexical_search.py CHANGED

@@ -25,7 +25,7 @@ except ImportError:

 def runLexicalPreprocessingPipeline(file_path,file_name,
                 split_by: Literal["sentence", "word"] = 'word',
-                split_length:int = 80,
+                split_length:int = 80, remove_punc:bool = False,
                 split_overlap:int = 0 )->List[Document]:
     """
     creates the pipeline and runs the preprocessing pipeline,
@@ -61,7 +61,7 @@ def runLexicalPreprocessingPipeline(file_path,file_name,
     output_lexical_pre = lexical_processing_pipeline.run(file_paths = file_path,
                             params= {"FileConverter": {"file_path": file_path, \
                                         "file_name": file_name},
-                                     "UdfPreProcessor": {"…
+                                     "UdfPreProcessor": {"remove_punc": remove_punc, \
                                         "split_by": split_by, \
                                         "split_length":split_length,\
                                         "split_overlap": split_overlap}})
@@ -223,12 +223,23 @@ def lexical_search(query:Text,top_k:int, documents:List[Document]):
     retriever = TfidfRetriever(document_store)
     results = retriever.retrieve(query=query, top_k = top_k)
     query_tokens = tokenize_lexical_query(query)
+    flag = True
     for count, result in enumerate(results):
         matches, doc = runSpacyMatcher(query_tokens,result.content)
+
         if len(matches) != 0:
+            if flag:
+                flag = False
+                if check_streamlit:
+                    st.markdown("##### Top few lexical search (TFIDF) hits #####")
+                else:
+                    print("Top few lexical search (TFIDF) hits")
+
             if check_streamlit():
                 st.write("Result {}".format(count+1))
             else:
                 print("Results {}".format(count +1))
             spacyAnnotator(matches, doc)
-            …
+
+    if flag:
+        st.info("🤔 No relevant result found. Please try another keyword.")
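The lexical_search hunk introduces a first-hit flag: the heading prints once before the first match, and a fallback message shows when nothing matched at all. A minimal standalone sketch of the pattern; the results list and print calls are stand-ins for the retriever output and the Streamlit/annotator rendering:

```python
results = ["first matching passage", "second matching passage"]  # stand-ins

flag = True                      # True until the first hit is rendered
for count, result in enumerate(results):
    if flag:
        flag = False
        print("Top few lexical search (TFIDF) hits")
    print("Result {}".format(count + 1), result)

if flag:                         # no iteration ever cleared the flag
    print("No relevant result found. Please try another keyword.")
```

One observation on the committed block: the heading branch tests `check_streamlit` without parentheses, so the function object itself (always truthy) is evaluated rather than its return value, whereas the later `check_streamlit()` in the same function uses the parenthesized call.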
utils/preprocessing.py CHANGED

@@ -120,7 +120,7 @@ class FileConverter(BaseComponent):
         return


-def basic(s, removePunc:bool = False):
+def basic(s, remove_punc:bool = False):

     """
     Performs basic cleaning of text.
@@ -141,7 +141,7 @@ def basic(s, removePunc:bool = False):
     s = re.sub('\n', ' ', s)

     # Remove punctuations
-    if …
+    if remove_punc == True:
         translator = str.maketrans(' ', ' ', string.punctuation)
         s = s.translate(translator)
     # Remove distracting single quotes and dotted pattern
@@ -164,7 +164,7 @@ class UdfPreProcessor(BaseComponent):
    """
    outgoing_edges = 1

-    def run(self, documents:List[Document], …
+    def run(self, documents:List[Document], remove_punc:bool,
             split_by: Literal["sentence", "word"] = 'sentence',
             split_respect_sentence_boundary = False,
             split_length:int = 2, split_overlap:int = 0):
@@ -220,7 +220,7 @@ class UdfPreProcessor(BaseComponent):
             # i = basic(i)
             docs_processed = preprocessor.process([i])
             for item in docs_processed:
-                item.content = basic(item.content, …
+                item.content = basic(item.content, remove_punc= remove_punc)

         df = pd.DataFrame(docs_processed)
         all_text = " ".join(df.content.to_list())
@@ -248,12 +248,12 @@ def processingpipeline():
     """

     preprocessing_pipeline = Pipeline()
-    …
-    …
+    file_converter = FileConverter()
+    custom_preprocessor = UdfPreProcessor()

-    preprocessing_pipeline.add_node(component=…
+    preprocessing_pipeline.add_node(component=file_converter,
                                     name="FileConverter", inputs=["File"])
-    preprocessing_pipeline.add_node(component = …
+    preprocessing_pipeline.add_node(component = custom_preprocessor,
                                     name ='UdfPreProcessor', inputs=["FileConverter"])

     return preprocessing_pipeline
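In preprocessing.py the commit standardizes the parameter name to remove_punc and wires it from UdfPreProcessor.run down into basic(). A self-contained sketch of the punctuation-stripping step itself; the function name and sample string are illustrative:

```python
import re
import string

def clean(s: str, remove_punc: bool = False) -> str:
    s = re.sub('\n', ' ', s)   # collapse hard line breaks, as basic() does
    if remove_punc:
        # str.maketrans's third argument lists characters to delete
        s = s.translate(str.maketrans(' ', ' ', string.punctuation))
    return s

print(clean("Goal 6: clean water, for all!\n", remove_punc=True))
# -> 'Goal 6 clean water for all '
```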
utils/sdg_classifier.py CHANGED

@@ -34,7 +34,7 @@ _lab_dict = {0: 'no_cat',
               17:'SDG 17 - Partnership for the goals',}

 @st.cache(allow_output_mutation=True)
-def load_sdgClassifier(configFile = None, classifier_name = None):
+def load_sdgClassifier(config_file = None, classifier_name = None):
     """
     loads the document classifier using haystack, where the name/path of model
     in HF-hub as string is used to fetch the model object.Either configfile or
@@ -52,11 +52,11 @@ def load_sdgClassifier(configFile = None, classifier_name = None):
     Return: document classifier model
     """
     if not classifier_name:
-        if not …
+        if not config_file:
             logging.warning("Pass either model name or config file")
             return
         else:
-            config = getconfig(…
+            config = getconfig(config_file)
             classifier_name = config.get('sdg','MODEL')

     logging.info("Loading classifier")
@@ -68,8 +68,8 @@ def load_sdgClassifier(configFile = None, classifier_name = None):


 @st.cache(allow_output_mutation=True)
-def sdg_classification(haystackdoc:List[Document],
-                       threshold:float, …
+def sdg_classification(haystack_doc:List[Document],
+                       threshold:float, classifier_model= None)->Tuple[DataFrame,Series]:
     """
     Text-Classification on the list of texts provided. Classifier provides the
     most appropriate label for each text. these labels are in terms of if text
@@ -93,14 +93,14 @@ def sdg_classification(haystackdoc:List[Document],

     """
     logging.info("Working on SDG Classification")
-    if not …
+    if not classifier_model:
         if check_streamlit:
-            …
+            classifier_model = st.session_state['sdg_classifier']
         else:
             logging.warning("No streamlit envinornment found, Pass the classifier")
             return

-    results = …
+    results = classifier_model.predict(haystack_doc)


     labels_= [(l.meta['classification']['label'],
@@ -130,7 +130,7 @@ def runSDGPreprocessingPipeline(filePath, fileName,
                         split_by: Literal["sentence", "word"] = 'sentence',
                         split_respect_sentence_boundary = False,
                         split_length:int = 2, split_overlap = 0,
-                        …
+                        remove_punc = False)->List[Document]:
     """
     creates the pipeline and runs the preprocessing pipeline,
     the params for pipeline are fetched from paramconfig
@@ -163,7 +163,7 @@ def runSDGPreprocessingPipeline(filePath, fileName,
     output_sdg_pre = sdg_processing_pipeline.run(file_paths = filePath,
                             params= {"FileConverter": {"file_path": filePath, \
                                         "file_name": fileName},
-                                     "UdfPreProcessor": {"…
+                                     "UdfPreProcessor": {"remove_punc": remove_punc, \
                                         "split_by": split_by, \
                                         "split_length":split_length,\
                                         "split_overlap": split_overlap, \
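sdg_classifier.py completes the same rename sweep (configFile to config_file, haystackdoc to haystack_doc) and makes the classifier an explicit argument with a Streamlit session-state fallback. A sketch of the name-resolution fallback inside load_sdgClassifier; the helper name and the 'paramconfig.cfg' path are hypothetical:

```python
import logging
from utils.checkconfig import getconfig   # repo helper used in the diff

def resolve_classifier_name(config_file=None, classifier_name=None):
    # Mirror of load_sdgClassifier's guard: need a model name or a config file.
    if not classifier_name:
        if not config_file:
            logging.warning("Pass either model name or config file")
            return None
        config = getconfig(config_file)
        classifier_name = config.get('sdg', 'MODEL')
    return classifier_name

name = resolve_classifier_name(config_file='paramconfig.cfg')  # hypothetical path
```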
utils/semantic_search.py CHANGED

@@ -37,8 +37,8 @@ class QueryCheck(BaseComponent):
     Uses Query Classifier from Haystack, process the query based on query type.
     Ability to determine the statements is not so good, therefore the chances
     statement also get modified. Ex: "List water related issues" will be
-    identified by the model as keywords, and therefore it be processed as "…
-    …
+    identified by the model as keywords, and therefore it be processed as "what are
+    the 'list all water related issues' related issues and discussions?". This is one shortcoming
     but is igonred for now, as semantic search will not get affected a lot, by this.

     1. https://docs.haystack.deepset.ai/docs/query_classifier
@@ -61,7 +61,7 @@ class QueryCheck(BaseComponent):
             output = {"query":query,
                       "query_type": 'question/statement'}
         else:
-            output = {"query": "…
+            output = {"query": "what are the {} related issues and discussions?".format(query),
                       "query_type": 'statements/keyword'}
         logging.info(output)
         return output, "output_1"
@@ -74,7 +74,7 @@ def runSemanticPreprocessingPipeline(file_path, file_name,
                 split_by: Literal["sentence", "word"] = 'sentence',
                 split_respect_sentence_boundary = False,
                 split_length:int = 2, split_overlap = 0,
-                …
+                remove_punc = False)->List[Document]:
     """
     creates the pipeline and runs the preprocessing pipeline.

@@ -106,7 +106,7 @@ def runSemanticPreprocessingPipeline(file_path, file_name,
     output_semantic_pre = semantic_processing_pipeline.run(file_paths = file_path,
                             params= {"FileConverter": {"file_path": file_path, \
                                         "file_name": file_name},
-                                     "UdfPreProcessor": {"…
+                                     "UdfPreProcessor": {"remove_punc": remove_punc, \
                                         "split_by": split_by, \
                                         "split_length":split_length,\
                                         "split_overlap": split_overlap,
@@ -118,7 +118,7 @@ def runSemanticPreprocessingPipeline(file_path, file_name,
 @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
 def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = None,
                   embedding_layer:int = None, retriever_top_k:int = 10,
-                  document_store:InMemoryDocumentStore = None):
+                  max_seq_len:int = 512, document_store:InMemoryDocumentStore = None):
     """
     Returns the Retriever model based on params provided.
     1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
@@ -133,6 +133,8 @@ def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = No
     embedding_model_format: check the github link of Haystack provided in documentation
     embedding_layer: check the github link of Haystack provided in documentation
     retriever_top_k: Number of Top results to be returned by retriever
+    max_seq_len: everymodel has max seq len it can handle, check in model card.
+                 Needed to hanlde the edge cases.
     document_store: InMemoryDocumentStore, write haystack Document list to DocumentStore
                     and pass the same to function call. Can be done using createDocumentStore from utils.

@@ -149,14 +151,15 @@ def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = No
                             embedding_model=embedding_model,top_k = retriever_top_k,
                             document_store = document_store,
                             emb_extraction_layer=embedding_layer, scale_score =True,
-                            model_format=embedding_model_format, use_gpu = True…
+                            model_format=embedding_model_format, use_gpu = True,
+                            max_seq_len = max_seq_len )
     if check_streamlit:
         st.session_state['retriever'] = retriever
     return retriever

 @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
 def createDocumentStore(documents:List[Document], similarity:str = 'dot_product',
-                        …
+                        embedding_dim:int = 768):
     """
     Creates the InMemory Document Store from haystack list of Documents.
     It is mandatory component for Retriever to work in Haystack frame work.
@@ -185,15 +188,20 @@ def createDocumentStore(documents:List[Document], similarity:str = 'dot_product'
 @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
 def semanticSearchPipeline(documents:List[Document], embedding_model:Text = None,
                            useQueryCheck = True, embedding_model_format:Text = None,
+                           max_seq_len:int =512,embedding_dim:int = 768,
                            embedding_layer:int = None, retriever_top_k:int = 10,
-                           reader_model:str = None, reader_top_k:int = 10…
-                           …
+                           reader_model:str = None, reader_top_k:int = 10
+                           ):
     """
     creates the semantic search pipeline and document Store object from the
     list of haystack documents. The top_k for the Reader and Retirever are kept
     same, so that all the results returned by Retriever are used, however the
     context is extracted by Reader for each retrieved result. The querycheck is
-    added as node to process the query.…
+    added as node to process the query. This pipeline is suited for keyword search,
+    and to some extent extractive QA purpose. The purpose of Reader is strictly to
+    highlight the context for retrieved result and not for QA, however as stated
+    it can work for QA too in limited sense.
+
     1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
     2. https://www.sbert.net/examples/applications/semantic-search/README.html
     3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
@@ -218,6 +226,8 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
     embedding_dim: Document store has default value of embedding size = 768, and
                    update_embeddings method of Docstore cannot infer the embedding size of
                    retiever automaticallu, therefore set this value as per the model card.
+    max_seq_len:everymodel has max seq len it can handle, check in model card.
+                Needed to hanlde the edge cases


     Return
@@ -237,27 +247,28 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
                               embedding_model_format=embedding_model_format,
                               embedding_layer=embedding_layer,
                               retriever_top_k= retriever_top_k,
-                              document_store = document_store…
+                              document_store = document_store,
+                              max_seq_len=max_seq_len)

     document_store.update_embeddings(retriever)
     reader = FARMReader(model_name_or_path=reader_model,
                         top_k = reader_top_k, use_gpu=True)
-    …
+    semantic_search_pipeline = Pipeline()
     if useQueryCheck:
         querycheck = QueryCheck()
-        …
+        semantic_search_pipeline.add_node(component = querycheck, name = "QueryCheck",
                                           inputs = ["Query"])
-        …
+        semantic_search_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
                                           inputs = ["QueryCheck.output_1"])
-        …
+        semantic_search_pipeline.add_node(component = reader, name = "FARMReader",
                                           inputs= ["EmbeddingRetriever"])
     else:
-        …
+        semantic_search_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
                                           inputs = ["Query"])
-        …
+        semantic_search_pipeline.add_node(component = reader, name = "FARMReader",
                                           inputs= ["EmbeddingRetriever"])

-    return …
+    return semantic_search_pipeline, document_store


 def semanticsearchAnnotator(matches: List[List[int]], document):
@@ -296,11 +307,12 @@ def semanticsearchAnnotator(matches: List[List[int]], document):
     print(annotated_text)


-def semantic_search(query:Text,documents:List[Document],embedding_model:Text,
+def semantic_keywordsearch(query:Text,documents:List[Document],embedding_model:Text,
                     embedding_model_format:Text,
                     embedding_layer:int, reader_model:str,
                     retriever_top_k:int = 10, reader_top_k:int = 10,
-                    return_results:bool = False, embedding_dim:int = 768…
+                    return_results:bool = False, embedding_dim:int = 768,
+                    max_seq_len:int = 512):
     """
     Performs the Semantic search on the List of haystack documents which is
     returned by preprocessing Pipeline.
@@ -316,7 +328,8 @@ def semantic_search(query:Text,documents:List[Document],embedding_model:Text,
                                      embedding_layer= embedding_layer,
                                      embedding_model_format= embedding_model_format,
                                      reader_model= reader_model, retriever_top_k= retriever_top_k,
-                                     reader_top_k= reader_top_k, embedding_dim=embedding_dim…
+                                     reader_top_k= reader_top_k, embedding_dim=embedding_dim,
+                                     max_seq_len=max_seq_len)

     results = semanticsearch_pipeline.run(query = query)
     if return_results:
@@ -328,10 +341,10 @@ def semantic_search(query:Text,documents:List[Document],embedding_model:Text,
         print("Top few semantic search results")
         for i,answer in enumerate(results['answers']):
             temp = answer.to_dict()
-            start_idx = temp['offsets_in_document'][0]['start']
-            end_idx = temp['offsets_in_document'][0]['end']
-            match = [[start_idx,end_idx]]
             doc = doc_store.get_document_by_id(temp['document_id']).content
+            start_idx = doc.find(temp['context'])
+            end_idx = start_idx + len(temp['context'])
+            match = [[start_idx,end_idx]]
             if check_streamlit:
                 st.write("Result {}".format(i+1))
             else:
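The last semantic_search.py hunk changes how highlight offsets are computed: instead of reading the Reader's offsets_in_document, the answer's context string is located in the full document text. A small sketch of the new computation; the document and context strings are illustrative:

```python
# Locate the Reader's context span inside the full document text.
doc = "Flooding has increased. Water scarcity affects agriculture badly."
context = "Water scarcity affects agriculture"

start_idx = doc.find(context)            # str.find returns -1 on a miss
end_idx = start_idx + len(context)
match = [[start_idx, end_idx]]
print(match, "->", doc[start_idx:end_idx])
```

Since find returns -1 when the context is absent (for example after whitespace normalization), a guard on start_idx before annotating would make both the sketch and the committed loop more robust.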