Spaces:
GIZ
/
Running on CPU Upgrade

prashant committed on
Commit
2663a97
1 Parent(s): fa191c0

retriever update and coherence

Browse files
appStore/coherence.py CHANGED
@@ -4,21 +4,42 @@ sys.path.append('../utils')
4
 
5
  import streamlit as st
6
  import ast
 
 
 
 
 
7
 
8
  # Reading data and Declaring necessary variables
9
  with open('docStore/ndcs/countryList.txt') as dfile:
10
- countryList = dfile.read()
11
  countryList = ast.literal_eval(countryList)
12
  countrynames = list(countryList.keys())
13
 
14
  with open('docStore/ndcs/cca.txt', encoding='utf-8', errors='ignore') as dfile:
15
- cca_sent = dfile.read()
16
  cca_sent = ast.literal_eval(cca_sent)
17
 
18
  with open('docStore/ndcs/ccm.txt', encoding='utf-8', errors='ignore') as dfile:
19
  ccm_sent = dfile.read()
20
  ccm_sent = ast.literal_eval(ccm_sent)
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  def app():
23
 
24
  #### APP INFO #####
@@ -55,6 +76,43 @@ def app():
55
  indicator is based on vector similarities in which only paragraphs \
56
  with similarity above 0.55 to the indicators are considered. """)
57
 
58
- option = st.sidebar.selectbox('Select Country', (countrynames))
59
- countryCode = countryList[option]
60
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  import streamlit as st
6
  import ast
7
+ import logging
8
+ from utils.ndc_explorer import countrySpecificCCA, countrySpecificCCM
9
+ from utils.checkconfig import getconfig
10
+ from utils.semantic_search import runSemanticPreprocessingPipeline
11
+
12
 
13
  # Reading data and Declaring necessary variables
14
  with open('docStore/ndcs/countryList.txt') as dfile:
15
+ countryList = dfile.read()
16
  countryList = ast.literal_eval(countryList)
17
  countrynames = list(countryList.keys())
18
 
19
  with open('docStore/ndcs/cca.txt', encoding='utf-8', errors='ignore') as dfile:
20
+ cca_sent = dfile.read()
21
  cca_sent = ast.literal_eval(cca_sent)
22
 
23
  with open('docStore/ndcs/ccm.txt', encoding='utf-8', errors='ignore') as dfile:
24
  ccm_sent = dfile.read()
25
  ccm_sent = ast.literal_eval(ccm_sent)
26
 
27
# Read the coherence-pipeline parameters (document splitting, retriever and
# reader settings) from the [coherence] section of the shared config file.
config = getconfig('paramconfig.cfg')
split_by = config.get('coherence', 'SPLIT_BY')
split_length = int(config.get('coherence', 'SPLIT_LENGTH'))
split_overlap = int(config.get('coherence', 'SPLIT_OVERLAP'))
split_respect_sentence_boundary = bool(int(config.get('coherence',
                                    'RESPECT_SENTENCE_BOUNDARY')))
remove_punc = bool(int(config.get('coherence', 'REMOVE_PUNC')))
embedding_model = config.get('coherence', 'RETRIEVER')
embedding_model_format = config.get('coherence', 'RETRIEVER_FORMAT')
embedding_layer = int(config.get('coherence', 'RETRIEVER_EMB_LAYER'))
embedding_dim = int(config.get('coherence', 'EMBEDDING_DIM'))
retriever_top_k = int(config.get('coherence', 'RETRIEVER_TOP_K'))
reader_model = config.get('coherence', 'READER')
# Fix: read READER_TOP_K (was RETRIEVER_TOP_K, a copy-paste slip). Both keys
# are currently 10 in paramconfig.cfg, so runtime behavior is unchanged today.
reader_top_k = int(config.get('coherence', 'READER_TOP_K'))
41
+
42
+
43
  def app():
44
 
45
  #### APP INFO #####
 
76
  indicator is based on vector similarities in which only paragraphs \
77
  with similarity above 0.55 to the indicators are considered. """)
78
 
79
+ with st.sidebar:
80
+
81
+ option = st.selectbox('Select Country', (countrynames))
82
+ countryCode = countryList[option]
83
+ st.markdown("---")
84
+
85
+ with st.container():
86
+ if st.button("Check Coherence"):
87
+ sent_cca = countrySpecificCCA(cca_sent,1,countryCode)
88
+ sent_ccm = countrySpecificCCM(ccm_sent,1,countryCode)
89
+
90
+ if 'filepath' in st.session_state:
91
+ allDocuments = runSemanticPreprocessingPipeline(
92
+ file_path= st.session_state['filepath'],
93
+ file_name = st.session_state['filename'],
94
+ split_by=split_by,
95
+ split_length= split_length,
96
+ split_overlap=split_overlap,
97
+ removePunc= remove_punc,
98
+ split_respect_sentence_boundary=split_respect_sentence_boundary)
99
+ genre = st.radio( "Select Category",('Climate Change Adaptation', 'Climate Change Mitigation'))
100
+ if genre == 'Climate Change Adaptation':
101
+ sent_dict = sent_cca
102
+ else:
103
+ sent_dict = sent_ccm
104
+ sent_labels = []
105
+ for key,sent in sent_dict.items():
106
+ sent_labels.append(sent)
107
+ if len(allDocuments['documents']) > 100:
108
+ warning_msg = ": This might take sometime, please sit back and relax."
109
+ else:
110
+ warning_msg = ""
111
+ logging.info("starting Coherence analysis, country selected {}".format(option))
112
+ with st.spinner("Performing Similar/Contextual search{}".format(warning_msg)):
113
+ pass
114
+
115
+
116
+ else:
117
+ st.info("🤔 No document found, please try to upload it at the sidebar!")
118
+ logging.warning("Terminated as no document provided")
appStore/keyword_search.py CHANGED
@@ -20,6 +20,7 @@ remove_punc = bool(int(config.get('semantic_search','REMOVE_PUNC')))
20
  embedding_model = config.get('semantic_search','RETRIEVER')
21
  embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
22
  embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
 
23
  retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
24
  reader_model = config.get('semantic_search','READER')
25
  reader_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
@@ -97,8 +98,7 @@ def app():
97
  logging.warning("Terminated as no keyword provided")
98
  else:
99
  if 'filepath' in st.session_state:
100
-
101
-
102
  if searchtype:
103
  allDocuments = runLexicalPreprocessingPipeline(
104
  file_name=st.session_state['filename'],
@@ -137,7 +137,7 @@ def app():
137
  embedding_layer=embedding_layer,
138
  embedding_model_format=embedding_model_format,
139
  reader_model=reader_model,reader_top_k=reader_top_k,
140
- retriever_top_k=retriever_top_k)
141
 
142
  else:
143
  st.info("🤔 No document found, please try to upload it at the sidebar!")
 
20
  embedding_model = config.get('semantic_search','RETRIEVER')
21
  embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
22
  embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
23
+ embedding_dim = int(config.get('semantic_search','EMBEDDING_DIM'))
24
  retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
25
  reader_model = config.get('semantic_search','READER')
26
  reader_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
 
98
  logging.warning("Terminated as no keyword provided")
99
  else:
100
  if 'filepath' in st.session_state:
101
+
 
102
  if searchtype:
103
  allDocuments = runLexicalPreprocessingPipeline(
104
  file_name=st.session_state['filename'],
 
137
  embedding_layer=embedding_layer,
138
  embedding_model_format=embedding_model_format,
139
  reader_model=reader_model,reader_top_k=reader_top_k,
140
+ retriever_top_k=retriever_top_k, embedding_dim=embedding_dim)
141
 
142
  else:
143
  st.info("🤔 No document found, please try to upload it at the sidebar!")
paramconfig.cfg CHANGED
@@ -8,8 +8,9 @@ REMOVE_PUNC = 0
8
  [semantic_search]
9
  RETRIEVER_TOP_K = 10
10
  MAX_SEQ_LENGTH = 64
11
- RETRIEVER = msmarco-bert-base-dot-v5
12
  RETRIEVER_FORMAT = sentence_transformers
 
13
  RETRIEVER_EMB_LAYER = -1
14
  READER = deepset/tinyroberta-squad2
15
  READER_TOP_K = 10
@@ -30,9 +31,21 @@ SPLIT_OVERLAP = 10
30
  RESPECT_SENTENCE_BOUNDARY = 1
31
  TOP_KEY = 15
32
 
33
- [preprocessor]
34
- SPLIT_OVERLAP_WORD = 10
35
- SPLIT_OVERLAP_SENTENCE = 1
36
-
37
  [tfidf]
38
  TOP_N = 20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  [semantic_search]
9
  RETRIEVER_TOP_K = 10
10
  MAX_SEQ_LENGTH = 64
11
+ RETRIEVER = multi-qa-distilbert-dot-v1
12
  RETRIEVER_FORMAT = sentence_transformers
13
+ EMBEDDING_DIM = 768
14
  RETRIEVER_EMB_LAYER = -1
15
  READER = deepset/tinyroberta-squad2
16
  READER_TOP_K = 10
 
31
  RESPECT_SENTENCE_BOUNDARY = 1
32
  TOP_KEY = 15
33
 
 
 
 
 
34
  [tfidf]
35
  TOP_N = 20
36
+
37
+ [coherence]
38
+ RETRIEVER_TOP_K = 10
39
+ MAX_SEQ_LENGTH = 64
40
+ RETRIEVER = all-MiniLM-L6-v2
41
+ RETRIEVER_FORMAT = sentence_transformers
42
+ RETRIEVER_EMB_LAYER = -1
43
+ EMBEDDING_DIM = 384
44
+ READER = deepset/tinyroberta-squad2
45
+ READER_TOP_K = 10
46
+ THRESHOLD = 0.55
47
+ SPLIT_BY = sentence
48
+ SPLIT_LENGTH = 3
49
+ SPLIT_OVERLAP = 0
50
+ RESPECT_SENTENCE_BOUNDARY = 1
51
+ REMOVE_PUNC = 0
utils/ndc_explorer.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import urllib.request
3
+ import json
4
+
5
+ link = "https://klimalog.die-gdi.de/ndc/open-data/dataset.json"
6
def get_document(countryCode: str, data=None):
    """Fetch the klimalog NDC open-data dump and pivot one country's entries.

    Params
    -------
    countryCode: country code used as a key inside the dataset's 'NDCs'
        section.
    data: optional already-parsed dataset dict (same shape as the JSON served
        at `link`). When None, the dataset is downloaded. Passing it avoids a
        re-download on repeated calls and makes the function testable offline.

    Return
    -------
    dict mapping each category name to a dict of
    {subcategory: classification-or-metadata value} for that country, or
    None when the dataset has no 'NDCs' section or no entry for countryCode.
    """
    if data is None:
        with urllib.request.urlopen(link) as urlfile:
            data = json.loads(urlfile.read())

    country_entries = data.get('NDCs', {}).get(countryCode)
    if country_entries is None:
        return None

    # These keys hold plain metadata values; every other key wraps its value
    # in a dict carrying a 'classification' entry.
    metadata_keys = ('country_name', 'region_id', 'region_name')
    flat = {}
    for key, value in country_entries.items():
        flat[key] = value if key in metadata_keys else value['classification']

    # Group the per-subcategory values under their parent category.
    country = {category: {} for category in data['categories']}
    for sub_key, sub in data['subcategories'].items():
        country[sub['category']][sub_key] = flat[sub_key]

    return country
34
+
35
+ # country_ndc = get_document('NDCs', countryList[option])
36
+
37
def countrySpecificCCA(cca_sent, threshold, countryCode, doc=None):
    """Select climate-change-adaptation sentences relevant for one country.

    Params
    -------
    cca_sent: dict of indicator -> {'id': {classification_id: sentence}}.
    threshold: classification ids strictly greater than this are kept.
    countryCode: country whose NDC classification is looked up.
    doc: optional pre-fetched result of get_document(countryCode). Pass it to
        reuse one download when calling both countrySpecificCCA and
        countrySpecificCCM; when None it is fetched here (original behavior).

    Return
    -------
    dict of indicator -> sentence for classifications above the threshold.
    """
    if doc is None:
        doc = get_document(countryCode)
    selected = {}
    for indicator, sentences in cca_sent.items():
        class_id = doc['climate change adaptation'][indicator]['id']
        if class_id > threshold:
            selected[indicator] = sentences['id'][class_id]
    return selected
45
+
46
+
47
def countrySpecificCCM(ccm_sent, threshold, countryCode, doc=None):
    """Select climate-change-mitigation sentences relevant for one country.

    Params
    -------
    ccm_sent: dict of indicator -> {'id': {classification_id: sentence}}.
    threshold: classification ids strictly greater than this are kept.
    countryCode: country whose NDC classification is looked up.
    doc: optional pre-fetched result of get_document(countryCode). Pass it to
        reuse one download when calling both countrySpecificCCA and
        countrySpecificCCM; when None it is fetched here (original behavior).

    Return
    -------
    dict of indicator -> sentence for classifications above the threshold.
    """
    if doc is None:
        doc = get_document(countryCode)
    selected = {}
    for indicator, sentences in ccm_sent.items():
        class_id = doc['climate change mitigation'][indicator]['id']
        if class_id > threshold:
            selected[indicator] = sentences['id'][class_id]
    return selected
utils/semantic_search.py CHANGED
@@ -63,6 +63,7 @@ class QueryCheck(BaseComponent):
63
  else:
64
  output = {"query": "find all issues related to {}".format(query),
65
  "query_type": 'statements/keyword'}
 
66
  return output, "output_1"
67
 
68
  def run_batch(self, query):
@@ -154,7 +155,8 @@ def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = No
154
  return retriever
155
 
156
  @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
157
- def createDocumentStore(documents:List[Document], similarity:str = 'cosine'):
 
158
  """
159
  Creates the InMemory Document Store from haystack list of Documents.
160
  It is mandatory component for Retriever to work in Haystack frame work.
@@ -164,13 +166,17 @@ def createDocumentStore(documents:List[Document], similarity:str = 'cosine'):
164
  documents: List of haystack document. If using the preprocessing pipeline,
165
  can be fetched key = 'documents; on output of preprocessing pipeline.
166
  similarity: scoring function, can be either 'cosine' or 'dot_product'
 
 
 
167
 
168
  Return
169
  -------
170
  document_store: InMemory Document Store object type.
171
 
172
  """
173
- document_store = InMemoryDocumentStore(similarity = similarity)
 
174
  document_store.write_documents(documents)
175
 
176
  return document_store
@@ -178,9 +184,10 @@ def createDocumentStore(documents:List[Document], similarity:str = 'cosine'):
178
 
179
  @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
180
  def semanticSearchPipeline(documents:List[Document], embedding_model:Text = None,
181
- embedding_model_format:Text = None,
182
  embedding_layer:int = None, retriever_top_k:int = 10,
183
- reader_model:str = None, reader_top_k:int = 10):
 
184
  """
185
  creates the semantic search pipeline and document Store object from the
186
  list of haystack documents. The top_k for the Reader and Retirever are kept
@@ -207,6 +214,11 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
207
  reader_top_k: Reader will use retrieved results to further find better matches.
208
  As purpose here is to use reader to extract context, the value is
209
  same as retriever_top_k.
 
 
 
 
 
210
 
211
  Return
212
  ---------
@@ -219,7 +231,8 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
219
  embeddings of each paragraph in document store.
220
 
221
  """
222
- document_store = createDocumentStore(documents)
 
223
  retriever = loadRetriever(embedding_model = embedding_model,
224
  embedding_model_format=embedding_model_format,
225
  embedding_layer=embedding_layer,
@@ -227,17 +240,22 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
227
  document_store = document_store)
228
 
229
  document_store.update_embeddings(retriever)
230
- querycheck = QueryCheck()
231
  reader = FARMReader(model_name_or_path=reader_model,
232
  top_k = reader_top_k, use_gpu=True)
233
-
234
  semanticsearch_pipeline = Pipeline()
235
- semanticsearch_pipeline.add_node(component = querycheck, name = "QueryCheck",
236
- inputs = ["Query"])
237
- semanticsearch_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
238
- inputs = ["QueryCheck.output_1"])
239
- semanticsearch_pipeline.add_node(component = reader, name = "FARMReader",
240
- inputs= ["EmbeddingRetriever"])
 
 
 
 
 
 
 
241
 
242
  return semanticsearch_pipeline, document_store
243
 
@@ -281,7 +299,8 @@ def semanticsearchAnnotator(matches: List[List[int]], document):
281
  def semantic_search(query:Text,documents:List[Document],embedding_model:Text,
282
  embedding_model_format:Text,
283
  embedding_layer:int, reader_model:str,
284
- retriever_top_k:int = 10, reader_top_k:int = 10):
 
285
  """
286
  Performs the Semantic search on the List of haystack documents which is
287
  returned by preprocessing Pipeline.
@@ -297,22 +316,24 @@ def semantic_search(query:Text,documents:List[Document],embedding_model:Text,
297
  embedding_layer= embedding_layer,
298
  embedding_model_format= embedding_model_format,
299
  reader_model= reader_model, retriever_top_k= retriever_top_k,
300
- reader_top_k= reader_top_k)
301
 
302
  results = semanticsearch_pipeline.run(query = query)
303
-
304
- if check_streamlit:
305
- st.markdown("##### Top few semantic search results #####")
306
  else:
307
- print("Top few semantic search results")
308
- for i,answer in enumerate(results['answers']):
309
- temp = answer.to_dict()
310
- start_idx = temp['offsets_in_document'][0]['start']
311
- end_idx = temp['offsets_in_document'][0]['end']
312
- match = [[start_idx,end_idx]]
313
- doc = doc_store.get_document_by_id(temp['document_id']).content
314
  if check_streamlit:
315
- st.write("Result {}".format(i+1))
316
  else:
317
- print("Result {}".format(i+1))
318
- semanticsearchAnnotator(match, doc)
 
 
 
 
 
 
 
 
 
 
 
63
  else:
64
  output = {"query": "find all issues related to {}".format(query),
65
  "query_type": 'statements/keyword'}
66
+ logging.info(output)
67
  return output, "output_1"
68
 
69
  def run_batch(self, query):
 
155
  return retriever
156
 
157
  @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
158
+ def createDocumentStore(documents:List[Document], similarity:str = 'dot_product',
159
+ embedding_dim:int = 768):
160
  """
161
  Creates the InMemory Document Store from haystack list of Documents.
162
  It is mandatory component for Retriever to work in Haystack frame work.
 
166
  documents: List of haystack document. If using the preprocessing pipeline,
167
  can be fetched with key = 'documents' on output of preprocessing pipeline.
168
  similarity: scoring function, can be either 'cosine' or 'dot_product'
169
+ embedding_dim: Document store has default value of embedding size = 768, and
170
+ update_embeddings method of Docstore cannot infer the embedding size of
171
+ retriever automatically, therefore set this value as per the model card.
172
 
173
  Return
174
  -------
175
  document_store: InMemory Document Store object type.
176
 
177
  """
178
+ document_store = InMemoryDocumentStore(similarity = similarity,
179
+ embedding_dim = embedding_dim )
180
  document_store.write_documents(documents)
181
 
182
  return document_store
 
184
 
185
  @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
186
  def semanticSearchPipeline(documents:List[Document], embedding_model:Text = None,
187
+ useQueryCheck = True, embedding_model_format:Text = None,
188
  embedding_layer:int = None, retriever_top_k:int = 10,
189
+ reader_model:str = None, reader_top_k:int = 10,
190
+ embedding_dim:int = 768):
191
  """
192
  creates the semantic search pipeline and document Store object from the
193
  list of haystack documents. The top_k for the Reader and Retriever are kept
 
214
  reader_top_k: Reader will use retrieved results to further find better matches.
215
  As purpose here is to use reader to extract context, the value is
216
  same as retriever_top_k.
217
+ useQueryCheck: Whether to use the querycheck which modifies the query or not.
218
+ embedding_dim: Document store has default value of embedding size = 768, and
219
+ update_embeddings method of Docstore cannot infer the embedding size of
220
+ retriever automatically, therefore set this value as per the model card.
221
+
222
 
223
  Return
224
  ---------
 
231
  embeddings of each paragraph in document store.
232
 
233
  """
234
+ document_store = createDocumentStore(documents=documents,
235
+ embedding_dim=embedding_dim)
236
  retriever = loadRetriever(embedding_model = embedding_model,
237
  embedding_model_format=embedding_model_format,
238
  embedding_layer=embedding_layer,
 
240
  document_store = document_store)
241
 
242
  document_store.update_embeddings(retriever)
 
243
  reader = FARMReader(model_name_or_path=reader_model,
244
  top_k = reader_top_k, use_gpu=True)
 
245
  semanticsearch_pipeline = Pipeline()
246
+ if useQueryCheck:
247
+ querycheck = QueryCheck()
248
+ semanticsearch_pipeline.add_node(component = querycheck, name = "QueryCheck",
249
+ inputs = ["Query"])
250
+ semanticsearch_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
251
+ inputs = ["QueryCheck.output_1"])
252
+ semanticsearch_pipeline.add_node(component = reader, name = "FARMReader",
253
+ inputs= ["EmbeddingRetriever"])
254
+ else:
255
+ semanticsearch_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
256
+ inputs = ["Query"])
257
+ semanticsearch_pipeline.add_node(component = reader, name = "FARMReader",
258
+ inputs= ["EmbeddingRetriever"])
259
 
260
  return semanticsearch_pipeline, document_store
261
 
 
299
  def semantic_search(query:Text,documents:List[Document],embedding_model:Text,
300
  embedding_model_format:Text,
301
  embedding_layer:int, reader_model:str,
302
+ retriever_top_k:int = 10, reader_top_k:int = 10,
303
+ return_results:bool = False, embedding_dim:int = 768):
304
  """
305
  Performs the Semantic search on the List of haystack documents which is
306
  returned by preprocessing Pipeline.
 
316
  embedding_layer= embedding_layer,
317
  embedding_model_format= embedding_model_format,
318
  reader_model= reader_model, retriever_top_k= retriever_top_k,
319
+ reader_top_k= reader_top_k, embedding_dim=embedding_dim)
320
 
321
  results = semanticsearch_pipeline.run(query = query)
322
+ if return_results:
323
+ return results
 
324
  else:
 
 
 
 
 
 
 
325
  if check_streamlit:
326
+ st.markdown("##### Top few semantic search results #####")
327
  else:
328
+ print("Top few semantic search results")
329
+ for i,answer in enumerate(results['answers']):
330
+ temp = answer.to_dict()
331
+ start_idx = temp['offsets_in_document'][0]['start']
332
+ end_idx = temp['offsets_in_document'][0]['end']
333
+ match = [[start_idx,end_idx]]
334
+ doc = doc_store.get_document_by_id(temp['document_id']).content
335
+ if check_streamlit:
336
+ st.write("Result {}".format(i+1))
337
+ else:
338
+ print("Result {}".format(i+1))
339
+ semanticsearchAnnotator(match, doc)