Spaces:

GIZ
/

SDSN-demo

Running on CPU Upgrade

File size: 7,462 Bytes

cc5c327

# set path
import glob, os, sys
from udfPreprocess.search import semantic_search
sys.path.append('../udfPreprocess')

#import helper
import udfPreprocess.docPreprocessing as pre
import udfPreprocess.cleaning as clean
from udfPreprocess.search import bm25_tokenizer, bm25TokenizeDoc, lexical_search
#import needed libraries
import seaborn as sns
from pandas import DataFrame
from sentence_transformers import SentenceTransformer, CrossEncoder, util
# from keybert import KeyBERT
from transformers import pipeline
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import pandas as pd 
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np
import docx
from docx.shared import Inches
from docx.shared import Pt
from docx.enum.style import WD_STYLE_TYPE 
import logging
logger = logging.getLogger(__name__)
import tempfile
import sqlite3
import json
import configparser


def app():

    with st.container():
        st.markdown("<h1 style='text-align: center;  \
                      color: black;'> Search</h1>", 
                      unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=False):

        st.write(
            """     
            The *Keyword Search* app is an easy-to-use interface \ 
            built in Streamlit for doing keyword search in \
            policy document - developed by GIZ Data and the \
            Sustainable Development Solution Network.
            """)

        st.markdown("")
  
    
                  
    with st.sidebar:
        with open('sample/keywordexample.json','r') as json_file:
            keywordexample = json.load(json_file)
        
        genre = st.radio("Select Keyword Category", list(keywordexample.keys()))
        if genre == 'Food':
            keywordList = keywordexample['Food']
        elif genre == 'Climate':
            keywordList = keywordexample['Climate']
        elif genre == 'Social':
            keywordList = keywordexample['Social']
        elif genre == 'Nature':
            keywordList = keywordexample['Nature']
        elif genre == 'Implementation':
            keywordList = keywordexample['Implementation']
        else:
            keywordList = None
        
        searchtype = st.selectbox("Do you want to find exact macthes or similar meaning/context", ['Exact Matches', 'Similar context/meaning'])

        
    with st.container():
        if keywordList is not None:
            queryList = st.text_input("You selcted the {} category we will look for these keywords in document".format(genre),
                                    value="{}".format(keywordList))
        else:
            queryList = st.text_input("Please enter here your question and we will look \
                                     for an answer in the document OR enter the keyword you \
                                     are looking for and we will \
                                     we will look for similar context \
                                     in the document.",
                                    placeholder="Enter keyword here")

        if st.button("Find them"):

            if queryList == "":
                st.info("🤔 No keyword provided, if you dont have any, please try example sets from sidebar!")
                logging.warning("Terminated as no keyword provided")
            else:
                
                if 'docs' in st.session_state:
                    docs = st.session_state['docs']
                    paraList = st.session_state['paraList']
                   
                    if searchtype == 'Exact Matches':
                        queryList = list(queryList.split(","))
                        logging.info("performing lexical search")
                        tokenized_corpus = bm25TokenizeDoc(paraList)
                        # st.write(len(tokenized_corpus))
                        document_bm25 = BM25Okapi(tokenized_corpus)

                        with st.spinner("Performing Exact matching search (Lexical search) for you"):
                            st.markdown("##### Top few lexical search (BM25) hits #####")

                            for keyword in queryList:

                                bm25_hits = lexical_search(keyword,document_bm25)
                              
                                
                                counter = 0
                                for hit in bm25_hits:
                                    if hit['score'] > 0.00:
                                        counter += 1
                                        if counter == 1:
                                            st.markdown("###### Results for keyword: **{}** ######".format(keyword))
                                        # st.write("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
                                        st.write("\t {}: {}\t".format(counter, paraList[hit['corpus_id']].replace("\n", " ")))
                                        
                                
                                if counter == 0:
                                    st.write("No results found for '**{}**' ".format(keyword))
                                
                                st.markdown("---")
                    else:
                        logging.info("starting semantic search")
                        with st.spinner("Performing Similar/Contextual search"):
                            query = "Find {} related issues ?".format(queryList)
                            config = configparser.ConfigParser()
                            config.read_file(open('udfPreprocess/paramconfig.cfg'))
                            threshold = float(config.get('semantic_search','THRESHOLD'))
                            # st.write(query)                          
                            semantic_hits = semantic_search(query,paraList)
                            st.markdown("##### Few Semantic search hits for {} related topics #####".format(queryList))

                            for i,queryhit in enumerate(semantic_hits):

                                # st.markdown("###### Results for query: **{}** ######".format(queryList[i]))
                                counter = 0
                                for hit in queryhit:
                                    counter += 1
                                    
                                                
                                    if hit['score'] > threshold:
                                    # st.write("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
                                        st.write("\t {}: \t {}".format(counter, paraList[hit['corpus_id']].replace("\n", " ")))

                                    # document.add_paragraph("\t Score: {:.3f}:  \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
                                st.markdown("---")
                            # st.write(semantic_hits)

                          


                else:
                    st.info("🤔 No document found, please try to upload it at the sidebar!")
                    logging.warning("Terminated as no keyword provided")