# set path so the utils package is importable
import sys
sys.path.append('../utils')
import streamlit as st
import json
import logging
from utils.lexical_search import runLexicalPreprocessingPipeline, lexical_search
from utils.semantic_search import runSemanticPreprocessingPipeline, semantic_keywordsearch
from utils.checkconfig import getconfig
from utils.streamlitcheck import checkbox_without_preselect
# Read all the necessary parameters from the config file
config = getconfig('paramconfig.cfg')
split_by = config.get('semantic_search', 'SPLIT_BY')
split_length = int(config.get('semantic_search', 'SPLIT_LENGTH'))
split_overlap = int(config.get('semantic_search', 'SPLIT_OVERLAP'))
split_respect_sentence_boundary = bool(int(config.get('semantic_search',
                                    'RESPECT_SENTENCE_BOUNDARY')))
remove_punc = bool(int(config.get('semantic_search', 'REMOVE_PUNC')))
embedding_model = config.get('semantic_search', 'RETRIEVER')
embedding_model_format = config.get('semantic_search', 'RETRIEVER_FORMAT')
embedding_layer = int(config.get('semantic_search', 'RETRIEVER_EMB_LAYER'))
embedding_dim = int(config.get('semantic_search', 'EMBEDDING_DIM'))
max_seq_len = int(config.get('semantic_search', 'MAX_SEQ_LENGTH'))
retriever_top_k = int(config.get('semantic_search', 'RETRIEVER_TOP_K'))
reader_model = config.get('semantic_search', 'READER')
# note: the reader is capped at the same TOP_K as the retriever
reader_top_k = int(config.get('semantic_search', 'RETRIEVER_TOP_K'))
top_k_per_candidate = int(config.get('semantic_search', 'READER_TOP_K_PER_CANDIDATE'))
lexical_split_by = config.get('lexical_search', 'SPLIT_BY')
lexical_split_length = int(config.get('lexical_search', 'SPLIT_LENGTH'))
lexical_split_overlap = int(config.get('lexical_search', 'SPLIT_OVERLAP'))
lexical_remove_punc = bool(int(config.get('lexical_search', 'REMOVE_PUNC')))
lexical_top_k = int(config.get('lexical_search', 'TOP_K'))
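
# For orientation, 'paramconfig.cfg' is expected to look roughly like the
# sketch below. The section and key names match the reads above; the values
# shown are purely illustrative, not the shipped defaults:
#
#   [semantic_search]
#   SPLIT_BY = word
#   SPLIT_LENGTH = 120
#   SPLIT_OVERLAP = 10
#   RESPECT_SENTENCE_BOUNDARY = 1
#   REMOVE_PUNC = 0
#   RETRIEVER = <sentence-transformers model name or path>
#   RETRIEVER_FORMAT = sentence_transformers
#   RETRIEVER_EMB_LAYER = -1
#   EMBEDDING_DIM = 768
#   MAX_SEQ_LENGTH = 512
#   RETRIEVER_TOP_K = 10
#   READER = <extractive QA model name or path>
#   READER_TOP_K_PER_CANDIDATE = 3
#
#   [lexical_search]
#   SPLIT_BY = word
#   SPLIT_LENGTH = 100
#   SPLIT_OVERLAP = 0
#   REMOVE_PUNC = 1
#   TOP_K = 10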
def app():

    with st.container():
        st.markdown("<h1 style='text-align: center; \
                      color: black;'> Search</h1>",
                    unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=False):
        st.write(
            """
            The *Search* app is an interface for doing contextual and \
            keyword searches in policy documents.
            """)
st.write("") | |
st.write(""" The application allows its user to perform a search\ | |
based on two options: a lexical search([TFIDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf))\ | |
and semantic search. [bi-encoder](https://www.sbert.net/examples/applications/retrieve_rerank/README.html)\ | |
The lexical search only \ | |
displays paragraphs in the document with exact matching results, \ | |
the semantic search shows paragraphs with meaningful connections \ | |
(e.g., synonyms) based on the search context. Both \ | |
methods employ a probabilistic retrieval framework in its identification\ | |
of relevant paragraphs. By defualt the search is performed using \ | |
'Semantic Search', and to find 'Exact/Lexical Matches' please tick the \ | |
checkbox provided which will by-pass semantic search. Furthermore,\ | |
the application allows the user to search for pre-defined keywords \ | |
from different thematic buckets present in sidebar.""") | |
st.write("") | |
st.write(""" The Exact Matches gives back top {} findings, and Semantic | |
search provides with top {} answers.""".format(lexical_top_k, retriever_top_k)) | |
st.write("") | |
st.write("") | |
st.markdown("Some runtime metrics tested with cpu: Intel(R) Xeon(R) CPU @ 2.20GHz, memory: 13GB") | |
col1,col2,col3= st.columns([2,4,4]) | |
with col1: | |
st.caption("OCR File processing") | |
# st.markdown('<div style="text-align: center;">50 sec</div>', unsafe_allow_html=True) | |
st.write("50 sec") | |
with col2: | |
st.caption("Lexical Search on 200 paragraphs(~ 35 pages)") | |
# st.markdown('<div style="text-align: center;">12 sec</div>', unsafe_allow_html=True) | |
st.write("15 sec") | |
with col3: | |
st.caption("Semantic search on 200 paragraphs(~ 35 pages)") | |
# st.markdown('<div style="text-align: center;">120 sec</div>', unsafe_allow_html=True) | |
st.write("120 sec(including emebedding creation)") | |
    with st.sidebar:
        with open('docStore/sample/keywordexample.json', 'r') as json_file:
            keywordexample = json.load(json_file)
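        # keywordexample.json is assumed to map a thematic category label to
        # its keywords, roughly like (illustrative shape only, not the
        # shipped content):
        #   {"Climate": "climate, adaptation, mitigation",
        #    "Food": "food security, nutrition"}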
# genre = st.radio("Select Keyword Category", list(keywordexample.keys())) | |
st.caption("Select Keyword Category") | |
genre = checkbox_without_preselect(list(keywordexample.keys())) | |
if genre: | |
keywordList = keywordexample[genre] | |
else: | |
keywordList = None | |
st.markdown("---") | |
    with st.container():
        type_hinting = "Please enter your question here and we will look \
                        for an answer in the document, OR enter the keyword \
                        you are looking for and we will look for similar \
                        context in the document. You can also explore \
                        predefined sets of keywords from the sidebar."
        if keywordList is not None:
            queryList = st.text_input(type_hinting,
                                      value="{}".format(keywordList))
        else:
            queryList = st.text_input(type_hinting,
                                      placeholder="Enter keyword/query here")

        searchtype = st.checkbox("Show only Exact Matches")
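    # NOTE: 'filepath' and 'filename' are assumed to be placed in
    # st.session_state by a separate document-upload page of this app;
    # nothing in this module populates them.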
if st.button("Find them"): | |
if queryList == "": | |
st.info("🤔 No keyword provided, if you dont have any, \ | |
please try example sets from sidebar!") | |
logging.warning("Terminated as no keyword provided") | |
        else:
            if 'filepath' in st.session_state:

                if searchtype:
                    # split the document into paragraphs for exact (TF-IDF) matching
                    all_documents = runLexicalPreprocessingPipeline(
                        file_name=st.session_state['filename'],
                        file_path=st.session_state['filepath'],
                        split_by=lexical_split_by,
                        split_length=lexical_split_length,
                        split_overlap=lexical_split_overlap,
                        remove_punc=lexical_remove_punc)
                    logging.info("performing lexical search")
                    with st.spinner("Performing exact-match (lexical) search for you"):
                        lexical_search(query=queryList,
                                       documents=all_documents['documents'],
                                       top_k=lexical_top_k)
                else:
                    all_documents = runSemanticPreprocessingPipeline(
                        file_path=st.session_state['filepath'],
                        file_name=st.session_state['filename'],
                        split_by=split_by,
                        split_length=split_length,
                        split_overlap=split_overlap,
                        remove_punc=remove_punc,
                        split_respect_sentence_boundary=split_respect_sentence_boundary)
                    if len(all_documents['documents']) > 100:
                        warning_msg = ": this might take some time, please sit back and relax."
                    else:
                        warning_msg = ""
logging.info("starting semantic search") | |
with st.spinner("Performing Similar/Contextual search{}".format(warning_msg)): | |
semantic_keywordsearch(query = queryList, | |
documents = all_documents['documents'], | |
embedding_model=embedding_model, | |
embedding_layer=embedding_layer, | |
embedding_model_format=embedding_model_format, | |
reader_model=reader_model,reader_top_k=reader_top_k, | |
retriever_top_k=retriever_top_k, embedding_dim=embedding_dim, | |
max_seq_len=max_seq_len, | |
top_k_per_candidate = top_k_per_candidate) | |
            else:
                st.info("🤔 No document found, please upload one from the sidebar!")
                logging.warning("Terminated as no document provided")