"""Streamlit page: extractive question answering over an uploaded document.

The page lets the user upload a file, builds a Haystack extractive QA
pipeline over it, and shows the reader's answers highlighted inside their
surrounding context.
"""
# set path
import glob
import os
import sys
sys.path.append('../scripts')

# import helper
import scripts.process as pre
import scripts.clean as clean

# import needed libraries
import seaborn as sns
from pandas import DataFrame
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import pandas as pd
from sklearn.feature_extraction import _stop_words
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import FARMReader, TfidfRetriever
import string
from markdown import markdown
from annotated_text import annotation
from tqdm.autonotebook import tqdm
import tempfile
import logging

logger = logging.getLogger(__name__)


# Haystack Components
# SwigPyObject instances are given a constant hash (None) so that Streamlit's
# cache does not try to hash them; allow_output_mutation lets the cached
# pipeline object be reused without Streamlit's mutation check.
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
          allow_output_mutation=True)
def start_haystack(documents_processed):
    """Build an extractive QA pipeline over the given documents.

    Args:
        documents_processed: Documents to index; they are passed unchanged
            to ``InMemoryDocumentStore.write_documents``.

    Returns:
        An ``ExtractiveQAPipeline`` combining a ``TfidfRetriever`` over the
        in-memory store with a ``FARMReader`` loading
        ``deepset/roberta-base-squad2``.
    """
    document_store = InMemoryDocumentStore()
    document_store.write_documents(documents_processed)
    retriever = TfidfRetriever(document_store=document_store)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True)
    return ExtractiveQAPipeline(reader, retriever)


def ask_question(question, pipeline, top_k_retriever=10, top_k_reader=5):
    """Run ``question`` through ``pipeline`` and flatten the answers.

    Args:
        question: Query text handed to ``pipeline.run``.
        pipeline: Object exposing ``run(query=..., params=...)`` whose result
            has an ``"answers"`` list of objects with ``to_dict()``.
        top_k_retriever: ``top_k`` passed to the "Retriever" node.
        top_k_reader: ``top_k`` passed to the "Reader" node.

    Returns:
        A list of dicts, one per answer, each with the keys ``context``,
        ``answer``, ``relevance`` (score as a percentage rounded to two
        decimals) and ``offset_start_in_doc``. When the reader returned an
        empty answer, ``context``, ``answer`` and ``offset_start_in_doc``
        are ``None``.
    """
    prediction = pipeline.run(
        query=question,
        params={
            "Retriever": {"top_k": top_k_retriever},
            "Reader": {"top_k": top_k_reader},
        },
    )

    results = []
    for answer in prediction["answers"]:
        answer = answer.to_dict()
        relevance = round(answer["score"] * 100, 2)
        if answer["answer"]:
            results.append(
                {
                    "context": "..." + answer["context"] + "...",
                    "answer": answer["answer"],
                    "relevance": relevance,
                    "offset_start_in_doc":
                        answer["offsets_in_document"][0]["start"],
                }
            )
        else:
            # Same keys as the branch above so callers can index uniformly.
            results.append(
                {
                    "context": None,
                    "answer": None,
                    "relevance": relevance,
                    "offset_start_in_doc": None,
                }
            )
    return results


def _build_pipeline_from_upload(file):
    """Write the upload to a temp file, load and preprocess it, return the pipeline."""
    with tempfile.NamedTemporaryFile(mode="wb") as temp:
        temp.write(file.getvalue())
        # The loader reopens the file by name, so buffered bytes must be on
        # disk before it runs; without this it can see a truncated file.
        temp.flush()

        st.write("Filename: ", file.name)

        # load document
        # The temp path is passed together with the upload's original name.
        documents = pre.load_document(temp.name, file.name)
        documents_processed = pre.preprocessing(documents)
        return start_haystack(documents_processed)


def _render_results(results):
    """Show each result: the answer highlighted in its context, plus relevance."""
    st.write('## Top Results')
    for result in results:
        answer, context = result["answer"], result["context"]
        if not answer:
            st.info(
                "🤔    Haystack is unsure whether any of the documents contain an answer to your question. Try to reformulate it!"
            )
            continue

        start_idx = context.find(answer)
        if start_idx == -1:
            # Answer text not found verbatim in the context: show the context
            # unhighlighted instead of slicing with a -1 index.
            highlighted = context
        else:
            end_idx = start_idx + len(answer)
            highlighted = (
                context[:start_idx]
                + str(annotation(body=answer, label="ANSWER",
                                 background="#964448", color='#ffffff'))
                + context[end_idx:]
            )
        st.write(markdown(highlighted), unsafe_allow_html=True)
        st.markdown(f"**Relevance:** {result['relevance']}")


def app():
    """Render the Keyword Search page: upload, question input and results."""
    with st.container():
        # NOTE(review): this literal was an unterminated multi-line string.
        # Its text is preserved as-is; unsafe_allow_html=True suggests it
        # originally carried HTML heading markup that has been lost --
        # restore the markup if so.
        st.markdown("\n\nKeyword Search\n\n", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=False):
        st.write(
            " The *Keyword Search* app is an easy-to-use interface built in"
            " Streamlit for doing keyword search in policy document -"
            " developed by GIZ Data and the Sustainable Development"
            " Solution Network. "
        )
        st.markdown("")

    st.markdown("")
    st.markdown("## 📌 Step One: Upload document ")

    with st.container():
        file = st.file_uploader('Upload PDF File', type=['pdf', 'docx', 'txt'])

    if file is None:
        return

    pipeline = _build_pipeline_from_upload(file)

    question = st.text_input(
        "Please enter your question here, we will look for the answer in the document.",
        value="floods",
    )

    if st.button("Find them."):
        with st.spinner("👑 Performing semantic search on"):
            try:
                logger.info("Asked %s", question)
                results = ask_question(question, pipeline)
                _render_results(results)
            except Exception:
                # Top-level UI boundary: record the traceback and tell the
                # user, rather than leaving the page silently empty.
                logger.exception("Question answering failed")
                st.error(
                    "Something went wrong while searching the document. "
                    "Please try again."
                )