# set path
import glob
import os
import sys

sys.path.append('../scripts')

# import helper
import scripts.process as pre
import scripts.clean as clean

# import needed libraries
import seaborn as sns
from pandas import DataFrame
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import pandas as pd
from sklearn.feature_extraction import _stop_words
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import FARMReader, TfidfRetriever, EmbeddingRetriever
import string
from markdown import markdown
from annotated_text import annotation
from tqdm.autonotebook import tqdm
import tempfile
import logging

logger = logging.getLogger(__name__)


# Haystack Components
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
          allow_output_mutation=True)
def start_haystack(documents_processed):
    """Build an extractive question-answering pipeline over the given documents.

    The documents are written to a fresh in-memory document store, embedded
    with a sentence-transformers retriever, and paired with a FARM reader.
    The result is memoised by ``st.cache``.

    Args:
        documents_processed: Documents accepted by
            ``InMemoryDocumentStore.write_documents``.

    Returns:
        An ``ExtractiveQAPipeline`` combining the reader and the retriever.
    """
    document_store = InMemoryDocumentStore()
    document_store.write_documents(documents_processed)

    retriever = EmbeddingRetriever(
        document_store=document_store,
        # alternative: "sentence-transformers/multi-qa-mpnet-base-dot-v1"
        embedding_model="peter2000/sdg_sentence_transformer",
        model_format="sentence_transformers",
    )
    # Embeddings must exist in the store before the retriever can be queried.
    document_store.update_embeddings(retriever)

    # alternative reader model: deepset/roberta-base-squad2
    reader = FARMReader(model_name_or_path="deepset/tinyroberta-squad2",
                        use_gpu=True)
    pipeline = ExtractiveQAPipeline(reader, retriever)
    return pipeline


def ask_question(question, pipeline, retriever_top_k=10, reader_top_k=5):
    """Run ``question`` through ``pipeline`` and flatten the answers.

    Args:
        question: The query string passed to ``pipeline.run``.
        pipeline: Object exposing ``run(query=..., params=...)`` whose result
            has an ``"answers"`` list of objects with ``to_dict()``.
        retriever_top_k: ``top_k`` passed to the ``Retriever`` node.
        reader_top_k: ``top_k`` passed to the ``Reader`` node.

    Returns:
        A list with one dict per answer. Every dict has ``context``,
        ``answer`` and ``relevance`` (score * 100, rounded to 2 decimals).
        Dicts for non-empty answers also carry ``offset_start_in_doc``;
        for empty answers ``context`` and ``answer`` are ``None``.
    """
    prediction = pipeline.run(
        query=question,
        params={
            "Retriever": {"top_k": retriever_top_k},
            "Reader": {"top_k": reader_top_k},
        },
    )

    results = []
    for answer in prediction["answers"]:
        answer = answer.to_dict()
        relevance = round(answer["score"] * 100, 2)
        if answer["answer"]:
            results.append(
                {
                    "context": "..." + answer["context"] + "...",
                    "answer": answer["answer"],
                    "relevance": relevance,
                    "offset_start_in_doc":
                        answer["offsets_in_document"][0]["start"],
                }
            )
        else:
            results.append(
                {
                    "context": None,
                    "answer": None,
                    "relevance": relevance,
                }
            )
    return results


def _highlight_answer(context, answer):
    """Return ``context`` rendered to HTML with ``answer`` highlighted.

    If ``answer`` does not occur in ``context`` the context is rendered
    without a highlight instead of being sliced at a bogus position.
    """
    start_idx = context.find(answer)
    if start_idx == -1:
        # str.find signals "not found" with -1; slicing with it would
        # duplicate/truncate the context.
        return markdown(context)
    end_idx = start_idx + len(answer)
    highlighted = str(annotation(body=answer, label="ANSWER",
                                 background="#964448", color='#ffffff'))
    return markdown(context[:start_idx] + highlighted + context[end_idx:])


def app():
    """Render the *Search* page.

    Reads the uploaded document from ``st.session_state['file']``, lets the
    user enter a question, and shows the answers found in the document
    together with their relevance.
    """
    with st.container():
        # NOTE(review): this string looks like it lost its HTML heading
        # markup (unsafe_allow_html is set) - confirm the intended tags.
        st.markdown("\n\nSearch\n\n", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=False):
        st.write(
            " The *Search* app is an easy-to-use interface built in "
            "Streamlit for searching keywords and phrases in policy "
            "document - developed by GIZ Data and the Sustainable "
            "Development Solution Network. "
        )
        st.markdown("")

    with st.container():
        question = st.text_input(
            "Please enter your question here, we will look for the answer in the document.",
            value="Which extreme weather is a particular risk?",
        )

        if st.button("Find them."):
            # .get(): the key is absent until a document has been uploaded.
            file = st.session_state.get('file')
            if file is None:
                st.info("🤔 No document found, please try to upload it at the sidebar!")
                return

            with tempfile.NamedTemporaryFile(mode="wb") as temp:
                temp.write(file.getvalue())
                # The loader reopens the file by name, so buffered bytes
                # must reach the disk first.
                temp.flush()

                # load document
                documents = pre.load_document(temp.name, file.name)
                documents_processed = pre.preprocessing(documents)
                pipeline = start_haystack(documents_processed)

                with st.spinner("👑 Performing semantic search on"):
                    try:
                        logger.info("Asked %s", question)
                        results = ask_question(question, pipeline)

                        st.write('## Top Results')
                        for result in results:
                            if result["answer"]:
                                st.write(
                                    _highlight_answer(result["context"],
                                                      result["answer"]),
                                    unsafe_allow_html=True,
                                )
                                st.markdown(f"**Relevance:** {result['relevance']}")
                            else:
                                st.info(
                                    "🤔    Haystack is unsure whether any of the documents contain an answer to your question. Try to reformulate it!"
                                )
                    except Exception:
                        # UI boundary: keep the page alive, but record the
                        # traceback and tell the user the search failed.
                        logger.exception("Semantic search failed")
                        st.error(
                            "Something went wrong while searching the "
                            "document. Please try again."
                        )