import shutil
from typing import List

import streamlit as st
from haystack import Document
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import EmbeddingRetriever, PromptNode
from haystack.pipelines import Pipeline

from app_utils.entailment_checker import EntailmentChecker
from app_utils.config import (
    STATEMENTS_PATH,
    INDEX_DIR,
    RETRIEVER_MODEL,
    RETRIEVER_MODEL_FORMAT,
    NLI_MODEL,
    PROMPT_MODEL,
)


@st.cache()
def load_statements():
    """Load statements from file, skipping comment lines"""
    with open(STATEMENTS_PATH) as fin:
        statements = [
            line.strip() for line in fin.readlines() if not line.startswith("#")
        ]
    return statements


# cached so that the index and models load only once, at startup
@st.cache(
    hash_funcs={"builtins.SwigPyObject": lambda _: None}, allow_output_mutation=True
)
def start_haystack():
    """
    Load the document store and retriever, create the entailment checker,
    and assemble the query pipeline.
    """
    shutil.copy(f"{INDEX_DIR}/faiss_document_store.db", ".")
    document_store = FAISSDocumentStore(
        faiss_index_path=f"{INDEX_DIR}/my_faiss_index.faiss",
        faiss_config_path=f"{INDEX_DIR}/my_faiss_index.json",
    )
    print(f"Index size: {document_store.get_document_count()}")

    retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model=RETRIEVER_MODEL,
        model_format=RETRIEVER_MODEL_FORMAT,
    )
    entailment_checker = EntailmentChecker(
        model_name_or_path=NLI_MODEL,
        use_gpu=False,
        entailment_contradiction_threshold=0.5,
    )

    pipe = Pipeline()
    pipe.add_node(component=retriever, name="retriever", inputs=["Query"])
    pipe.add_node(component=entailment_checker, name="ec", inputs=["retriever"])

    prompt_node = PromptNode(model_name_or_path=PROMPT_MODEL, max_length=150)

    return pipe, prompt_node


pipe, prompt_node = start_haystack()


# The pipeline is not passed as a parameter to the following function
# because it is difficult to cache.
@st.cache(allow_output_mutation=True)
def check_statement(statement: str, retriever_top_k: int = 5):
    """Run the query pipeline to verify a statement"""
    params = {"retriever": {"top_k": retriever_top_k}}
    return pipe.run(query=statement, params=params)


@st.cache(
    hash_funcs={"tokenizers.Tokenizer": lambda _: None}, allow_output_mutation=True
)
def explain_using_llm(
    statement: str, documents: List[Document], entailment_or_contradiction: str
) -> str:
    """Explain the entailment/contradiction verdict by prompting an LLM"""
    premise = " \n".join([doc.content.replace("\n", ". ") for doc in documents])
    if entailment_or_contradiction == "entailment":
        verb = "entails"
    elif entailment_or_contradiction == "contradiction":
        verb = "contradicts"
    else:
        # guard against an undefined `verb` for unexpected input
        raise ValueError(
            "entailment_or_contradiction must be 'entailment' or 'contradiction'"
        )
    prompt = f"Premise: {premise}; Hypothesis: {statement}; Please explain in detail why the Premise {verb} the Hypothesis. Step by step Explanation:"
    print(prompt)
    return prompt_node(prompt)[0]
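

def _example_usage():
    """
    Illustrative sketch only (never called by the app): one way the helpers
    above might be combined in a Streamlit page. "documents" is the standard
    output key of a Haystack pipeline that starts with a retriever; whether
    the custom EntailmentChecker node preserves it, and which extra scores it
    adds, depends on app_utils.entailment_checker, so treat the key access
    below as an assumption.
    """
    statement = st.selectbox("Choose a statement to verify", load_statements())
    results = check_statement(statement, retriever_top_k=5)
    # assumed: the retrieved documents pass through the entailment node unchanged
    docs = results["documents"]
    st.write(explain_using_llm(statement, docs, "entailment"))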