"""NDC QA System — Streamlit app for extractive question answering.

Pipeline: documents are fetched, preprocessed, and indexed into Elasticsearch;
queries run through an ensemble of BM25 + dense (embedding) retrieval whose
results are concatenated and passed to a RoBERTa reader.

Environment setup — run these shell commands ONCE before launching the app
(they were originally ``%%bash`` notebook cells, which are not valid Python
and cannot live in a .py file)::

    pip install --upgrade pip
    pip install farm-haystack[colab,elasticsearch,inference,ocr,preprocessing,file-conversion,pdf]
    wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
    tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
    chown -R daemon:daemon elasticsearch-7.9.2
    # run in the background:
    sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch &
"""

import os

import streamlit as st
from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore
from haystack.nodes import (
    BM25Retriever,
    EmbeddingRetriever,
    FARMReader,
    JoinDocuments,
    PreProcessor,
)
from haystack.pipelines import Pipeline
from haystack.utils import convert_files_to_docs, fetch_archive_from_http

# Elasticsearch connection; host is overridable for non-local deployments.
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")
document_store = ElasticsearchDocumentStore(
    host=host, username="", password="", index="document"
)


@st.cache_resource
def processor():
    """Download, preprocess, and index the corpus; build retrievers and reader.

    Cached by Streamlit so the expensive indexing / model loading happens only
    once per server process.

    Returns:
        tuple: (bm25_retriever, embedding_retriever, reader)
    """
    doc_dir = "data/Phase1_test_data"
    url = "https://github.com/dkbs12/External_test/raw/main/Phase1_test_data.zip"
    fetch_archive_from_http(url=url, output_dir=doc_dir)
    got_docs = convert_files_to_docs(dir_path=doc_dir)

    # Split into ~200-word overlapping chunks so the reader sees manageable
    # passages while keeping sentence boundaries intact.
    preprocessor = PreProcessor(
        clean_whitespace=True,
        clean_header_footer=True,
        clean_empty_lines=True,
        split_by="word",
        split_length=200,
        split_overlap=20,
        split_respect_sentence_boundary=True,
    )
    all_docs = preprocessor.process(got_docs)

    # Reset the index before writing so re-runs don't accumulate duplicates.
    document_store.delete_documents()
    document_store.write_documents(all_docs)

    bm25_retriever = BM25Retriever(document_store=document_store)
    embedding_retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
    )
    # Only embed documents that don't already have embeddings.
    document_store.update_embeddings(
        embedding_retriever, update_existing_embeddings=False
    )
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
    return bm25_retriever, embedding_retriever, reader


# Sidebar controls for retriever/reader top_k values.
with st.sidebar:
    BM25_number = st.slider(
        'BM25 Retriever', min_value=1, max_value=15, value=10, step=1
    )
    Embedding_number = st.slider(
        'Embedding Retriever', min_value=1, max_value=15, value=10, step=1
    )
    Response_number = st.slider(
        'Response', min_value=1, max_value=10, value=5, step=1
    )


def pipeline(query):
    """Run the ensemble retrieval + reading pipeline for *query*.

    BM25 and embedding retriever results are concatenated (JoinDocuments)
    and fed to the extractive reader. top_k values come from the sidebar
    sliders.

    Args:
        query: the user's question string.

    Returns:
        dict: haystack pipeline result; ``res['answers']`` holds Answer objects.
    """
    bm25_retriever, embedding_retriever, reader = processor()

    p_ensemble = Pipeline()
    p_ensemble.add_node(
        component=bm25_retriever, name="BM25Retriever", inputs=["Query"]
    )
    p_ensemble.add_node(
        component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"]
    )
    p_ensemble.add_node(
        component=JoinDocuments(join_mode="concatenate"),
        name="JoinResults",
        inputs=["BM25Retriever", "EmbeddingRetriever"],
    )
    p_ensemble.add_node(component=reader, name="Reader", inputs=["JoinResults"])

    res = p_ensemble.run(
        query=query,
        params={
            "EmbeddingRetriever": {"top_k": Embedding_number},
            "BM25Retriever": {"top_k": BM25_number},
            "Reader": {"top_k": Response_number},
        },
    )
    return res


st.title('NDC QA System')
question = st.text_input(
    label='관련 파일에 대한 질문을 입력해 주세요.',
    placeholder='영문으로 입력해 주세요'
)

if __name__ == '__main__':
    if question:
        res = pipeline(question)
        # Render each answer directly: label + value, in the same order and
        # with the same labels as before (answer, score, file name, context).
        # Replaces the previous four parallel lists + zip + four index-testing
        # loops per response with a single pass.
        for i, a in enumerate(res['answers']):
            st.write('-'*6 + ' ' + '#' + str(i + 1) + ' response ' + '-'*6)
            st.write('- answer :', a.answer.replace('\n', ' '))
            st.write('- score :', a.score)
            st.write('- file name :', a.meta['name'])
            st.write('- context :', a.context.replace('\n', ' '))