Spaces:
Build error
Build error
%%bash
# Environment setup cell: upgrade pip, install Haystack with the extras this
# app uses, then download and unpack a local Elasticsearch 7.9.2.
pip install --upgrade pip
# The requirement is quoted so the shell can never interpret the [...] extras
# list as a glob pattern and expand it against files in the working directory.
pip install 'farm-haystack[colab,elasticsearch,inference,ocr,preprocessing,file-conversion,pdf]'
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
# The server is launched as the `daemon` user in the next cell, so that user
# must own the unpacked tree.
chown -R daemon:daemon elasticsearch-7.9.2
%%bash --bg
# Background cell: start Elasticsearch as the unprivileged `daemon` user.
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch
import os | |
from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore | |
from haystack.utils import fetch_archive_from_http, convert_files_to_docs | |
from haystack.nodes import PreProcessor | |
from haystack.nodes import BM25Retriever, EmbeddingRetriever, FARMReader | |
from haystack.pipelines import Pipeline | |
from haystack.nodes import JoinDocuments | |
import streamlit as st | |
# Elasticsearch host comes from the ELASTICSEARCH_HOST environment variable
# and falls back to "localhost" when it is not set.
host = os.getenv("ELASTICSEARCH_HOST", "localhost")

# Single module-level document store shared by indexing and retrieval; it
# connects without credentials and uses the "document" index.
document_store = ElasticsearchDocumentStore(
    host=host,
    username="",
    password="",
    index="document",
)
def _cache_across_reruns(func):
    """Wrap *func* in Streamlit's resource cache when the installed version has one.

    Looks for ``st.cache_resource`` first and ``st.experimental_singleton``
    second. When neither attribute exists, *func* is returned unchanged, so
    the app still works (just without caching).
    """
    cache_decorator = getattr(st, "cache_resource", None) or getattr(
        st, "experimental_singleton", None
    )
    return cache_decorator(func) if cache_decorator is not None else func


@_cache_across_reruns
def processor():
    """Index the test corpus and build the retrievers and the reader.

    Steps, in order:
      1. Fetch the Phase1 test-data archive into ``data/Phase1_test_data``.
      2. Convert the files in that directory to Haystack documents.
      3. Clean them and split them into 200-word passages with a 20-word
         overlap, respecting sentence boundaries.
      4. Replace the contents of the module-level ``document_store`` with the
         processed passages.
      5. Create a BM25 retriever and an embedding retriever on the store and
         compute embeddings for documents that do not have one yet.
      6. Load the extractive reader model.

    The result is cached because ``pipeline()`` calls this function for every
    question and Streamlit re-executes the script on every interaction;
    without the cache the whole corpus would be re-indexed, re-embedded and
    the reader model reloaded per query.
    NOTE(review): cached resources are presumably shared by all sessions of
    the app -- confirm that is acceptable for the reader and retrievers.

    Returns:
        A ``(bm25_retriever, embedding_retriever, reader)`` tuple.
    """
    doc_dir = "data/Phase1_test_data"
    url = "https://github.com/dkbs12/External_test/raw/main/Phase1_test_data.zip"
    fetch_archive_from_http(url=url, output_dir=doc_dir)
    got_docs = convert_files_to_docs(dir_path=doc_dir)

    preprocessor = PreProcessor(
        clean_whitespace=True,
        clean_header_footer=True,
        clean_empty_lines=True,
        split_by="word",
        split_length=200,
        split_overlap=20,
        split_respect_sentence_boundary=True,
    )
    all_docs = preprocessor.process(got_docs)

    # Start from an empty index so repeated indexing never duplicates passages.
    document_store.delete_documents()
    document_store.write_documents(all_docs)

    bm25_retriever = BM25Retriever(document_store=document_store)
    embedding_retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
    )
    # Only documents without an embedding are processed.
    document_store.update_embeddings(embedding_retriever, update_existing_embeddings=False)

    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
    return bm25_retriever, embedding_retriever, reader
# Sidebar controls: how many documents each retriever returns and how many
# answers the reader returns. The three names are read by pipeline().
_retriever_slider = dict(min_value=1, max_value=15, step=1)
with st.sidebar:
    BM25_number = st.slider("BM25 Retriever", value=10, **_retriever_slider)
    Embedding_number = st.slider("Embedding Retriever", value=10, **_retriever_slider)
    Response_number = st.slider("Response", min_value=1, max_value=10, value=5, step=1)
def pipeline(query):
    """Answer *query* with a BM25 + embedding retriever ensemble and a reader.

    Both retrievers receive the query, their results are concatenated by a
    ``JoinDocuments`` node, and the reader runs on the joined documents. The
    per-node ``top_k`` values are taken from the sidebar sliders.

    Args:
        query: The question text to run through the pipeline.

    Returns:
        Whatever ``Pipeline.run`` returns for the query.
    """
    sparse_retriever, dense_retriever, answer_reader = processor()

    ensemble = Pipeline()
    ensemble.add_node(component=sparse_retriever, name="BM25Retriever", inputs=["Query"])
    ensemble.add_node(component=dense_retriever, name="EmbeddingRetriever", inputs=["Query"])

    # Merge the two retrievers' result lists into a single list for the reader.
    joiner = JoinDocuments(join_mode="concatenate")
    ensemble.add_node(
        component=joiner,
        name="JoinResults",
        inputs=["BM25Retriever", "EmbeddingRetriever"],
    )
    ensemble.add_node(component=answer_reader, name="Reader", inputs=["JoinResults"])

    run_params = {
        "EmbeddingRetriever": {"top_k": Embedding_number},
        "BM25Retriever": {"top_k": BM25_number},
        "Reader": {"top_k": Response_number},
    }
    return ensemble.run(query=query, params=run_params)
# Page title and the free-text question box. The Korean UI strings were
# mojibake (UTF-8 bytes decoded with a Thai code page); they are restored here.
st.title('NDC QA System')
question = st.text_input(
    # "Please enter a question about the related files."
    label='관련 파일에 대한 질문을 입력해 주세요.',
    # "Please enter it in English."
    placeholder='영문으로 입력해 주세요',
)
if __name__ == '__main__':
    query = question
    # Nothing is run or rendered until the user has typed a question.
    if query:
        res = pipeline(query)

        # One (answer, score, file name, context) tuple per returned answer;
        # newlines in the answer and context are replaced with spaces.
        total = [
            (
                found.answer.replace('\n', ' '),
                found.score,
                found.meta['name'],
                found.context.replace('\n', ' '),
            )
            for found in res['answers']
        ]

        # Labels line up positionally with the fields of each tuple above.
        field_labels = ('- answer :', '- score :', '- file name :', '- context :')
        rule = '-' * 6
        for rank, fields in enumerate(total, start=1):
            st.write(rule + ' ' + '#' + str(rank) + ' response ' + rule)
            for field_label, value in zip(field_labels, fields):
                st.write(field_label, value)