Extractive-QA / app.py
dkbs12's picture
Update app.py
7695d4e
raw
history blame
4.36 kB
%%bash
# Notebook setup cell: upgrade pip, install Haystack with its extras, then
# download and unpack an Elasticsearch 7.9.2 tarball into the working directory.
# NOTE(review): `%%bash` is an IPython cell magic, not Python syntax -- these
# lines look like notebook residue and presumably must not stay in a plain app.py; confirm.
pip install --upgrade pip
pip install farm-haystack[colab,elasticsearch,inference,ocr,preprocessing,file-conversion,pdf]
# -q keeps wget quiet while fetching the archive.
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
# Give the unpacked tree to the `daemon` user, which is the account the next cell runs the server as.
chown -R daemon:daemon elasticsearch-7.9.2
%%bash --bg
# Second cell: start Elasticsearch as `daemon`; `--bg` asks IPython to run the cell in the background.
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch
import os
from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore
from haystack.utils import fetch_archive_from_http, convert_files_to_docs
from haystack.nodes import PreProcessor
from haystack.nodes import BM25Retriever, EmbeddingRetriever, FARMReader
from haystack.pipelines import Pipeline
from haystack.nodes import JoinDocuments
import streamlit as st
# Elasticsearch host is taken from the environment, falling back to localhost.
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")

# Module-wide document store on the "document" index; credentials are passed as empty strings.
document_store = ElasticsearchDocumentStore(
    host=host,
    username="",
    password="",
    index="document",
)
@st.cache_resource
def processor():
    """Index the test corpus and build the retrieval/reading components.

    Downloads the Phase1 test archive, converts the files to documents,
    splits them into overlapping word chunks, replaces the contents of the
    module-level ``document_store`` with those chunks, and computes
    embeddings for them. The function is wrapped in ``st.cache_resource``.

    Returns:
        A ``(bm25_retriever, embedding_retriever, reader)`` tuple.
    """
    archive_url = "https://github.com/dkbs12/External_test/raw/main/Phase1_test_data.zip"
    data_dir = "data/Phase1_test_data"
    fetch_archive_from_http(url=archive_url, output_dir=data_dir)
    raw_docs = convert_files_to_docs(dir_path=data_dir)

    # Chunking settings: 200-word pieces with a 20-word overlap, cut on sentence boundaries.
    splitter_options = {
        "clean_whitespace": True,
        "clean_header_footer": True,
        "clean_empty_lines": True,
        "split_by": "word",
        "split_length": 200,
        "split_overlap": 20,
        "split_respect_sentence_boundary": True,
    }
    chunks = PreProcessor(**splitter_options).process(raw_docs)

    # Clear the index before writing so it holds only this run's chunks.
    document_store.delete_documents()
    document_store.write_documents(chunks)

    sparse_retriever = BM25Retriever(document_store=document_store)
    dense_retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
    )
    document_store.update_embeddings(dense_retriever, update_existing_embeddings=False)

    qa_reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
    return sparse_retriever, dense_retriever, qa_reader
# Sidebar sliders: top_k for each retriever (1-15, default 10) and for the reader (1-10, default 5).
with st.sidebar:
    BM25_number = st.slider(
        label='BM25 Retriever', min_value=1, max_value=15, value=10, step=1
    )
    Embedding_number = st.slider(
        label='Embedding Retriever', min_value=1, max_value=15, value=10, step=1
    )
    Response_number = st.slider(
        label='Response', min_value=1, max_value=10, value=5, step=1
    )
def pipeline(query):
    """Answer ``query`` with a BM25 + embedding ensemble followed by the reader.

    Both retrievers receive the query, their results are concatenated by a
    ``JoinDocuments`` node, and the reader runs on the joined documents.
    Each node's ``top_k`` comes from the module-level slider values.

    Args:
        query: The question text to run through the pipeline.

    Returns:
        Whatever ``Pipeline.run`` returns for this query.
    """
    sparse_retriever, dense_retriever, qa_reader = processor()
    merger = JoinDocuments(join_mode="concatenate")

    ensemble = Pipeline()
    ensemble.add_node(component=sparse_retriever, name="BM25Retriever", inputs=["Query"])
    ensemble.add_node(component=dense_retriever, name="EmbeddingRetriever", inputs=["Query"])
    ensemble.add_node(
        component=merger,
        name="JoinResults",
        inputs=["BM25Retriever", "EmbeddingRetriever"],
    )
    ensemble.add_node(component=qa_reader, name="Reader", inputs=["JoinResults"])

    # Per-node top_k values, keyed by the node names registered above.
    run_params = {
        "EmbeddingRetriever": {"top_k": Embedding_number},
        "BM25Retriever": {"top_k": BM25_number},
        "Reader": {"top_k": Response_number},
    }
    return ensemble.run(query=query, params=run_params)
st.title('NDC QA System')
# The UI text is Korean. The label asks the user to enter a question about the
# related files; the placeholder asks for the question to be typed in English.
question = st.text_input(
    label='관련 파일에 대한 질문을 입력해 주세요.',
    placeholder='영문으로 입력해 주세요'
)
def summarize_answers(found):
    """Flatten reader answers into display-ready rows.

    Args:
        found: Iterable of answer objects exposing ``answer``, ``score``,
            ``meta`` and ``context`` attributes; ``None`` is treated as empty.

    Returns:
        A list of ``(answer, score, file_name, context)`` tuples, one per
        answer in input order. Newlines in the answer and context text are
        replaced by spaces. A missing answer/context becomes ``''`` and a
        missing ``meta`` or ``meta['name']`` becomes ``'unknown'`` instead of
        raising.
    """
    rows = []
    for hit in found or []:
        meta = getattr(hit, 'meta', None) or {}
        answer_text = (hit.answer or '').replace('\n', ' ')
        context_text = (getattr(hit, 'context', None) or '').replace('\n', ' ')
        rows.append((answer_text, hit.score, meta.get('name', 'unknown'), context_text))
    return rows


if __name__ == '__main__':
    query = question
    # Nothing is rendered until the user has typed a question.
    if question:
        res = pipeline(query)
        # Labels are listed in the same order as the fields of each row.
        labels = ('- answer :', '- score :', '- file name :', '- context :')
        for rank, row in enumerate(summarize_answers(res['answers']), start=1):
            st.write('-' * 6 + ' ' + '#' + str(rank) + ' response ' + '-' * 6)
            for label, value in zip(labels, row):
                st.write(label, value)