Spaces:

Abhilashvj
/

haystack_QA

Runtime error

haystack_QA / app.py

Abhilash V J

Added file upload option

bd5eb62 over 1 year ago

8.9 kB

	import os
	import sys
	import logging
	from pathlib import Path
	from json import JSONDecodeError
	import pandas as pd
	import streamlit as st
	from annotated_text import annotation
	from markdown import markdown
	import json
	from haystack import Document
	import pandas as pd
	from haystack.document_stores import PineconeDocumentStore
	from haystack.nodes import EmbeddingRetriever, FARMReader
	from haystack.pipelines import ExtractiveQAPipeline
	import shutil
	import uuid
	from pathlib import Path
	from haystack.pipelines import Pipeline
	from haystack.nodes import TextConverter, PreProcessor, FileTypeClassifier, PDFToTextConverter, DocxToTextConverter


	# Cleaning/splitting rules applied to converted documents: split by word into
	# chunks of length 100 while respecting sentence boundaries.
	_PREPROCESSOR_SETTINGS = {
	    "clean_empty_lines": True,
	    "clean_whitespace": True,
	    "clean_header_footer": False,
	    "split_by": "word",
	    "split_length": 100,
	    "split_respect_sentence_boundary": True,
	}
	preprocessor = PreProcessor(**_PREPROCESSOR_SETTINGS)

	# Routing node plus one converter per upload format handled by the app.
	file_type_classifier = FileTypeClassifier()
	text_converter = TextConverter()
	pdf_converter = PDFToTextConverter()
	docx_converter = DocxToTextConverter()


	# Directory that receives uploaded files; created at import time if missing.
	FILE_UPLOAD_PATH = "./data/uploads/"
	Path(FILE_UPLOAD_PATH).mkdir(parents=True, exist_ok=True)
	# @st.cache
	def create_doc_store():
	    """Build the Pinecone-backed document store used by the app.

	    The API key is read from the Streamlit secret ``pinecone_apikey``. The
	    store is configured for the ``qa_demo`` index, cosine similarity and
	    768-dimensional embeddings.

	    Returns:
	        The newly constructed ``PineconeDocumentStore``.
	    """
	    return PineconeDocumentStore(
	        api_key=st.secrets["pinecone_apikey"],
	        index="qa_demo",
	        similarity="cosine",
	        embedding_dim=768,
	    )

	# @st.cache
	# def create_pipe(document_store):
	# retriever = EmbeddingRetriever(
	# document_store=document_store,
	# embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
	# model_format="sentence_transformers",
	# )
	# reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
	# pipe = ExtractiveQAPipeline(reader, retriever)
	# return pipe

	def query(pipe, question, top_k_reader, top_k_retriever):
	    """Run *question* through *pipe* and return the raw pipeline result.

	    Args:
	        pipe: Object exposing ``run(query=..., params=...)``.
	        question: The question text passed to the pipeline as ``query``.
	        top_k_reader: Value forwarded as the ``Reader`` node's ``top_k``.
	        top_k_retriever: Value forwarded as the ``Retriever`` node's ``top_k``.

	    Returns:
	        Whatever ``pipe.run`` returns, unchanged.
	    """
	    params = {
	        "Retriever": {"top_k": top_k_retriever},
	        "Reader": {"top_k": top_k_reader},
	    }
	    return pipe.run(query=question, params=params)

	# Shared document store: queried by the retriever and used as the final
	# node of the indexing pipeline below.
	document_store = create_doc_store()

	# Question-answering pipeline: embedding retriever feeding an extractive reader.
	retriever = EmbeddingRetriever(
	    document_store=document_store,
	    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
	    model_format="sentence_transformers",
	)
	reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
	pipe = ExtractiveQAPipeline(reader, retriever)

	# Indexing pipeline, declared as (component, node name, inputs) in wiring
	# order: classifier outputs 1, 2 and 4 feed the text, PDF and DOCX
	# converters, which all feed the preprocessor and then the document store.
	_INDEXING_NODES = (
	    (file_type_classifier, "FileTypeClassifier", ["File"]),
	    (text_converter, "TextConverter", ["FileTypeClassifier.output_1"]),
	    (pdf_converter, "PdfConverter", ["FileTypeClassifier.output_2"]),
	    (docx_converter, "DocxConverter", ["FileTypeClassifier.output_4"]),
	    (preprocessor, "Preprocessor", ["TextConverter", "PdfConverter", "DocxConverter"]),
	    (document_store, "DocumentStore", ["Preprocessor"]),
	)
	indexing_pipeline_with_classification = Pipeline()
	for _component, _node_name, _node_inputs in _INDEXING_NODES:
	    indexing_pipeline_with_classification.add_node(
	        component=_component, name=_node_name, inputs=_node_inputs
	    )

	def set_state_if_absent(key, value):
	    """Store *value* under *key* in the Streamlit session state unless *key* is already set."""
	    if key in st.session_state:
	        return
	    st.session_state[key] = value

	# Question and answer used to seed the UI state on first load; both can be
	# overridden through environment variables of the same name.
	DEFAULT_QUESTION_AT_STARTUP = os.environ.get(
	    "DEFAULT_QUESTION_AT_STARTUP",
	    "My blog post discusses remote work. Give me statistics.",
	)
	DEFAULT_ANSWER_AT_STARTUP = os.environ.get(
	    "DEFAULT_ANSWER_AT_STARTUP",
	    "7% more remote workers have been at their current organization for 5 years or fewer",
	)

	# Initial slider positions, also overridable through the environment.
	DEFAULT_DOCS_FROM_RETRIEVER = int(os.environ.get("DEFAULT_DOCS_FROM_RETRIEVER", "3"))
	DEFAULT_NUMBER_OF_ANSWERS = int(os.environ.get("DEFAULT_NUMBER_OF_ANSWERS", "3"))


	st.set_page_config(
	    page_title="Haystack Demo",
	    page_icon="https://haystack.deepset.ai/img/HaystackIcon.png",
	)

	# Seed the session state; keys that already hold a value are left alone.
	for _state_key, _state_default in (
	    ("question", DEFAULT_QUESTION_AT_STARTUP),
	    ("answer", DEFAULT_ANSWER_AT_STARTUP),
	    ("results", None),
	):
	    set_state_if_absent(_state_key, _state_default)


	# Small callback to reset the interface in case the text of the question changes
	def reset_results(*args):
	    """Clear the stored answer, results and raw JSON from the session state.

	    Any positional arguments are accepted and ignored, so the function can
	    be passed as a widget ``on_change`` callback or called directly.
	    """
	    for stale_key in ("answer", "results", "raw_json"):
	        st.session_state[stale_key] = None

	# Title
	# Page heading, followed by an introductory blurb rendered through
	# st.markdown with unsafe_allow_html enabled.
	st.write("# Haystack Search Demo")
	st.markdown(
	"""
	This demo takes its data from two sample data csv with statistics on various topics. \n
	Ask any question on this topic and see if Haystack can find the correct answer to your query! \n
	Note: do not use keywords, but full-fledged questions. The demo is not optimized to deal with keyword queries and might misunderstand you.
	""",
	unsafe_allow_html=True,
	)

	# Sidebar
	st.sidebar.header("Options")
	st.sidebar.write("## File Upload:")
	data_files = st.sidebar.file_uploader(
	    "upload", type=["pdf", "txt", "docx"], accept_multiple_files=True, label_visibility="hidden"
	)

	# Persist every uploaded file to disk so the indexing pipeline can read it by path.
	ALL_FILES = []
	for data_file in data_files or []:
	    if not data_file:
	        continue
	    # Keep only the final path component of the client-supplied name so the
	    # target cannot leave FILE_UPLOAD_PATH; the uuid prefix avoids clashes
	    # between uploads that share a name.
	    safe_name = Path(data_file.name).name
	    file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{safe_name}"
	    # Streamlit's UploadedFile is itself file-like and has no ``.file``
	    # attribute, so copy straight from the object.
	    with file_path.open("wb") as buffer:
	        shutil.copyfileobj(data_file, buffer)
	    ALL_FILES.append(file_path)
	    st.sidebar.write(str(data_file.name) + "    ✅ ")

	# Index once, after all uploads are on disk, and only when there is something
	# to index; then compute embeddings for documents that do not have one yet.
	# NOTE(review): Streamlit reruns the script on every interaction, so files
	# still held by the uploader are saved and indexed again on each rerun.
	if ALL_FILES:
	    indexing_pipeline_with_classification.run(file_paths=ALL_FILES)
	    document_store.update_embeddings(retriever, update_existing_embeddings=False)

	# Both sliders share the same 1-10 integer range and clear stale results
	# whenever their value changes.
	_SLIDER_OPTIONS = dict(min_value=1, max_value=10, step=1, on_change=reset_results)
	top_k_reader = st.sidebar.slider(
	    "Max. number of answers",
	    value=DEFAULT_NUMBER_OF_ANSWERS,
	    **_SLIDER_OPTIONS,
	)
	top_k_retriever = st.sidebar.slider(
	    "Max. number of documents from retriever",
	    value=DEFAULT_DOCS_FROM_RETRIEVER,
	    **_SLIDER_OPTIONS,
	)
	# data_files = st.file_uploader(
	# "upload", type=["csv"], accept_multiple_files=True, label_visibility="hidden"
	# )
	# for data_file in data_files:
	# # Upload file
	# if data_file:
	# raw_json = upload_doc(data_file)

	# Question box pre-filled from the session state; editing it clears any
	# previously stored results.
	question = st.text_input(
	    label="question",
	    label_visibility="hidden",
	    value=st.session_state.question,
	    max_chars=100,
	    on_change=reset_results,
	)

	# Two columns; the same CSS snippet (buttons at 100% width) is injected into each.
	_FULL_WIDTH_BUTTON_CSS = "<style>.stButton button {width:100%;}</style>"
	col1, col2 = st.columns(2)
	for _column in (col1, col2):
	    _column.markdown(_FULL_WIDTH_BUTTON_CSS, unsafe_allow_html=True)

	# Run button
	run_pressed = col1.button("Run")

	# Query only on an explicit click with a non-empty question. The former
	# ``run_query`` flag was computed inside ``if run_pressed:`` and was
	# therefore always true, so the check reduces to this condition.
	if run_pressed and question:
	    # Drop stale output and remember the question that is being answered.
	    reset_results()
	    st.session_state.question = question

	    with st.spinner(
	        "🧠    Performing neural search on documents... \n "
	    ):
	        try:
	            st.session_state.results = query(
	                pipe, question, top_k_reader=top_k_reader, top_k_retriever=top_k_retriever
	            )
	        except JSONDecodeError:
	            st.error("👓    An error occurred reading the results. Is the document store working?")
	        except Exception as e:
	            # Top-level UI boundary: log the traceback, then show a friendly
	            # message instead of crashing the page.
	            logging.exception(e)
	            error_text = str(e)
	            if "The server is busy processing requests" in error_text or "503" in error_text:
	                st.error("🧑‍🌾    All our workers are busy! Try again later.")
	            else:
	                st.error(f"🐞    An error occurred during the request. {str(e)}")


	# Render the stored answers, each with its source and the answer span
	# highlighted inside its context.
	if st.session_state.results:

	    st.write("## Results:")

	    for result in st.session_state.results['answers']:
	        answer = result.answer
	        if not answer:
	            # Nothing to highlight for an empty or missing answer.
	            continue
	        context = result.context or ""

	        start_idx = context.find(answer)
	        if start_idx < 0:
	            # The answer text is not present verbatim in the context; slicing
	            # with -1 would drop and duplicate characters, so show the whole
	            # context followed by the highlighted answer instead.
	            before, after = context, ""
	        else:
	            before = context[:start_idx]
	            after = context[start_idx + len(answer):]

	        # Documents may lack the 'Title'/'link' metadata (e.g. uploaded
	        # files), so fall back instead of raising KeyError.
	        # NOTE(review): 'name' is assumed to be a plausible fallback key; confirm.
	        meta = result.meta or {}
	        title = meta.get("Title") or meta.get("name") or "Unknown source"
	        link = meta.get("link")
	        source = f"[{title}]({link})" if link else str(title)

	        # Hack due to this bug: https://github.com/streamlit/streamlit/issues/3190
	        st.write(
	            markdown(f'Source: {source} \n {before} {str(annotation(answer, "ANSWER", "#8ef"))} {after} \n '),
	            unsafe_allow_html=True,
	        )