GPT-knowledge-management

Runtime error

App Files Files Community

GPT-knowledge-management / app.py

Abhilashvj

Duplicate from Abhilashvj/haystack_QA

37468fe about 2 years ago

raw

history blame

10.8 kB

	import json
	import logging
	import os
	import shutil
	import sys
	import uuid
	from json import JSONDecodeError
	from pathlib import Path

	import pandas as pd
	import pinecone
	import streamlit as st
	from annotated_text import annotation
	from haystack import Document
	from haystack.document_stores import PineconeDocumentStore
	from haystack.nodes import (
	DocxToTextConverter,
	EmbeddingRetriever,
	FARMReader,
	FileTypeClassifier,
	PDFToTextConverter,
	PreProcessor,
	TextConverter,
	)
	from haystack.pipelines import ExtractiveQAPipeline, Pipeline
	from markdown import markdown
	from sentence_transformers import SentenceTransformer

	# Name of the Pinecone index used by the whole app. It is assigned exactly once:
	# the earlier "qa_demo" value was overwritten before anything read it, and
	# Pinecone index names are restricted to lowercase alphanumerics and hyphens.
	index_name = "qa-demo"

	# Connect to Pinecone; the API key is read from Streamlit secrets.
	pinecone.init(
	    api_key=st.secrets["pinecone_apikey"],
	    # environment="us-west1-gcp"
	)

	# One classifier to detect the file type and one converter per upload format.
	file_type_classifier = FileTypeClassifier()
	text_converter = TextConverter()
	pdf_converter = PDFToTextConverter()
	docx_converter = DocxToTextConverter()

	# Cleans converted text and splits it by word into passages of length 100,
	# respecting sentence boundaries; header/footer cleaning is switched off.
	preprocessor = PreProcessor(
	    split_by="word",
	    split_length=100,
	    split_respect_sentence_boundary=True,
	    clean_empty_lines=True,
	    clean_whitespace=True,
	    clean_header_footer=False,
	)

	# Create the index named by `index_name` (768-dim vectors, cosine metric)
	# only when Pinecone does not list it yet.
	if index_name not in pinecone.list_indexes():
	    pinecone.create_index(index_name, dimension=768, metric="cosine")

	# Handle on that index, used for direct vector upserts.
	index = pinecone.Index(index_name)

	# Directory that receives uploaded files; created (with parents) if missing.
	FILE_UPLOAD_PATH= "./data/uploads/"
	Path(FILE_UPLOAD_PATH).mkdir(parents=True, exist_ok=True)
	# @st.cache
	def create_doc_store():
	    """Build the Pinecone-backed Haystack document store.

	    The API key comes from Streamlit secrets and the store targets the
	    module-level ``index_name`` with cosine similarity and 768-dimensional
	    embeddings.

	    Returns:
	        The configured ``PineconeDocumentStore`` instance.
	    """
	    store_options = {
	        "api_key": st.secrets["pinecone_apikey"],
	        "index": index_name,
	        "similarity": "cosine",
	        "embedding_dim": 768,
	    }
	    return PineconeDocumentStore(**store_options)

	# @st.cache
	# def create_pipe(document_store):
	# retriever = EmbeddingRetriever(
	# document_store=document_store,
	# embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
	# model_format="sentence_transformers",
	# )
	# reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
	# pipe = ExtractiveQAPipeline(reader, retriever)
	# return pipe

	def query(pipe, question, top_k_reader, top_k_retriever):
	    """Run a question through the QA pipeline and return its raw result.

	    Args:
	        pipe: Object exposing ``run(query=..., params=...)``.
	        question: The question text passed as ``query``.
	        top_k_reader: ``top_k`` value forwarded to the "Reader" node.
	        top_k_retriever: ``top_k`` value forwarded to the "Retriever" node.

	    Returns:
	        Whatever ``pipe.run`` returns, unmodified.
	    """
	    params = {
	        "Retriever": {"top_k": top_k_retriever},
	        "Reader": {"top_k": top_k_reader},
	    }
	    return pipe.run(query=question, params=params)

	# Shared document store for retrieval.
	document_store = create_doc_store()
	# pipe = create_pipe(document_store)

	# The same sentence-transformers model backs both the Haystack retriever and
	# the standalone encoder used when embedding uploaded documents.
	retriever_model = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
	sentence_encoder = SentenceTransformer(retriever_model)
	retriever = EmbeddingRetriever(
	    document_store=document_store,
	    embedding_model=retriever_model,
	    model_format="sentence_transformers",
	)

	# Extractive QA: the retriever feeds the reader, which runs without GPU.
	reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
	pipe = ExtractiveQAPipeline(reader, retriever)


	# Indexing pipeline: the file-type classifier routes each file to a converter
	# (outputs 1, 2 and 4 are wired to the text, PDF and DOCX converters), and all
	# converters feed the preprocessor. Nodes are added in the order listed.
	indexing_pipeline_with_classification = Pipeline()
	for _component, _name, _inputs in (
	    (file_type_classifier, "FileTypeClassifier", ["File"]),
	    (text_converter, "TextConverter", ["FileTypeClassifier.output_1"]),
	    (pdf_converter, "PdfConverter", ["FileTypeClassifier.output_2"]),
	    (docx_converter, "DocxConverter", ["FileTypeClassifier.output_4"]),
	    (preprocessor, "Preprocessor", ["TextConverter", "PdfConverter", "DocxConverter"]),
	):
	    indexing_pipeline_with_classification.add_node(
	        component=_component, name=_name, inputs=_inputs
	    )

	def set_state_if_absent(key, value):
	    """Store ``value`` under ``key`` in the Streamlit session state unless the key already exists."""
	    if key in st.session_state:
	        return
	    st.session_state[key] = value

	# Question and answer shown when the UI first loads; both can be overridden
	# through environment variables of the same name.
	DEFAULT_QUESTION_AT_STARTUP = os.environ.get(
	    "DEFAULT_QUESTION_AT_STARTUP",
	    "My blog post discusses remote work. Give me statistics.",
	)
	DEFAULT_ANSWER_AT_STARTUP = os.environ.get(
	    "DEFAULT_ANSWER_AT_STARTUP",
	    "7% more remote workers have been at their current organization for 5 years or fewer",
	)

	# Initial slider positions, also overridable through the environment.
	DEFAULT_DOCS_FROM_RETRIEVER = int(os.environ.get("DEFAULT_DOCS_FROM_RETRIEVER", "3"))
	DEFAULT_NUMBER_OF_ANSWERS = int(os.environ.get("DEFAULT_NUMBER_OF_ANSWERS", "3"))


	st.set_page_config(
	    page_title="Haystack Demo",
	    page_icon="https://haystack.deepset.ai/img/HaystackIcon.png",
	)

	# Seed the session state on first load only; existing values are kept.
	for _state_key, _state_default in (
	    ("question", DEFAULT_QUESTION_AT_STARTUP),
	    ("answer", DEFAULT_ANSWER_AT_STARTUP),
	    ("results", None),
	):
	    set_state_if_absent(_state_key, _state_default)


	# Small callback to reset the interface in case the text of the question changes
	def reset_results(*args):
	    """Set the ``answer``, ``results`` and ``raw_json`` session entries to ``None``.

	    Passed to widgets as their ``on_change`` callback; positional arguments
	    are accepted and ignored.
	    """
	    for state_key in ("answer", "results", "raw_json"):
	        st.session_state[state_key] = None

	# Page heading followed by the introductory blurb.
	st.write("# Haystack Search Demo")
	page_intro = """
	This demo takes its data from two sample data csv with statistics on various topics. \n
	Ask any question on this topic and see if Haystack can find the correct answer to your query! \n
	Note: do not use keywords, but full-fledged questions. The demo is not optimized to deal with keyword queries and might misunderstand you.
	"""
	st.markdown(page_intro, unsafe_allow_html=True)

	# Sidebar
	st.sidebar.header("Options")
	st.sidebar.write("## File Upload:")
	data_files = st.sidebar.file_uploader(
	    "upload", type=["pdf", "txt", "docx"], accept_multiple_files=True, label_visibility="hidden"
	)
	# Paths of the files saved during this run and one metadata dict per file,
	# in the same order.
	ALL_FILES = []
	META_DATA = []
	for data_file in data_files or []:
	    if not data_file:
	        continue
	    # The file name is supplied by the client: keep only its last path
	    # component so the saved file cannot land outside FILE_UPLOAD_PATH.
	    # The random hex prefix keeps same-named uploads from overwriting each other.
	    safe_name = Path(data_file.name).name
	    file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{safe_name}"
	    with open(file_path, "wb") as f:
	        f.write(data_file.getbuffer())
	    ALL_FILES.append(file_path)
	    st.sidebar.write(str(data_file.name) + "    ✅ ")
	    META_DATA.append({"filename":data_file.name})


	if ALL_FILES:
	    # Convert and split the uploaded files into documents.
	    docs = indexing_pipeline_with_classification.run(file_paths=ALL_FILES, meta=META_DATA)["documents"]
	    # Number of documents embedded and upserted per request.
	    batch_size = 64
	    # NOTE(review): each record carries only (id, embedding, doc.meta); the
	    # passage text itself is not part of the upserted record. Confirm the
	    # document store can still rebuild documents for the reader from these.
	    with st.spinner(
	        "🧠    Performing indexing of uploaded documents... \n "
	    ):
	        for start in range(0, len(docs), batch_size):
	            # Slicing clamps at the end of the list, so the last batch may be shorter.
	            batch_docs = docs[start:start + batch_size]
	            # Embed the text of every document in the batch.
	            emb = sentence_encoder.encode([doc.content for doc in batch_docs]).tolist()
	            # One (id, vector, metadata) record per document.
	            to_upsert = [(doc.id, vector, doc.meta) for doc, vector in zip(batch_docs, emb)]
	            # Insert or overwrite these records in the Pinecone index.
	            index.upsert(vectors=to_upsert)

	# Both sidebar sliders run from 1 to 10 in steps of 1 and clear any displayed
	# results when moved; only the label and the starting value differ.
	_slider_common = {
	    "min_value": 1,
	    "max_value": 10,
	    "step": 1,
	    "on_change": reset_results,
	}
	top_k_reader = st.sidebar.slider(
	    "Max. number of answers", value=DEFAULT_NUMBER_OF_ANSWERS, **_slider_common
	)
	top_k_retriever = st.sidebar.slider(
	    "Max. number of documents from retriever", value=DEFAULT_DOCS_FROM_RETRIEVER, **_slider_common
	)
	# data_files = st.file_uploader(
	# "upload", type=["csv"], accept_multiple_files=True, label_visibility="hidden"
	# )
	# for data_file in data_files:
	# # Upload file
	# if data_file:
	# raw_json = upload_doc(data_file)

	# Question box, pre-filled from the session state; editing it clears old results.
	question = st.text_input(
	    label="question",
	    label_visibility="hidden",
	    value=st.session_state.question,
	    max_chars=100,
	    on_change=reset_results,
	)

	# Two equal columns; each gets the CSS rule that stretches buttons to full width.
	col1, col2 = st.columns(2)
	for _column in (col1, col2):
	    _column.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)

	# Run button
	run_pressed = col1.button("Run")
	# Query only on an explicit button press with a non-empty question. The former
	# `run_query` flag was computed inside `if run_pressed:` and was therefore
	# always true, so the button state is tested directly.
	if run_pressed and question:
	    reset_results()
	    st.session_state.question = question

	    with st.spinner(
	        "🧠    Performing neural search on documents... \n "
	    ):
	        try:
	            st.session_state.results = query(
	                pipe, question, top_k_reader=top_k_reader, top_k_retriever=top_k_retriever
	            )
	        except JSONDecodeError:
	            st.error("👓    An error occurred reading the results. Is the document store working?")
	        except Exception as e:
	            # Top-level UI boundary: log the traceback, then show a readable message.
	            logging.exception(e)
	            if "The server is busy processing requests" in str(e) or "503" in str(e):
	                st.error("🧑‍🌾    All our workers are busy! Try again later.")
	            else:
	                st.error(f"🐞    An error occurred during the request. {str(e)}")


	if st.session_state.results:

	    st.write("## Results:")

	    for result in st.session_state.results['answers']:
	        # Treat a missing answer or context as empty text instead of crashing on None.
	        answer = result.answer or ""
	        context = result.context or ""
	        start_idx = context.find(answer)
	        if start_idx < 0:
	            # The answer does not occur verbatim in the context: show the
	            # highlighted answer followed by the whole context, rather than
	            # slicing with -1 and dropping or duplicating characters.
	            before, after = "", context
	        else:
	            before, after = context[:start_idx], context[start_idx + len(answer):]

	        # Prefer a titled link when the metadata provides one; otherwise fall
	        # back to the uploaded file name. This replaces a bare `except:` that
	        # also hid unrelated rendering errors.
	        meta = result.meta or {}
	        if "Title" in meta and "link" in meta:
	            header = f"Source: [{meta['Title']}]({meta['link']})"
	        else:
	            header = f"From file: {meta.get('filename', '')}"

	        highlighted = str(annotation(answer, "ANSWER", "#8ef"))
	        # Hack due to this bug: https://github.com/streamlit/streamlit/issues/3190
	        st.write(
	            markdown(f'{header} \n {before} {highlighted} {after} \n '),
	            unsafe_allow_html=True,
	        )