Spaces:

AjiNiktech
/

Document_search

Sleeping

App Files Files Community

Document_search / app.py

AjiNiktech

Update app.py

8b88b36 verified about 1 year ago

raw

history blame contribute delete

9.5 kB

	import os
	import tempfile
	import uuid

	import dotenv
	import streamlit as st
	from langchain.chains.combine_documents import create_stuff_documents_chain
	from langchain.memory import ConversationBufferMemory
	from langchain_chroma import Chroma
	from langchain_community.document_loaders import TextLoader, PyPDFLoader, CSVLoader, UnstructuredPowerPointLoader, UnstructuredWordDocumentLoader, UnstructuredExcelLoader
	from langchain_core.messages import HumanMessage, AIMessage
	from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
	from langchain_openai import ChatOpenAI, OpenAIEmbeddings
	from langchain_text_splitters import RecursiveCharacterTextSplitter

	# Browser-tab title and wide layout for the whole page
	st.set_page_config(layout="wide", page_title="Enterprise document search + chat")

	# Heading shown at the top of the page
	st.title("Enterprise document helpdesk")

	# Per-session flag: becomes True once an API key has been typed in the sidebar
	if not ('api_key_entered' in st.session_state):
	    st.session_state['api_key_entered'] = False

	# Sidebar: API key entry, then (once a key has been entered) the file uploader
	with st.sidebar:
	    st.header("Configuration")
	    api_key = st.text_input("Enter your OpenAI API Key:", type="password")
	    if api_key:
	        # The key is exported through the environment variable and the
	        # session is flagged so the rest of the page unlocks.
	        os.environ["OPENAI_API_KEY"] = api_key
	        st.session_state.api_key_entered = True

	    if st.session_state.api_key_entered:
	        st.header('Document Upload and Processing')
	        # 'docx' is accepted here because load_file() handles '.docx'
	        # alongside '.doc'; without it Word .docx files could never be uploaded.
	        uploaded_files = st.file_uploader(
	            'Upload your files',
	            accept_multiple_files=True,
	            type=['txt', 'pdf', 'csv', 'ppt', 'doc', 'docx', 'xls', 'pptx', 'xlsx'],
	        )

	def load_file(file):
	    """Load one uploaded file through the loader matching its extension.

	    The upload's bytes are written to a temporary file because the loaders
	    are constructed from a file path. That temporary file is always removed
	    before this function returns or raises.

	    Args:
	        file: Uploaded file object exposing ``name`` and ``getvalue()``.

	    Returns:
	        The value returned by the selected loader's ``load()``.

	    Raises:
	        ValueError: If the (lower-cased) file extension is not supported.
	    """
	    file_extension = os.path.splitext(file.name)[1].lower()

	    # Reject unknown types up front so no temporary file is created for them.
	    supported_extensions = (
	        '.txt', '.pdf', '.csv', '.ppt', '.pptx', '.doc', '.docx', '.xls', '.xlsx',
	    )
	    if file_extension not in supported_extensions:
	        raise ValueError(f"Unsupported file type: {file_extension}")

	    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=file_extension)
	    temp_file_path = temp_file.name
	    try:
	        # Close the handle before a loader re-opens the path.
	        with temp_file:
	            temp_file.write(file.getvalue())

	        if file_extension == '.txt':
	            loader = TextLoader(temp_file_path)
	        elif file_extension == '.pdf':
	            loader = PyPDFLoader(temp_file_path)
	        elif file_extension == '.csv':
	            loader = CSVLoader(temp_file_path)
	        elif file_extension in ['.ppt', '.pptx']:
	            loader = UnstructuredPowerPointLoader(temp_file_path)
	        elif file_extension in ['.doc', '.docx']:
	            loader = UnstructuredWordDocumentLoader(temp_file_path)
	        else:
	            # Only '.xls' / '.xlsx' remain after the validation above.
	            loader = UnstructuredExcelLoader(temp_file_path)

	        return loader.load()
	    finally:
	        # Runs on success and on any failure, so the temp file never leaks.
	        os.unlink(temp_file_path)

	def summarize_documents(documents):
	    """Ask the chat model for a concise summary of the given documents.

	    The ``page_content`` of every document is joined with single spaces and
	    sent to the model in one prompt.

	    Args:
	        documents: Iterable of objects exposing a ``page_content`` string.

	    Returns:
	        The ``content`` of the model's response, or a fixed notice when
	        there is no text to summarize (no model call is made in that case).
	    """
	    combined_text = " ".join(doc.page_content for doc in documents)

	    # Nothing to summarize: skip the model call instead of sending a prompt
	    # that wraps no document text.
	    if not combined_text.strip():
	        return "No document content available to summarize."

	    # NOTE(review): all text goes into a single prompt; very large uploads
	    # may exceed what the model accepts -- confirm whether a limit is needed.
	    chat = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0.2)

	    prompt = f"""Summarize the following document in a concise manner, highlighting the key points:

	{combined_text}

	Summary:"""

	    response = chat.invoke(prompt)
	    return response.content

	# Process uploaded files: split them into chunks, optionally summarize them
	if uploaded_files:
	    if st.button("Process Documents"):
	        with st.spinner("Processing documents..."):
	            all_documents = []
	            for file in uploaded_files:
	                all_documents.extend(load_file(file))

	            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
	            all_splits = text_splitter.split_documents(all_documents)

	            # Store processed documents in session state
	            st.session_state.processed_documents = all_splits

	            # A summary generated earlier describes the previous document
	            # set, so drop it rather than show it next to the new documents.
	            if 'document_summary' in st.session_state:
	                del st.session_state['document_summary']

	            st.success("Documents processed successfully!")

	    # Add a button for summarization
	    if st.button("Generate Summary"):
	        if 'processed_documents' not in st.session_state:
	            # Without this guard, summarizing before processing fails on the
	            # missing session-state entry.
	            st.warning("Please process the documents before generating a summary.")
	        else:
	            with st.spinner("Generating summary..."):
	                summary = summarize_documents(st.session_state.processed_documents)
	                st.session_state.document_summary = summary
	                st.success("Summary generated successfully!")

	    # Display the summary if it exists
	    if 'document_summary' in st.session_state:
	        st.subheader("Document Summary")
	        st.write(st.session_state.document_summary)

	# Main app logic: runs only after an API key has been entered
	if st.session_state.api_key_entered:

	    @st.cache_resource
	    def initialize_components(api_key=None):
	        """Create the chat model and the embedding client.

	        Streamlit caches the result per distinct ``api_key`` argument, so
	        entering a different key yields fresh clients instead of reusing
	        ones that were built while an earlier key was in effect.

	        Args:
	            api_key: OpenAI API key handed to both clients. When omitted or
	                empty, no key is passed and the clients resolve credentials
	                on their own.

	        Returns:
	            A ``(chat, embeddings)`` tuple.
	        """
	        dotenv.load_dotenv()
	        credentials = {"api_key": api_key} if api_key else {}
	        chat = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0.2, **credentials)
	        embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", **credentials)
	        return chat, embeddings

	    # Load components for the key currently typed in the sidebar
	    chat, embeddings = initialize_components(api_key or None)

	    # Create vectorstore and retriever only if documents are processed
	    if 'processed_documents' in st.session_state:
	        processed_documents = st.session_state.processed_documents

	        # Streamlit re-executes this script on every interaction. Build the
	        # vector store once per processed document set (tracked by object
	        # identity) instead of re-embedding everything on each rerun.
	        if st.session_state.get("vectorstore_source") is not processed_documents:
	            st.session_state.vectorstore = Chroma.from_documents(
	                documents=processed_documents,
	                embedding=embeddings,
	                # A unique name keeps this build from mixing with earlier
	                # builds that would otherwise share the default collection.
	                collection_name=f"docs-{uuid.uuid4().hex}",
	            )
	            st.session_state.vectorstore_source = processed_documents

	        # The number of hits is configured through search_kwargs.
	        retriever = st.session_state.vectorstore.as_retriever(search_kwargs={"k": 4})

	        # Prior turns are supplied once, via the "chat_history" placeholder
	        # below, so the system text itself only carries the retrieved context.
	        SYSTEM_TEMPLATE = """
	You are an advanced AI assistant designed for document search and chatbot functionality. Your primary functions are:

	1. Process and structure multiple documents in various formats, including:
	.txt, .pdf, .csv, .ppt, .doc, .xls, .pptx, and .xlsx

	2. Extract and organize information from these unstructured documents into a coherent, searchable format.

	3. Retrieve relevant information from the processed documents based on user queries.

	4. Act as a chatbot, engaging in conversations about the content of the documents.

	5. Provide accurate and contextual responses to user questions, drawing solely from the information contained within the processed documents.

	6. If a user's question is not related to the content of the provided documents, politely inform them that you can only answer questions based on the information in the given documents.

	7. When answering, cite the specific document or section where the information was found, if possible.

	8. If there's ambiguity in a query, ask for clarification to ensure you provide the most relevant information.

	9. Maintain confidentiality and do not share or discuss information from one user's documents with other users.

	Remember, your knowledge is limited to the content of the documents you've been given to process. Do not provide information or answer questions that are outside the scope of these documents. Always strive for accuracy and relevance in your responses.

	<context>
	{context}
	</context>
	"""

	        question_answering_prompt = ChatPromptTemplate.from_messages(
	            [
	                (
	                    "system",
	                    SYSTEM_TEMPLATE,
	                ),
	                MessagesPlaceholder(variable_name="chat_history"),
	                MessagesPlaceholder(variable_name="messages"),
	            ]
	        )

	        document_chain = create_stuff_documents_chain(chat, question_answering_prompt)

	        # Initialize memory for each session
	        if "memory" not in st.session_state:
	            st.session_state.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

	        # Chat interface
	        st.subheader("Chat with Assistant")

	        # Initialize chat history
	        if "messages" not in st.session_state:
	            st.session_state.messages = []

	        # Display chat messages from history on app rerun
	        for message in st.session_state.messages:
	            with st.chat_message(message["role"]):
	                st.markdown(message["content"])

	        # React to user input
	        if prompt := st.chat_input("What would you like to know about Document?"):
	            # Display user message in chat message container
	            st.chat_message("user").markdown(prompt)
	            # Add user message to chat history
	            st.session_state.messages.append({"role": "user", "content": prompt})

	            with st.chat_message("assistant"):
	                message_placeholder = st.empty()

	                # Retrieve relevant documents (invoke() replaces the
	                # deprecated get_relevant_documents())
	                docs = retriever.invoke(prompt)

	                # Generate response
	                response = document_chain.invoke(
	                    {
	                        "context": docs,
	                        "chat_history": st.session_state.memory.load_memory_variables({})["chat_history"],
	                        "messages": [
	                            HumanMessage(content=prompt)
	                        ],
	                    }
	                )

	                # The response is already a string, so we can use it directly
	                full_response = response
	                message_placeholder.markdown(full_response)

	            # Add assistant response to chat history
	            st.session_state.messages.append({"role": "assistant", "content": full_response})

	            # Update memory
	            st.session_state.memory.save_context({"input": prompt}, {"output": full_response})

	    else:
	        st.info("Please upload and process documents to start chatting.")

	else:
	    st.info("Please enter your OpenAI API Key in the sidebar to start.")

	# Footer: a horizontal rule followed by the attribution line
	for footer_line in ("---", "By AI Planet"):
	    st.markdown(footer_line)