Spaces:
Sleeping
Sleeping
File size: 9,498 Bytes
cf5ad3d d8f0bda cf5ad3d d8f0bda edb80a3 fe58b7c d8f0bda cf5ad3d edb80a3 cf5ad3d 025429e cf5ad3d 8b88b36 aa5df8c d8f0bda 8b88b36 d8f0bda 8b88b36 d8f0bda 8b88b36 aa5df8c 8b88b36 aa5df8c 8b88b36 aa5df8c 8b88b36 aa5df8c 8b88b36 aa5df8c 8b88b36 d8f0bda aa5df8c 8b88b36 b3b3691 3a30a49 b3b3691 aa5df8c 3a30a49 b3b3691 3a30a49 b3b3691 3a30a49 b3b3691 3a30a49 b3b3691 3a30a49 b3b3691 3a30a49 b3b3691 3a30a49 b3b3691 3a30a49 b3b3691 3a30a49 b3b3691 d8f0bda b3b3691 d8f0bda b3b3691 d8f0bda b3b3691 d8f0bda b3b3691 d8f0bda b3b3691 d8f0bda b3b3691 d8f0bda fe58b7c d8f0bda 8b88b36 cf5ad3d 8b88b36 cf5ad3d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 |
import os
import tempfile
import uuid

import dotenv
import streamlit as st
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.memory import ConversationBufferMemory
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader, PyPDFLoader, CSVLoader, UnstructuredPowerPointLoader, UnstructuredWordDocumentLoader, UnstructuredExcelLoader
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Set page config
st.set_page_config(page_title="Enterprise document search + chat", layout="wide")

# Streamlit app header
st.title("Enterprise document helpdesk")

# Initialize session state
if 'api_key_entered' not in st.session_state:
    st.session_state.api_key_entered = False

# Bound up front so code further down can test `uploaded_files` even on runs
# where the uploader widget is not rendered (no API key entered yet).
uploaded_files = []

# Sidebar: API key entry and document upload.
with st.sidebar:
    st.header("Configuration")
    api_key = st.text_input("Enter your OpenAI API Key:", type="password")
    if api_key:
        # NOTE(review): os.environ is shared by the whole server process, so on
        # a multi-user deployment the most recently entered key is visible to
        # every session. Consider passing the key to the OpenAI clients
        # explicitly instead.
        os.environ["OPENAI_API_KEY"] = api_key
        st.session_state.api_key_entered = True

    if st.session_state.api_key_entered:
        st.header('Document Upload and Processing')
        # 'docx' is accepted here because load_file already routes '.docx'
        # files to the Word loader; without it such files could not be uploaded.
        uploaded_files = st.file_uploader(
            'Upload your files',
            accept_multiple_files=True,
            type=['txt', 'pdf', 'csv', 'ppt', 'doc', 'docx', 'xls', 'pptx', 'xlsx'],
        )
def load_file(file):
    """Load an uploaded file into a list of documents.

    The upload is written to a temporary file (the loaders take a filesystem
    path), parsed with the loader matching its extension, and the temporary
    file is removed again whether or not parsing succeeds.

    Args:
        file: Uploaded file object exposing ``name`` and ``getvalue()``.

    Returns:
        Whatever the selected loader's ``load()`` returns.

    Raises:
        ValueError: If the file extension is not one of the supported types.
    """
    file_extension = os.path.splitext(file.name)[1].lower()

    # Reject unsupported types before touching the filesystem.
    supported_extensions = {
        '.txt', '.pdf', '.csv', '.ppt', '.pptx', '.doc', '.docx', '.xls', '.xlsx',
    }
    if file_extension not in supported_extensions:
        raise ValueError(f"Unsupported file type: {file_extension}")

    # delete=False: the file must outlive the handle so the loader can reopen
    # it by path; cleanup is done explicitly in the finally clause below.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=file_extension)
    temp_file_path = temp_file.name
    try:
        with temp_file:
            temp_file.write(file.getvalue())

        if file_extension == '.txt':
            loader = TextLoader(temp_file_path)
        elif file_extension == '.pdf':
            loader = PyPDFLoader(temp_file_path)
        elif file_extension == '.csv':
            loader = CSVLoader(temp_file_path)
        elif file_extension in ('.ppt', '.pptx'):
            loader = UnstructuredPowerPointLoader(temp_file_path)
        elif file_extension in ('.doc', '.docx'):
            loader = UnstructuredWordDocumentLoader(temp_file_path)
        else:  # '.xls' / '.xlsx'
            loader = UnstructuredExcelLoader(temp_file_path)

        return loader.load()
    finally:
        # Runs on success and on any failure, so a broken upload or a loader
        # error no longer leaves the temporary file behind.
        os.unlink(temp_file_path)
def summarize_documents(documents):
    """Return a concise summary of the given documents.

    The ``page_content`` of every document is joined with single spaces and
    sent to the chat model in one prompt.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The model's summary text, or a short explanatory message when the
        documents contain no text at all.
    """
    combined_text = " ".join(doc.page_content for doc in documents)

    # Nothing to summarize (no documents, or e.g. image-only files that yield
    # empty text): skip the model call rather than prompting it with nothing.
    if not combined_text.strip():
        return "No text content was found in the documents to summarize."

    chat = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0.2)
    prompt = f"""Summarize the following document in a concise manner, highlighting the key points:
{combined_text}
Summary:"""
    response = chat.invoke(prompt)
    return response.content
# Process uploaded files
# NOTE(review): the original indentation of this section was lost. It is
# assumed to render in the sidebar, directly under the uploader — confirm.
# The api_key_entered check comes first so `uploaded_files` is only evaluated
# on runs where the uploader has actually been rendered.
if st.session_state.api_key_entered and uploaded_files:
    with st.sidebar:
        if st.button("Process Documents"):
            with st.spinner("Processing documents..."):
                all_documents = []
                for file in uploaded_files:
                    all_documents.extend(load_file(file))
                text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
                all_splits = text_splitter.split_documents(all_documents)
                # Store processed documents in session state
                st.session_state.processed_documents = all_splits
                # A summary generated for a previous batch no longer describes
                # the current documents, so drop it.
                if 'document_summary' in st.session_state:
                    del st.session_state['document_summary']
            st.success("Documents processed successfully!")

        # Add a button for summarization
        if st.button("Generate Summary"):
            # The button is visible before anything has been processed; guard
            # the session-state read instead of failing on a missing key.
            if 'processed_documents' not in st.session_state:
                st.warning("Please process the documents before generating a summary.")
            else:
                with st.spinner("Generating summary..."):
                    summary = summarize_documents(st.session_state.processed_documents)
                    st.session_state.document_summary = summary
                st.success("Summary generated successfully!")

        # Display the summary if it exists
        if 'document_summary' in st.session_state:
            st.subheader("Document Summary")
            st.write(st.session_state.document_summary)
# Main app logic
# NOTE(review): the original indentation of this section was lost; the nesting
# below is reconstructed from the if/else pairing — confirm.

# System prompt for the question-answering chain. The conversation history is
# intentionally not interpolated into this text: it is already supplied to the
# model as real messages through the "chat_history" MessagesPlaceholder, and
# formatting it here as well sent every past turn twice.
SYSTEM_TEMPLATE = """
You are an advanced AI assistant designed for document search and chatbot functionality. Your primary functions are:
1. Process and structure multiple documents in various formats, including:
.txt, .pdf, .csv, .ppt, .doc, .xls, .pptx, and .xlsx
2. Extract and organize information from these unstructured documents into a coherent, searchable format.
3. Retrieve relevant information from the processed documents based on user queries.
4. Act as a chatbot, engaging in conversations about the content of the documents.
5. Provide accurate and contextual responses to user questions, drawing solely from the information contained within the processed documents.
6. If a user's question is not related to the content of the provided documents, politely inform them that you can only answer questions based on the information in the given documents.
7. When answering, cite the specific document or section where the information was found, if possible.
8. If there's ambiguity in a query, ask for clarification to ensure you provide the most relevant information.
9. Maintain confidentiality and do not share or discuss information from one user's documents with other users.
Remember, your knowledge is limited to the content of the documents you've been given to process. Do not provide information or answer questions that are outside the scope of these documents. Always strive for accuracy and relevance in your responses.
<context>
{context}
</context>
"""

if st.session_state.api_key_entered:
    # Initialize components
    @st.cache_resource
    def initialize_components(openai_api_key=None):
        """Create the chat model and the embedding model.

        Args:
            openai_api_key: Used only as the cache key. The clients read the
                key from the environment; passing the current value here makes
                the cache rebuild them when the user enters a different key,
                instead of reusing clients created with the first one.

        Returns:
            A ``(chat, embeddings)`` tuple.
        """
        dotenv.load_dotenv()
        chat = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0.2)
        embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
        return chat, embeddings

    # Load components
    chat, embeddings = initialize_components(os.environ.get("OPENAI_API_KEY"))

    # Create vectorstore and retriever only if documents are processed
    processed_documents = st.session_state.get("processed_documents")
    if processed_documents:
        # Streamlit reruns this script on every interaction. Build the index
        # once per processed batch (tracked by object identity) rather than
        # re-embedding every chunk on each chat message.
        if st.session_state.get("vectorstore_source") is not processed_documents:
            st.session_state.vectorstore = Chroma.from_documents(
                documents=processed_documents,
                embedding=embeddings,
                # A fresh collection per build keeps batches isolated instead
                # of accumulating chunks in one shared default collection.
                collection_name=f"docs-{uuid.uuid4().hex}",
            )
            st.session_state.vectorstore_source = processed_documents

        # The number of retrieved chunks is configured through search_kwargs.
        retriever = st.session_state.vectorstore.as_retriever(search_kwargs={"k": 4})

        question_answering_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", SYSTEM_TEMPLATE),
                MessagesPlaceholder(variable_name="chat_history"),
                MessagesPlaceholder(variable_name="messages"),
            ]
        )
        document_chain = create_stuff_documents_chain(chat, question_answering_prompt)

        # Initialize memory for each session
        if "memory" not in st.session_state:
            st.session_state.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

        # Chat interface
        st.subheader("Chat with Assistant")

        # Initialize chat history
        if "messages" not in st.session_state:
            st.session_state.messages = []

        # Display chat messages from history on app rerun
        for message in st.session_state.messages:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])

        # React to user input
        if prompt := st.chat_input("What would you like to know about Document?"):
            # Display user message in chat message container
            st.chat_message("user").markdown(prompt)
            # Add user message to chat history
            st.session_state.messages.append({"role": "user", "content": prompt})

            with st.chat_message("assistant"):
                message_placeholder = st.empty()
                # Retrieve relevant documents (invoke replaces the deprecated
                # get_relevant_documents).
                docs = retriever.invoke(prompt)
                # Generate response
                response = document_chain.invoke(
                    {
                        "context": docs,
                        "chat_history": st.session_state.memory.load_memory_variables({})["chat_history"],
                        "messages": [HumanMessage(content=prompt)],
                    }
                )
                # The response is already a string, so we can use it directly
                full_response = response
                message_placeholder.markdown(full_response)

            # Add assistant response to chat history
            st.session_state.messages.append({"role": "assistant", "content": full_response})
            # Update memory
            st.session_state.memory.save_context({"input": prompt}, {"output": full_response})
    else:
        st.info("Please upload and process documents to start chatting.")
else:
    st.info("Please enter your OpenAI API Key in the sidebar to start.")

# Add a footer
st.markdown("---")
st.markdown("By AI Planet")