import os
import tempfile

from dotenv import load_dotenv
import streamlit as st
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader, CSVLoader

# Load environment variables
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
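# NOTE: OpenAIEmbeddings and ChatOpenAI read OPENAI_API_KEY from the
# environment on their own, so loading the .env file is enough; the
# api_key variable above is not used elsewhere and is kept only for
# explicitness.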
# Custom template to guide the LLM when condensing follow-up questions
custom_template = """
<s>[INST] You will start the conversation by greeting the user and introducing yourself as an expert document analyst and assistant,
stating your availability for assistance. Your next step will depend on the user's response.
If the user asks for help with a PDF, DOCX, TXT, or CSV document, ask them to describe their question.
However, if the user asks questions out of context from the knowledge base, immediately thank them,
say goodbye, and end the conversation. Base your responses on the user's needs, providing accurate and
concise information about the data within the knowledge base. Your interactions should be professional and
focused, ensuring the user's queries are addressed efficiently without deviating from the set flows.
CHAT HISTORY: {chat_history}
QUESTION: {question}
ANSWER:
[/INST]
"""
CUSTOM_QUESTION_PROMPT = PromptTemplate.from_template(custom_template)
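# CUSTOM_QUESTION_PROMPT is passed to ConversationalRetrievalChain below as
# condense_question_prompt: the chain uses it to rewrite the follow-up
# question (together with the chat history) into a standalone query before
# running retrieval against the vectorstore.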
prompt_template = """<s>[INST]
Answer using the provided files stored in the knowledge base.
CONTEXT: {context}
CHAT HISTORY: {chat_history}
QUESTION: {question}
ANSWER:
[/INST]
"""
prompt = PromptTemplate(template=prompt_template,
                        input_variables=['context', 'question', 'chat_history'])
# Function to extract text from uploaded documents
def get_document_text(uploaded_files):
    documents = []
    for uploaded_file in uploaded_files:
        # Persist the upload to a temp file so the loaders can read it from disk
        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[-1]) as temp_file:
            temp_file.write(uploaded_file.read())
            temp_file_path = temp_file.name
        # Pick a loader based on the file extension
        if uploaded_file.name.endswith(".pdf"):
            loader = PyPDFLoader(temp_file_path)
        elif uploaded_file.name.endswith((".docx", ".doc")):
            loader = Docx2txtLoader(temp_file_path)
        elif uploaded_file.name.endswith(".txt"):
            loader = TextLoader(temp_file_path)
        elif uploaded_file.name.endswith(".csv"):
            loader = CSVLoader(temp_file_path)
        else:
            st.warning(f"Unsupported file type skipped: {uploaded_file.name}")
            os.remove(temp_file_path)
            continue
        documents.extend(loader.load())
        os.remove(temp_file_path)  # Clean up the temp file once loaded
    return documents
# Split document text into overlapping chunks
def get_chunks(documents):
    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=600, chunk_overlap=200, length_function=len)
    chunks = [chunk for doc in documents for chunk in text_splitter.split_text(doc.page_content)]
    return chunks
# Create vectorstore
def get_vectorstore(chunks):
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_texts(texts=chunks, embedding=embeddings)
    return vectorstore
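# The FAISS index above lives only in memory and is rebuilt on every
# "Process Documents" click. If persistence were needed, LangChain's FAISS
# wrapper supports saving and reloading, roughly:
#
#   vectorstore.save_local("faiss_index")                  # write index to disk
#   FAISS.load_local("faiss_index", OpenAIEmbeddings())    # reload it later
#
# ("faiss_index" is an illustrative directory name, not part of this app.)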
# Create a conversational chain
def get_conversationchain(vectorstore):
    llm = ChatOpenAI(temperature=0.1, model_name='gpt-4o-mini')
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        # Retrieve the 10 most similar chunks for each condensed question
        retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 10}),
        condense_question_prompt=CUSTOM_QUESTION_PROMPT,
        memory=memory,
        combine_docs_chain_kwargs={'prompt': prompt}
    )
    return conversation_chain
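# Note: ConversationBufferMemory keeps the full exchange in RAM for the
# session; memory_key='chat_history' must match the {chat_history} variable
# used in both prompt templates above.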
# Handle user questions and update chat history
def handle_question(question):
    if not st.session_state.conversation:
        st.warning("Please process your documents first.")
        return
    # Get the response from the conversation chain
    response = st.session_state.conversation({'question': question})
    # Update chat history
    st.session_state.chat_history = response['chat_history']
    # Display chat history (user and bot messages alternate)
    for i, msg in enumerate(st.session_state.chat_history):
        if i % 2 == 0:
            st.markdown(f"**You:** {msg.content}")
        else:
            st.markdown(f"**Bot:** {msg.content}")
# Main Streamlit app
def main():
    st.set_page_config(page_title="Chat with Documents", page_icon="📄")
    st.title("📄 Chat with Your Documents")
    st.sidebar.title("Upload Your Files")
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None
    # File uploader, restricted to the supported extensions
    uploaded_files = st.sidebar.file_uploader("Upload your files (PDF, DOCX, TXT, CSV):",
                                              type=["pdf", "docx", "doc", "txt", "csv"],
                                              accept_multiple_files=True)
    # Process button
    if st.sidebar.button("Process Documents"):
        if uploaded_files:
            with st.spinner("Processing documents..."):
                # Extract text, chunk it, embed it, and build the conversation chain
                raw_documents = get_document_text(uploaded_files)
                text_chunks = get_chunks(raw_documents)
                vectorstore = get_vectorstore(text_chunks)
                st.session_state.conversation = get_conversationchain(vectorstore)
            st.success("Documents processed successfully!")
        else:
            st.warning("Please upload at least one document.")
    # User input
    question = st.text_input("Ask a question about the uploaded documents:")
    if question:
        handle_question(question)


if __name__ == '__main__':
    main()
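# To run locally (assuming this file is saved as app.py and a .env file with
# OPENAI_API_KEY exists next to it):
#
#   pip install streamlit langchain openai faiss-cpu pypdf docx2txt python-dotenv
#   streamlit run app.py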