"""Streamlit chat app that answers questions about PDFs in a local folder.

On first run the app reads every PDF in the documents folder, splits the
extracted text into chunks, indexes the chunks in a FAISS vector store using
HuggingFace sentence-transformer embeddings, and wires the store into a
LangChain conversational retrieval chain backed by an OpenAI-compatible
Grok endpoint.
"""
import html
import os
from contextlib import ExitStack

import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

from htmlTemplates import css, bot_template, user_template


def get_pdf_text(pdf_docs):
    """Return the concatenated text of every page of every PDF in *pdf_docs*.

    Args:
        pdf_docs: Iterable of objects accepted by ``PdfReader`` (the caller
            in this file passes binary file objects).

    Returns:
        A single string with the extracted page texts joined without a
        separator. Pages for which ``extract_text()`` returns nothing are
        skipped.
    """
    pieces = []
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            extracted_text = page.extract_text()
            if extracted_text:
                pieces.append(extracted_text)
    # join once instead of repeated += to avoid quadratic string building
    return "".join(pieces)


def get_text_chunks(text):
    """Split *text* into overlapping chunks on newline boundaries.

    Uses a ``CharacterTextSplitter`` configured with a chunk size of 1000,
    an overlap of 200, and ``len`` as the length function.

    Returns:
        The list of chunks produced by ``split_text``.
    """
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return text_splitter.split_text(text)


def get_vectorstore(text_chunks, huggingface_api_key):
    """Build a FAISS vector store from *text_chunks*.

    Args:
        text_chunks: List of strings to embed and index.
        huggingface_api_key: Token forwarded to the embedding model as
            ``use_auth_token``.

    Returns:
        The ``FAISS`` store created by ``FAISS.from_texts``.
    """
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"use_auth_token": huggingface_api_key},
    )
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)


def get_conversation_chain(vectorstore, grok_api_key, grok_api_base):
    """Create the conversational retrieval chain used to answer questions.

    Args:
        vectorstore: Store whose ``as_retriever()`` supplies context.
        grok_api_key: API key passed to ``ChatOpenAI`` as ``openai_api_key``.
        grok_api_base: Base URL passed to ``ChatOpenAI`` as
            ``openai_api_base``.

    Returns:
        A ``ConversationalRetrievalChain`` with a buffer memory stored under
        the ``chat_history`` key.
    """
    llm = ChatOpenAI(
        openai_api_key=grok_api_key,
        openai_api_base=grok_api_base,
        model_name="grok-beta",
        temperature=0.5,
    )
    memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )


def handle_userinput(user_question):
    """Send *user_question* to the chain and store the updated history.

    Shows a warning and returns without calling the chain when
    ``st.session_state.conversation`` has not been initialised yet.
    """
    if st.session_state.conversation is None:
        st.warning("Documents are still being processed. Please wait.")
        return

    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']


def on_enter():
    """Callback for the text input: answer the question, then clear the box."""
    user_question = st.session_state.user_question
    if user_question:
        handle_userinput(user_question)
        st.session_state.user_question = ""  # Clear the input box


def load_and_process_pdfs(folder_path, huggingface_api_key, grok_api_key, grok_api_base):
    """Index every PDF in *folder_path* and initialise the conversation chain.

    On success the chain is stored in ``st.session_state.conversation``.
    Problems (missing folder, no PDFs, no extractable text) are reported
    through ``st.error`` and leave the session state untouched.

    Args:
        folder_path: Directory to scan for files ending in ``.pdf``
            (case-insensitive).
        huggingface_api_key: Token forwarded to ``get_vectorstore``.
        grok_api_key: API key forwarded to ``get_conversation_chain``.
        grok_api_base: API base URL forwarded to ``get_conversation_chain``.
    """
    if not os.path.isdir(folder_path):
        # os.listdir would raise here; report it in the UI instead
        st.error(f"Documents directory not found: {folder_path}")
        return

    # sorted() makes the processing order deterministic; os.listdir order
    # is arbitrary
    pdf_docs = sorted(
        os.path.join(folder_path, file)
        for file in os.listdir(folder_path)
        if file.lower().endswith('.pdf')
    )
    if not pdf_docs:
        st.error(f"No PDF files found in the directory: {folder_path}")
        return

    with st.spinner("Processing documents..."):
        # Extract text from PDFs
        with st.spinner("Extracting text from PDFs..."):
            # ExitStack closes every opened file even if opening a later
            # file or parsing a PDF raises
            with ExitStack() as stack:
                pdf_file_objects = [
                    stack.enter_context(open(file, 'rb')) for file in pdf_docs
                ]
                raw_text = get_pdf_text(pdf_file_objects)

        # Split text into chunks
        with st.spinner("Splitting text into chunks..."):
            text_chunks = get_text_chunks(raw_text)

        if not text_chunks:
            # nothing to index (e.g. scanned PDFs without a text layer);
            # building a vector store from an empty list would fail
            st.error(f"No extractable text found in the PDF files in: {folder_path}")
            return

        # Create vector store using HuggingFace embeddings
        with st.spinner("Creating vector store..."):
            vectorstore = get_vectorstore(text_chunks, huggingface_api_key)

        # Initialize conversation chain with the Grok LLM
        with st.spinner("Initializing conversation chain..."):
            st.session_state.conversation = get_conversation_chain(
                vectorstore, grok_api_key, grok_api_base
            )

    st.success("Documents processed successfully!")


def display_chat_history():
    """Render the stored chat history using the HTML message templates.

    Messages at even positions use ``user_template`` and messages at odd
    positions use ``bot_template``. Message text is HTML-escaped before
    being substituted for ``{{MSG}}`` because the result is rendered with
    ``unsafe_allow_html=True``.
    """
    if st.session_state.chat_history:
        for i, message in enumerate(st.session_state.chat_history):
            template = user_template if i % 2 == 0 else bot_template
            # escape so user/model text cannot inject markup into the page
            safe_content = html.escape(message.content)
            st.markdown(
                template.replace("{{MSG}}", safe_content),
                unsafe_allow_html=True,
            )

        # Placeholder for a script that scrolls the page to the bottom.
        # NOTE(review): the literal is whitespace only, so this currently
        # injects nothing — confirm whether the script body was lost.
        scroll_script = """ """
        st.markdown(scroll_script, unsafe_allow_html=True)


def main():
    """Configure the page, build the index on first run, and draw the UI."""
    load_dotenv()

    # Retrieve credentials from .env
    grok_api_key = os.getenv("GROK_API_KEY")
    grok_api_base = "https://api.x.ai/v1"  # Grok API base URL
    huggingface_api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN")

    st.set_page_config(page_title="Chat with AI Tax Agent", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []

    # Title Section
    st.header("Chat with AI Tax Agent :books:")

    # Automatically load and process PDFs until a chain has been created
    if st.session_state.conversation is None:
        documents_folder = "./documents/"  # Specify your documents folder path here
        load_and_process_pdfs(
            documents_folder, huggingface_api_key, grok_api_key, grok_api_base
        )

    # Chat History Section
    display_chat_history()

    # Input Box Section
    st.text_input(
        "Ask a question about your documents:",
        key='user_question',
        on_change=on_enter,
    )


if __name__ == '__main__':
    main()