import os
import re

import fitz  # PyMuPDF
import gradio as gr
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_chroma import Chroma
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

openai_api_key = os.getenv("OPENAI_API_KEY")

# Module-level state shared across Gradio callbacks.
vectorstore = None
llm = None
qa_instance = None
chat_history = []  # Display-only history; the chain keeps its own conversational memory.


def extract_text_from_pdf(pdf_bytes):
    """Concatenate the text of every page of the uploaded PDF."""
    document = fitz.open(stream=pdf_bytes, filetype="pdf")
    text = ""
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        text += page.get_text()
    document.close()
    return text


def clean_text(text):
    """Normalize extracted text before chunking."""
    # Collapse all runs of whitespace (including newlines) into single spaces.
    cleaned_text = re.sub(r'\s+', ' ', text)
    # Collapse any character repeated three or more times (e.g. "-----" -> "-").
    cleaned_text = re.sub(r'(.)\1{2,}', r'\1', cleaned_text)
    # Collapse immediately repeated words ("the the" -> "the").
    cleaned_text = re.sub(r'\b(\w+)\b(?:\s+\1\b)+', r'\1', cleaned_text)
    return cleaned_text.strip()


def initialize_chatbot(cleaned_text, openai_api_key):
    global vectorstore, llm, qa_instance
    embeddings = OpenAIEmbeddings(api_key=openai_api_key)
    # Rebuild the index for every upload; reusing the previous vectorstore
    # would leave the chatbot answering from the first PDF forever.
    if vectorstore is not None:
        vectorstore.delete_collection()
    text_splitter = SemanticChunker(embeddings)
    docs = text_splitter.create_documents([cleaned_text])
    vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)
    if llm is None:  # The chat-model client is stateless, so create it only once.
        llm = ChatOpenAI(api_key=openai_api_key, temperature=0.5, model="gpt-4o", verbose=True)
    retriever = MultiQueryRetriever.from_llm(retriever=vectorstore.as_retriever(), llm=llm)
    # Fresh memory per PDF so the conversation about the old document doesn't leak in.
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    qa_instance = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)


def setup_qa_system(pdf_file):
    global chat_history
    if pdf_file is None:
        return [(None, "Please upload a PDF file.")]
    extracted_text = extract_text_from_pdf(pdf_file)
    cleaned_text = clean_text(extracted_text)
    initialize_chatbot(cleaned_text, openai_api_key)
    # Status messages go on the bot side of the (user, bot) tuple.
    chat_history = [(None, "Chatbot initialized. Please ask a question.")]
    return chat_history


def answer_query(question):
    global chat_history
    if qa_instance is None:
        return [(None, "Please upload a PDF and initialize the system first.")]
    if not question.strip():
        return chat_history + [(None, "Please enter a question.")]
    result = qa_instance.invoke({"question": question})
    chat_history.append((question, result["answer"]))
    return chat_history


with gr.Blocks() as demo:
    upload = gr.File(label="Upload PDF", type="binary", file_types=[".pdf"])
    chatbot = gr.Chatbot(label="Chatbot")
    question = gr.Textbox(label="Ask a question", placeholder="Type your question after uploading a PDF...")

    upload.change(setup_qa_system, inputs=[upload], outputs=[chatbot])
    question.submit(answer_query, inputs=[question], outputs=[chatbot])

if __name__ == "__main__":
    demo.launch()
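# A minimal way to run this script, assuming it is saved as app.py (the
# filename is illustrative; nothing in the code pins it):
#
#   pip install gradio pymupdf langchain langchain-openai langchain-chroma langchain-experimental
#   export OPENAI_API_KEY="sk-..."
#   python app.py
#
# Gradio then serves the app locally (http://127.0.0.1:7860 by default).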