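"""Gradio app: upload a PDF and chat with it using a LangChain
ConversationalRetrievalChain over a Chroma vector store."""
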
import os
import re

import fitz  # PyMuPDF
import gradio as gr
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_chroma import Chroma
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings

openai_api_key = os.getenv("OPENAI_API_KEY")

# Module-level state shared across Gradio callbacks.
vectorstore = None
llm = None
qa_instance = None
chat_history = []  # running (question, answer) history shown in the Chatbot

def extract_text_from_pdf(pdf_bytes):
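    """Extract plain text from every page of a PDF given as raw bytes."""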
    text = ""
    with fitz.open(stream=pdf_bytes, filetype="pdf") as document:
        for page in document:
            text += page.get_text()
    return text

def clean_text(text):
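    """Normalize whitespace and strip simple OCR/extraction artifacts."""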
    cleaned_text = re.sub(r'\s+', ' ', text)  # collapse whitespace runs to single spaces
    # Collapse characters repeated 3+ times (aggressive: also shortens e.g. "1000" to "10").
    cleaned_text = re.sub(r'(.)\1{2,}', r'\1', cleaned_text)
    # Drop consecutive duplicate words ("the the" -> "the").
    cleaned_text = re.sub(r'\b(\w+)\b(?:\s+\1\b)+', r'\1', cleaned_text)
    return cleaned_text.strip()

def initialize_chatbot(cleaned_text, openai_api_key):
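    """Build (or reuse) the vector store, LLM, and conversational QA chain."""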
    global vectorstore, llm, qa_instance
    if vectorstore is None:  # embed only the first uploaded PDF; later uploads reuse this store
        embeddings = OpenAIEmbeddings(api_key=openai_api_key)
        text_splitter = SemanticChunker(embeddings)  # split on semantic similarity, not fixed size
        docs = text_splitter.create_documents([cleaned_text])
        vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)
    if llm is None:
        llm = ChatOpenAI(api_key=openai_api_key, temperature=0.5, model="gpt-4o", verbose=True)
    # MultiQueryRetriever asks the LLM to rephrase each question into several
    # queries and merges the retrieved chunks, improving recall.
    retriever = MultiQueryRetriever.from_llm(retriever=vectorstore.as_retriever(), llm=llm)
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    qa_instance = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)

def setup_qa_system(pdf_file):
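    """Gradio callback: index the uploaded PDF and reset the chat history."""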
    global chat_history
    if pdf_file is None:
        return [(None, "Please upload a PDF file.")]
    extracted_text = extract_text_from_pdf(pdf_file)  # bytes, since gr.File uses type="binary"
    cleaned_text = clean_text(extracted_text)
    initialize_chatbot(cleaned_text, openai_api_key)
    # Status messages go in the bot slot of the (user, bot) tuple so they
    # render on the assistant side of the chat window.
    chat_history = [(None, "Chatbot initialized. Please ask a question.")]
    return chat_history

def answer_query(question):
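    """Gradio callback: answer a question against the indexed PDF."""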
    global chat_history
    if qa_instance is None:
        return [(None, "Please upload a PDF and initialize the system first.")]
    if not question.strip():
        return [(None, "Please enter a question.")]
    result = qa_instance.invoke({"question": question})  # calling the chain directly is deprecated
    chat_history.append((question, result['answer']))
    return chat_history

# Minimal Gradio UI: a PDF uploader, a chat window, and a question box.
with gr.Blocks() as demo:
    upload = gr.File(label="Upload PDF", type="binary", file_types=[".pdf"])
    chatbot = gr.Chatbot(label="Chatbot")
    question = gr.Textbox(label="Ask a question", placeholder="Type your question after uploading a PDF...")

    # Re-index whenever a new file is uploaded; answer on Enter in the textbox.
    upload.change(setup_qa_system, inputs=[upload], outputs=[chatbot])
    question.submit(answer_query, inputs=[question], outputs=[chatbot])

if __name__ == "__main__":
    demo.launch()