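# ---------------------------------------------------------------------------
# Earlier version, kept commented out for reference: it answered questions
# over a single hard-coded PDF ("iess402.pdf") using the legacy `langchain`
# imports and a completion-style model. The active implementation below
# accepts an uploaded PDF instead.
# ---------------------------------------------------------------------------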
# import gradio as gr
# import fitz  # PyMuPDF
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.schema import Document
# from langchain_community.vectorstores import Chroma
# from langchain.embeddings import OpenAIEmbeddings
# from langchain.llms import OpenAI
# from langchain.prompts import PromptTemplate
# from langchain.memory import ConversationBufferMemory
# from langchain.chains import ConversationalRetrievalChain
# import os

# def extract_text_from_pdf(pdf_path):
#     doc = fitz.open(pdf_path)
#     text = ""
#     for page_num in range(len(doc)):
#         page = doc.load_page(page_num)
#         text += page.get_text()
#     return text

# # Load the text from the PDF and preprocess
# openai_api_key = os.getenv("OPENAI_API_KEY")
# pdf_path = "iess402.pdf"  # Path to your PDF file
# pdf_text = extract_text_from_pdf(pdf_path)
# document = Document(page_content=pdf_text, metadata={})
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=25)
# all_splits = text_splitter.split_documents([document])

# # Create vector store and setup the QA chain
# vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings(api_key=openai_api_key))
# llm = OpenAI(api_key=openai_api_key, temperature=0, model="gpt-3.5-turbo-instruct", verbose=True)
# template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
# {context}
# Question: {question}
# Helpful Answer:"""
# QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"], template=template)

# # Setup conversational retrieval chain with memory
# memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
# retriever = vectorstore.as_retriever()
# qa = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)

# # Define the function to ask questions and get answers
# def ask_question(question):
#     result = qa.invoke({"question": question})
#     return result['answer']

# # Create the Gradio interface
# iface = gr.Interface(fn=ask_question, inputs="text", outputs="text", title="PDF QA System", description="Ask questions based on the Textbook in Political Science for Class IX, chapter 2.")

# # Launch the Gradio interface
# iface.launch()


import gradio as gr
import fitz  # PyMuPDF
import re
import os

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_experimental.text_splitter import SemanticChunker

# Assumes OPENAI_API_KEY is set in the environment; getenv returns None otherwise.
openai_api_key = os.getenv("OPENAI_API_KEY")

def extract_text_from_pdf(pdf_file):
    # Concatenate the plain text of every page in the uploaded PDF.
    document = fitz.open(pdf_file)
    text = ""
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        text += page.get_text()
    document.close()
    return text

def clean_text(text):
    # Collapse all runs of whitespace (including newlines) to single spaces.
    cleaned_text = re.sub(r'\s+', ' ', text)
    # Collapse any character repeated three or more times to one occurrence.
    # Note: this is aggressive and also rewrites legitimate runs such as "..." or "www".
    cleaned_text = re.sub(r'(.)\1{2,}', r'\1', cleaned_text)
    # De-duplicate consecutive repeated words ("the the" -> "the").
    cleaned_text = re.sub(r'\b(\w+)\b(?:\s+\1\b)+', r'\1', cleaned_text)
    return cleaned_text.strip()
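
# For example: clean_text("Hello   Hello\n\nworld...") -> "Hello world."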


def initialize_chatbot(cleaned_text, openai_api_key):
    embeddings = OpenAIEmbeddings(api_key=openai_api_key)
    # Split on semantic boundaries (embedding-similarity breakpoints) rather
    # than a fixed character count, then index the chunks in Chroma.
    text_splitter = SemanticChunker(embeddings)
    docs = text_splitter.create_documents([cleaned_text])
    vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)

    llm = ChatOpenAI(api_key=openai_api_key, temperature=0.5, model="gpt-4o", verbose=True)
    # MultiQueryRetriever has the LLM rephrase each question several ways and
    # merges the retrieved chunks, improving recall over a single query.
    retriever = MultiQueryRetriever.from_llm(retriever=vectorstore.as_retriever(), llm=llm)
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    qa = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)
    return qa

def answer_query(pdf_file, question):
    # Note: this re-extracts, re-embeds, and rebuilds the chain on every
    # question, so the ConversationBufferMemory starts empty each turn and
    # earlier exchanges never reach the chain (see the caching sketch below).
    extracted_text = extract_text_from_pdf(pdf_file)
    cleaned_text = clean_text(extracted_text)
    qa = initialize_chatbot(cleaned_text, openai_api_key)
    result = qa.invoke({"question": question})
    return result['answer']
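
# A minimal caching sketch, not part of the original flow: `_chain_cache` and
# `answer_query_cached` are hypothetical names. Reusing one chain per PDF path
# keeps the vector store and conversation memory alive across questions.
_chain_cache = {}

def answer_query_cached(pdf_file, question):
    # Build the chain once per PDF and reuse it on later turns.
    if pdf_file not in _chain_cache:
        cleaned = clean_text(extract_text_from_pdf(pdf_file))
        _chain_cache[pdf_file] = initialize_chatbot(cleaned, openai_api_key)
    result = _chain_cache[pdf_file].invoke({"question": question})
    return result['answer']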

def process_pdf_and_question(pdf_file, question, chat_history):
    # Guard clauses: surface guidance as the bot's reply instead of erroring.
    if pdf_file is None:
        return chat_history + [(question, "Please upload a PDF file.")]
    if not question.strip():
        return chat_history + [(question, "Please enter a question.")]

    answer = answer_query(pdf_file, question)
    chat_history.append((question, answer))
    return chat_history

with gr.Blocks() as demo:
    upload = gr.File(label="Upload PDF")
    chatbot = gr.Chatbot(label="Chat History")
    question = gr.Textbox(label="Ask a question")
    # Pressing Enter in the textbox runs the pipeline and refreshes the chat.
    question.submit(process_pdf_and_question, inputs=[upload, question, chatbot], outputs=[chatbot])

if __name__ == "__main__":
    demo.launch()
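
# To try it locally (assuming this file is saved as app.py):
#   export OPENAI_API_KEY=sk-...
#   python app.py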