Spaces:
Sleeping
Sleeping
File size: 3,016 Bytes
7c56890 bfaa73f 7c56890 bfaa73f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
import os
import re

import fitz  # PyMuPDF
import gradio as gr
from huggingface_hub import InferenceClient
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_chroma import Chroma
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
# The OpenAI API key is read from the OPENAI_API_KEY environment variable so
# the secret never has to be committed with the source. The original
# placeholder is kept as the fallback when the variable is unset or empty.
openai_api_key = os.environ.get("OPENAI_API_KEY") or "YOUR_OPENAI_API_KEY_HERE"

# Module-level state shared by the functions in this file.
vectorstore = None  # vector index built from the uploaded PDF's text
llm = None  # chat model instance, created on first initialization
qa_instance = None  # retrieval chain used to answer questions
chat_history = []  # (question, answer) tuples shown in the chat widget
def extract_text_from_pdf(pdf_bytes):
    """Return the concatenated plain text of every page of a PDF.

    Args:
        pdf_bytes: Raw contents of a PDF file, passed to PyMuPDF as an
            in-memory stream.

    Returns:
        The text of all pages in page order, joined without a separator.

    Raises:
        Whatever PyMuPDF raises when the data cannot be opened or read.
    """
    # The context manager closes the document even when reading a page fails,
    # which the previous explicit close() call did not guarantee.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as document:
        return "".join(page.get_text() for page in document)
# Three or more identical punctuation marks or underscores in a row
# (dot leaders, rules, "!!!"). Letters and digits are deliberately excluded:
# collapsing them would corrupt real content such as "1000" or "www".
_REPEATED_FILLER = re.compile(r'([^\w\s]|_)\1{2,}')

# The same word repeated back to back, separated only by whitespace.
_REPEATED_WORD = re.compile(r'\b(\w+)\b(?:\s+\1\b)+')


def clean_text(text):
    """Normalize text extracted from a PDF before it is chunked and embedded.

    The steps run in this order:

    1. Every run of whitespace (including newlines) becomes a single space.
    2. Runs of three or more identical punctuation characters or underscores
       are reduced to one occurrence.
    3. A word immediately repeated one or more times is reduced to a single
       occurrence (case-sensitive).

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    cleaned_text = re.sub(r'\s+', ' ', text)
    cleaned_text = _REPEATED_FILLER.sub(r'\1', cleaned_text)
    cleaned_text = _REPEATED_WORD.sub(r'\1', cleaned_text)
    return cleaned_text.strip()
def initialize_chatbot(cleaned_text):
    """Build the retrieval chain for a document and store it in module state.

    The text is split with ``SemanticChunker``, embedded with OpenAI
    embeddings and indexed in a Chroma vector store. A
    ``ConversationalRetrievalChain`` with a fresh conversation memory is then
    assigned to the global ``qa_instance``.

    The index is rebuilt on every call. Previously it was only built while
    ``vectorstore`` was ``None``, so a second uploaded document was never
    indexed and answers kept coming from the first one.

    Args:
        cleaned_text: The full document text to index.
    """
    global vectorstore, llm, qa_instance

    embeddings = OpenAIEmbeddings(api_key=openai_api_key)
    # Chunk before touching the existing index so a failure here leaves the
    # previous state usable.
    docs = SemanticChunker(embeddings).create_documents([cleaned_text])

    if vectorstore is not None:
        # Remove the previous document's collection so its chunks cannot be
        # retrieved alongside the new document's.
        vectorstore.delete_collection()
        vectorstore = None
    vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)

    # The chat model does not depend on the document, so it is created once.
    if llm is None:
        llm = ChatOpenAI(api_key=openai_api_key, temperature=0.5, model="gpt-4o", verbose=True)

    retriever = MultiQueryRetriever.from_llm(retriever=vectorstore.as_retriever(), llm=llm)
    # A new memory per document: earlier conversation is not carried over.
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    qa_instance = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)
def setup_qa_system(pdf_file):
    """Handle a PDF upload: extract its text and initialize the chatbot.

    Args:
        pdf_file: The uploaded PDF's raw bytes, or ``None`` when no file is
            present.

    Returns:
        A list with one ``(message, "")`` tuple for the chat widget,
        reporting either success or why nothing was initialized.
    """
    if pdf_file is None:
        return [("Please upload a PDF file.", "")]

    cleaned_text = clean_text(extract_text_from_pdf(pdf_file))
    if not cleaned_text:
        # For example a scanned PDF that contains only images: there is
        # nothing to index.
        return [("No text could be extracted from this PDF.", "")]

    initialize_chatbot(cleaned_text)

    # Reset the shared transcript in place. The original assigned to a local
    # variable of the same name, so questions about a previous document
    # stayed in the module-level history.
    chat_history.clear()
    return [("Chatbot initialized. Please ask a question.", "")]
def answer_query(question):
    """Answer a question about the uploaded PDF and return the transcript.

    Args:
        question: The text entered by the user.

    Returns:
        A list of ``(user_text, bot_text)`` tuples for the chat widget. On
        success this is the module-level ``chat_history`` with the new
        question and answer appended.
    """
    if qa_instance is None:
        return [("Please upload a PDF and initialize the system first.", "")]

    if not question.strip():
        # Show the notice after the existing transcript instead of replacing
        # it, and do not store the notice in the history.
        return chat_history + [("Please enter a question.", "")]

    # invoke() replaces the deprecated direct call on the chain object.
    result = qa_instance.invoke({"question": question})
    chat_history.append((question, result["answer"]))
    return chat_history
# Gradio UI: a PDF upload, the chat transcript and a question box.
with gr.Blocks() as demo:
    # File extensions need a leading dot; a bare "pdf" is interpreted as a
    # file category rather than an extension.
    upload = gr.File(label="Upload PDF", type="binary", file_types=[".pdf"])
    chatbot = gr.Chatbot(label="Chatbot")
    question = gr.Textbox(label="Ask a question", placeholder="Type your question after uploading PDF...")

    # Uploading (or clearing) a file re-initializes the QA system.
    upload.change(setup_qa_system, inputs=[upload], outputs=[chatbot])
    # Pressing Enter in the textbox sends the question.
    question.submit(answer_query, inputs=[question], outputs=[chatbot])

demo.launch()
|