# chat-bot / app.py
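# --- Previous revision, kept below for reference: it built the vector store
# --- from a user-uploaded PDF at runtime instead of precomputed embeddings.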
# import gradio as gr
# import fitz  # PyMuPDF
# import re
# from langchain_openai.embeddings import OpenAIEmbeddings
# from langchain_chroma import Chroma
# from langchain.retrievers.multi_query import MultiQueryRetriever
# from langchain.chains import ConversationalRetrievalChain
# from langchain.memory import ConversationBufferMemory
# from langchain_openai import ChatOpenAI
# from langchain_experimental.text_splitter import SemanticChunker
# import os
#
# openai_api_key = os.getenv("OPENAI_API_KEY")
#
# vectorstore = None
# llm = None
# qa_instance = None
# chat_history = []  # Global chat history
#
# def extract_text_from_pdf(pdf_bytes):
#     document = fitz.open("pdf", pdf_bytes)
#     text = ""
#     for page_num in range(len(document)):
#         page = document.load_page(page_num)
#         text += page.get_text()
#     document.close()
#     return text
#
# def clean_text(text):
#     cleaned_text = re.sub(r'\s+', ' ', text)
#     cleaned_text = re.sub(r'(.)\1{2,}', r'\1', cleaned_text)
#     cleaned_text = re.sub(r'\b(\w+)\b(?:\s+\1\b)+', r'\1', cleaned_text)
#     return cleaned_text.strip()
#
# def initialize_chatbot(cleaned_text, openai_api_key):
#     global vectorstore, llm, qa_instance
#     if vectorstore is None:  # Only create embeddings and Chroma once
#         embeddings = OpenAIEmbeddings(api_key=openai_api_key)
#         text_splitter = SemanticChunker(embeddings)
#         docs = text_splitter.create_documents([cleaned_text])
#         vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)
#     if llm is None:
#         llm = ChatOpenAI(api_key=openai_api_key, temperature=0.5, model="gpt-4o", verbose=True)
#     retriever = MultiQueryRetriever.from_llm(retriever=vectorstore.as_retriever(), llm=llm)
#     memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
#     qa_instance = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)
#
# def setup_qa_system(pdf_file):
#     global chat_history
#     if pdf_file is None:
#         return [("Please upload a PDF file.", "")]
#     extracted_text = extract_text_from_pdf(pdf_file)
#     cleaned_text = clean_text(extracted_text)
#     initialize_chatbot(cleaned_text, openai_api_key)
#     chat_history = [("Chatbot initialized. Please ask a question.", "")]
#     return chat_history
#
# def answer_query(question):
#     global chat_history
#     if qa_instance is None:
#         return [("Please upload a PDF and initialize the system first.", "")]
#     if not question.strip():
#         return [("Please enter a question.", "")]
#     result = qa_instance({"question": question})
#     chat_history.append((question, result['answer']))
#     return chat_history
#
# with gr.Blocks() as demo:
#     upload = gr.File(label="Upload PDF", type="binary", file_types=["pdf"])
#     chatbot = gr.Chatbot(label="Chatbot")
#     question = gr.Textbox(label="Ask a question", placeholder="Type your question after uploading PDF...")
#     upload.change(setup_qa_system, inputs=[upload], outputs=[chatbot])
#     question.submit(answer_query, inputs=[question], outputs=[chatbot])
#
# if __name__ == "__main__":
#     demo.launch()
import gradio as gr
import json
import os
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_openai import ChatOpenAI

# Read the key from the environment; never commit a real API key to source.
openai_api_key = os.getenv("OPENAI_API_KEY")

vectorstore = None
llm = None
qa_instance = None
chat_history = []  # Global chat history shown in the Gradio Chatbot
def load_embeddings_from_json(json_file_path: str):
    """Load precomputed chunks, embedding vectors, and ids from a JSON file."""
    with open(json_file_path, 'r') as f:
        data = json.load(f)
    chunks = [item['chunk'] for item in data]
    embeddings = [item['embeddings'] for item in data]
    # Fall back to the list index when an entry carries no explicit id.
    ids = [item.get('id', str(index)) for index, item in enumerate(data)]
    return chunks, embeddings, ids
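
# Expected embeddings.json layout, inferred from the loader above (the vector
# length depends on whichever embedding model was used to build the file):
# [
#   {"id": "0", "chunk": "first chunk of text...", "embeddings": [0.012, -0.034, ...]},
#   {"id": "1", "chunk": "second chunk of text...", "embeddings": [...]},
#   ...
# ]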
def initialize_chatbot_from_json(json_file_path: str, openai_api_key: str):
    global vectorstore, llm, qa_instance
    if vectorstore is None:  # Build the vector store only once per process
        chunks, embeddings, ids = load_embeddings_from_json(json_file_path)
        vectorstore = Chroma(
            collection_name="my_collection",
            persist_directory=None,  # In-memory collection; nothing is persisted
            embedding_function=OpenAIEmbeddings(api_key=openai_api_key),
        )
        # Insert the precomputed vectors directly so nothing is re-embedded.
        # Note: _client._add is a private chromadb API and may break across versions.
        vectorstore._client._add(
            collection_id=vectorstore._collection.id,
            ids=ids,
            embeddings=embeddings,
            metadatas=[{"source": "json"} for _ in chunks],
            documents=chunks,
        )
    if llm is None:
        llm = ChatOpenAI(api_key=openai_api_key, temperature=0.5, model="gpt-4o", verbose=True)
    # MultiQueryRetriever rephrases each question several ways to improve recall.
    retriever = MultiQueryRetriever.from_llm(retriever=vectorstore.as_retriever(), llm=llm)
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    qa_instance = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)
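
# A possible alternative to the private _client._add call above: a sketch using
# chromadb's public Collection.add API plus Chroma's `client` parameter. This
# helper is hypothetical (not part of the original app) and is never called.
def add_precomputed_embeddings(chunks, embeddings, ids, openai_api_key):
    import chromadb
    client = chromadb.Client()  # Ephemeral in-memory client
    collection = client.get_or_create_collection("my_collection")
    # Collection.add accepts precomputed vectors, so nothing is re-embedded.
    collection.add(
        ids=ids,
        embeddings=embeddings,
        documents=chunks,
        metadatas=[{"source": "json"} for _ in chunks],
    )
    # Wrap the populated collection so LangChain retrievers can query it; the
    # embedding_function is still needed to embed incoming questions.
    return Chroma(
        client=client,
        collection_name="my_collection",
        embedding_function=OpenAIEmbeddings(api_key=openai_api_key),
    )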
def answer_query(question: str):
    global chat_history
    if qa_instance is None:
        return [("Please initialize the system first.", "")]
    if not question.strip():
        return [("Please enter a question.", "")]
    # Chain.invoke is the current entry point; calling the chain directly,
    # as in qa_instance({...}), is deprecated in recent LangChain releases.
    result = qa_instance.invoke({"question": question})
    chat_history.append((question, result['answer']))
    return chat_history
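
# A minimal offline sketch for producing the embeddings.json file this app
# loads. build_embeddings_json is a hypothetical helper, not part of the
# original app; it is defined for reference only and never called here.
def build_embeddings_json(chunks, api_key, out_path="embeddings.json"):
    # Embed the chunks with the same model family the app queries with, so
    # stored vectors and question vectors share one embedding space.
    embedder = OpenAIEmbeddings(api_key=api_key)
    vectors = embedder.embed_documents(chunks)
    data = [
        {"id": str(i), "chunk": chunk, "embeddings": vector}
        for i, (chunk, vector) in enumerate(zip(chunks, vectors))
    ]
    with open(out_path, "w") as f:
        json.dump(data, f)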
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(label="Chatbot")
    question = gr.Textbox(label="Ask a question", placeholder="Type your question...")
    question.submit(answer_query, inputs=[question], outputs=[chatbot])
    # Build the vector store and QA chain once at startup.
    initialize_chatbot_from_json("embeddings.json", openai_api_key)

if __name__ == "__main__":
    demo.launch()