Spaces:
Running
Running
# import gradio as gr | |
# import fitz # PyMuPDF | |
# import re | |
# from langchain_openai.embeddings import OpenAIEmbeddings | |
# from langchain_chroma import Chroma | |
# from langchain.retrievers.multi_query import MultiQueryRetriever | |
# from langchain.chains import ConversationalRetrievalChain | |
# from langchain.memory import ConversationBufferMemory | |
# from langchain_openai import ChatOpenAI | |
# from langchain_experimental.text_splitter import SemanticChunker | |
# import os | |
# openai_api_key = os.getenv("OPENAI_API_KEY") | |
# vectorstore = None | |
# llm = None | |
# qa_instance = None | |
# chat_history = [] # Global chat history | |
# def extract_text_from_pdf(pdf_bytes): | |
# document = fitz.open("pdf", pdf_bytes) | |
# text = "" | |
# for page_num in range(len(document)): | |
# page = document.load_page(page_num) | |
# text += page.get_text() | |
# document.close() | |
# return text | |
# def clean_text(text): | |
# cleaned_text = re.sub(r'\s+', ' ', text) | |
# cleaned_text = re.sub(r'(.)\1{2,}', r'\1', cleaned_text) | |
# cleaned_text = re.sub(r'\b(\w+)\b(?:\s+\1\b)+', r'\1', cleaned_text) | |
# return cleaned_text.strip() | |
# def initialize_chatbot(cleaned_text, openai_api_key): | |
# global vectorstore, llm, qa_instance | |
# if vectorstore is None: # Only create embeddings and Chroma once | |
# embeddings = OpenAIEmbeddings(api_key=openai_api_key) | |
# text_splitter = SemanticChunker(embeddings) | |
# docs = text_splitter.create_documents([cleaned_text]) | |
# vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings) | |
# if llm is None: | |
# llm = ChatOpenAI(api_key=openai_api_key, temperature=0.5, model="gpt-4o", verbose=True) | |
# retriever = MultiQueryRetriever.from_llm(retriever=vectorstore.as_retriever(), llm=llm) | |
# memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True) | |
# qa_instance = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory) | |
# def setup_qa_system(pdf_file): | |
# global chat_history | |
# if pdf_file is None: | |
# return [("Please upload a PDF file.", "")] | |
# extracted_text = extract_text_from_pdf(pdf_file) | |
# cleaned_text = clean_text(extracted_text) | |
# initialize_chatbot(cleaned_text, openai_api_key) | |
# chat_history = [("Chatbot initialized. Please ask a question.", "")] | |
# return chat_history | |
# def answer_query(question): | |
# global chat_history | |
# if qa_instance is None: | |
# return [("Please upload a PDF and initialize the system first.", "")] | |
# if not question.strip(): | |
# return [("Please enter a question.", "")] | |
# result = qa_instance({"question": question}) | |
# chat_history.append((question, result['answer'])) | |
# return chat_history | |
# with gr.Blocks() as demo: | |
# upload = gr.File(label="Upload PDF", type="binary", file_types=["pdf"]) | |
# chatbot = gr.Chatbot(label="Chatbot") | |
# question = gr.Textbox(label="Ask a question", placeholder="Type your question after uploading PDF...") | |
# upload.change(setup_qa_system, inputs=[upload], outputs=[chatbot]) | |
# question.submit(answer_query, inputs=[question], outputs=[chatbot]) | |
# if __name__ == "__main__": | |
# demo.launch() | |
import json
import os
from typing import List, Dict

import gradio as gr
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.schema import Document
from langchain_chroma import Chroma
from langchain_openai import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
# Read the OpenAI key from the environment instead of embedding it in source.
# SECURITY: a literal secret key used to be committed on this line. Treat that
# key as compromised and revoke it; configure OPENAI_API_KEY as an environment
# variable / deployment secret instead. The value is None when it is unset.
openai_api_key = os.getenv("OPENAI_API_KEY")

# Module-level singletons, populated by initialize_chatbot_from_json().
vectorstore = None  # Chroma store holding the precomputed chunk embeddings
llm = None  # ChatOpenAI client
qa_instance = None  # ConversationalRetrievalChain used by answer_query()

# (question, answer) pairs returned to the Gradio Chatbot component. This is
# module-level state, so every caller of answer_query() shares the same list.
chat_history = []
def load_embeddings_from_json(json_file_path: str) -> tuple:
    """Load precomputed chunks, embedding vectors and ids from a JSON file.

    The file must contain a JSON array of objects. Each object needs a
    ``"chunk"`` key (the text) and an ``"embeddings"`` key (its vector). An
    optional ``"id"`` key names the record; when it is missing or null, the
    record's position in the array is used instead.

    Args:
        json_file_path: Path to the JSON file to read.

    Returns:
        A ``(chunks, embeddings, ids)`` tuple of three parallel lists. Ids are
        always returned as strings, whatever their JSON type was.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a record lacks ``"chunk"`` or ``"embeddings"``.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding,
    # which would garble or reject non-ASCII chunks on some systems.
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    chunks = []
    embeddings = []
    ids = []
    for index, item in enumerate(data):
        chunks.append(item["chunk"])
        embeddings.append(item["embeddings"])
        record_id = item.get("id")
        # Normalise to str so numeric or null ids in the file cannot reach the
        # vector store, which is handed these ids as record identifiers.
        ids.append(str(index if record_id is None else record_id))
    return chunks, embeddings, ids
def initialize_chatbot_from_json(json_file_path: str, openai_api_key: str):
    """Build the module-level vector store, LLM and QA chain from a JSON dump.

    On the first call the precomputed chunks and embeddings are loaded with
    load_embeddings_from_json() and inserted into a new Chroma collection
    created with ``persist_directory=None``. The vector store and the LLM are
    created only while their globals are still ``None``; the retriever,
    conversation memory and ``qa_instance`` are rebuilt on every call.

    Args:
        json_file_path: Path to the JSON file of chunks and embeddings.
        openai_api_key: API key passed to the OpenAI embedding and chat
            clients.

    Returns:
        None. The results are stored in the ``vectorstore``, ``llm`` and
        ``qa_instance`` module globals.
    """
    global vectorstore, llm, qa_instance

    if vectorstore is None:
        chunks, embeddings, ids = load_embeddings_from_json(json_file_path)
        store = Chroma(
            collection_name="my_collection",
            persist_directory=None,
            embedding_function=OpenAIEmbeddings(api_key=openai_api_key),
        )
        # Insert through the collection's public add() rather than the
        # client's private _add(), whose signature is an internal detail of
        # chromadb. The vectors are supplied explicitly, so nothing is
        # re-embedded here.
        store._collection.add(
            ids=ids,
            embeddings=embeddings,
            metadatas=[{"source": "json"} for _ in chunks],
            documents=chunks,
        )
        # Publish the store only once it is populated: if loading or adding
        # raises, the global stays None and a later call can retry instead of
        # silently reusing an empty collection.
        vectorstore = store

    if llm is None:
        llm = ChatOpenAI(
            api_key=openai_api_key,
            temperature=0.5,
            model="gpt-4o",
            verbose=True,
        )

    retriever = MultiQueryRetriever.from_llm(
        retriever=vectorstore.as_retriever(),
        llm=llm,
    )
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
    )
    qa_instance = ConversationalRetrievalChain.from_llm(
        llm,
        retriever=retriever,
        memory=memory,
    )
def answer_query(question: str):
    """Answer a user question and return the chat history for display.

    Args:
        question: The text submitted from the question textbox.

    Returns:
        A list of ``(user_text, bot_text)`` tuples for the Chatbot component.
        If the QA chain has not been initialised, or the question is blank, a
        one-item list carrying a hint message is returned and the stored
        history is left unchanged. Otherwise the new ``(question, answer)``
        pair is appended to the module-level ``chat_history`` and that list
        is returned.
    """
    if qa_instance is None:
        return [("Please initialize the system first.", "")]
    if not question.strip():
        return [("Please enter a question.", "")]

    # Calling the chain object directly goes through the deprecated
    # Chain.__call__; invoke() is the supported entry point and returns the
    # same output mapping, including the "answer" key.
    result = qa_instance.invoke({"question": question})
    chat_history.append((question, result["answer"]))
    return chat_history
# Gradio UI: a chat transcript plus a single-line question box.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(label="Chatbot")
    question = gr.Textbox(
        label="Ask a question",
        placeholder="Type your question...",
    )
    # Submitting the textbox passes its value to answer_query and renders the
    # returned list of (user, bot) pairs in the chatbot component.
    question.submit(
        answer_query,
        inputs=[question],
        outputs=[chatbot],
    )

# Runs whenever this module is executed or imported: builds the vector store,
# LLM and QA chain from "embeddings.json", resolved against the current
# working directory.
initialize_chatbot_from_json("embeddings.json", openai_api_key)

if __name__ == "__main__":
    demo.launch()