# Legacy version: built the vector store at runtime from an uploaded PDF.
# Kept commented out for reference; superseded by the JSON-based loader below.
#
# import gradio as gr
# import fitz  # PyMuPDF
# import re
# import os
#
# from langchain_openai.embeddings import OpenAIEmbeddings
# from langchain_chroma import Chroma
# from langchain.retrievers.multi_query import MultiQueryRetriever
# from langchain.chains import ConversationalRetrievalChain
# from langchain.memory import ConversationBufferMemory
# from langchain_openai import ChatOpenAI
# from langchain_experimental.text_splitter import SemanticChunker
#
# openai_api_key = os.getenv("OPENAI_API_KEY")
#
# vectorstore = None
# llm = None
# qa_instance = None
# chat_history = []  # Global chat history
#
# def extract_text_from_pdf(pdf_bytes):
#     document = fitz.open("pdf", pdf_bytes)
#     text = ""
#     for page_num in range(len(document)):
#         page = document.load_page(page_num)
#         text += page.get_text()
#     document.close()
#     return text
#
# def clean_text(text):
#     cleaned_text = re.sub(r'\s+', ' ', text)
#     cleaned_text = re.sub(r'(.)\1{2,}', r'\1', cleaned_text)
#     cleaned_text = re.sub(r'\b(\w+)\b(?:\s+\1\b)+', r'\1', cleaned_text)
#     return cleaned_text.strip()
#
# def initialize_chatbot(cleaned_text, openai_api_key):
#     global vectorstore, llm, qa_instance
#     if vectorstore is None:  # Only create embeddings and Chroma once
#         embeddings = OpenAIEmbeddings(api_key=openai_api_key)
#         text_splitter = SemanticChunker(embeddings)
#         docs = text_splitter.create_documents([cleaned_text])
#         vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)
#     if llm is None:
#         llm = ChatOpenAI(api_key=openai_api_key, temperature=0.5, model="gpt-4o", verbose=True)
#     retriever = MultiQueryRetriever.from_llm(retriever=vectorstore.as_retriever(), llm=llm)
#     memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
#     qa_instance = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)
#
# def setup_qa_system(pdf_file):
#     global chat_history
#     if pdf_file is None:
#         return [("Please upload a PDF file.", "")]
#     extracted_text = extract_text_from_pdf(pdf_file)
#     cleaned_text = clean_text(extracted_text)
#     initialize_chatbot(cleaned_text, openai_api_key)
#     chat_history = [("Chatbot initialized. Please ask a question.", "")]
#     return chat_history
#
# def answer_query(question):
#     global chat_history
#     if qa_instance is None:
#         return [("Please upload a PDF and initialize the system first.", "")]
#     if not question.strip():
#         return [("Please enter a question.", "")]
#     result = qa_instance({"question": question})
#     chat_history.append((question, result['answer']))
#     return chat_history
#
# with gr.Blocks() as demo:
#     upload = gr.File(label="Upload PDF", type="binary", file_types=["pdf"])
#     chatbot = gr.Chatbot(label="Chatbot")
#     question = gr.Textbox(label="Ask a question", placeholder="Type your question after uploading PDF...")
#     upload.change(setup_qa_system, inputs=[upload], outputs=[chatbot])
#     question.submit(answer_query, inputs=[question], outputs=[chatbot])
#
# if __name__ == "__main__":
#     demo.launch()

# ---------------------------------------------------------------------------
# Active implementation: instead of chunking an uploaded PDF at request time,
# this version seeds a Chroma collection with precomputed chunks and embedding
# vectors loaded from a JSON file, then serves the same chat UI.
# ---------------------------------------------------------------------------

import gradio as gr
import json
import os

from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_openai import ChatOpenAI

# Read the API key from the environment; never hardcode secrets in source.
openai_api_key = os.getenv("OPENAI_API_KEY")

vectorstore = None
llm = None
qa_instance = None
chat_history = []  # Global chat history


def load_embeddings_from_json(json_file_path: str):
    """Load precomputed chunks, embedding vectors, and ids from a JSON file."""
    with open(json_file_path, 'r') as f:
        data = json.load(f)
    chunks = [item['chunk'] for item in data]
    embeddings = [item['embeddings'] for item in data]
    # Fall back to the list index when an item carries no explicit id.
    ids = [item.get('id', str(index)) for index, item in enumerate(data)]
    return chunks, embeddings, ids


def initialize_chatbot_from_json(json_file_path: str, openai_api_key: str):
    global vectorstore, llm, qa_instance
    if vectorstore is None:  # Build the vector store only once
        chunks, embeddings, ids = load_embeddings_from_json(json_file_path)
        vectorstore = Chroma(
            collection_name="my_collection",
            persist_directory=None,
            embedding_function=OpenAIEmbeddings(api_key=openai_api_key),
        )
        # Write through Chroma's private client so the precomputed vectors are
        # stored as-is instead of being re-embedded. These underscore-prefixed
        # attributes are internals and may change between releases.
        vectorstore._client._add(
            collection_id=vectorstore._collection.id,
            ids=ids,
            embeddings=embeddings,
            metadatas=[{"source": "json"} for _ in chunks],
            documents=chunks,
        )
    if llm is None:
        llm = ChatOpenAI(api_key=openai_api_key, temperature=0.5, model="gpt-4o", verbose=True)
    retriever = MultiQueryRetriever.from_llm(retriever=vectorstore.as_retriever(), llm=llm)
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    qa_instance = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)


def answer_query(question: str):
    global chat_history
    if qa_instance is None:
        return [("Please initialize the system first.", "")]
    if not question.strip():
        return [("Please enter a question.", "")]
    result = qa_instance.invoke({"question": question})
    chat_history.append((question, result['answer']))
    return chat_history


with gr.Blocks() as demo:
    chatbot = gr.Chatbot(label="Chatbot")
    question = gr.Textbox(label="Ask a question", placeholder="Type your question...")
    question.submit(answer_query, inputs=[question], outputs=[chatbot])

initialize_chatbot_from_json("embeddings.json", openai_api_key)

if __name__ == "__main__":
    demo.launch()
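# Expected layout of embeddings.json, inferred from load_embeddings_from_json
# above ("id" is optional and falls back to the item's index):
#
#   [
#     {"id": "0", "chunk": "First chunk of text...", "embeddings": [0.01, -0.02, ...]},
#     {"id": "1", "chunk": "Second chunk of text...", "embeddings": [0.03, 0.04, ...]}
#   ]
#
# The stored vectors must come from the same model that OpenAIEmbeddings uses
# at query time, or similarity search will be meaningless (or fail outright on
# a dimension mismatch). A minimal sketch of how such a file could be produced
# (an illustrative assumption, not part of this app):
#
#   from langchain_openai.embeddings import OpenAIEmbeddings
#   import json, os
#
#   emb = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"))
#   chunks = ["First chunk of text...", "Second chunk of text..."]
#   vectors = emb.embed_documents(chunks)  # one vector per chunk
#   with open("embeddings.json", "w") as f:
#       json.dump(
#           [{"id": str(i), "chunk": c, "embeddings": v}
#            for i, (c, v) in enumerate(zip(chunks, vectors))],
#           f,
#       )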