# Previous version: PDF-upload chatbot (kept commented out for reference).
#
# import gradio as gr
# import fitz  # PyMuPDF
# import re
# from langchain_openai.embeddings import OpenAIEmbeddings
# from langchain_chroma import Chroma
# from langchain.retrievers.multi_query import MultiQueryRetriever
# from langchain.chains import ConversationalRetrievalChain
# from langchain.memory import ConversationBufferMemory
# from langchain_openai import ChatOpenAI
# from langchain_experimental.text_splitter import SemanticChunker
# import os
#
# openai_api_key = os.getenv("OPENAI_API_KEY")
#
# vectorstore = None
# llm = None
# qa_instance = None
# chat_history = []  # Global chat history
#
# def extract_text_from_pdf(pdf_bytes):
#     document = fitz.open("pdf", pdf_bytes)
#     text = ""
#     for page_num in range(len(document)):
#         page = document.load_page(page_num)
#         text += page.get_text()
#     document.close()
#     return text
#
# def clean_text(text):
#     cleaned_text = re.sub(r'\s+', ' ', text)
#     cleaned_text = re.sub(r'(.)\1{2,}', r'\1', cleaned_text)
#     cleaned_text = re.sub(r'\b(\w+)\b(?:\s+\1\b)+', r'\1', cleaned_text)
#     return cleaned_text.strip()
#
# def initialize_chatbot(cleaned_text, openai_api_key):
#     global vectorstore, llm, qa_instance
#     if vectorstore is None:  # Only create embeddings and Chroma once
#         embeddings = OpenAIEmbeddings(api_key=openai_api_key)
#         text_splitter = SemanticChunker(embeddings)
#         docs = text_splitter.create_documents([cleaned_text])
#         vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)
#     if llm is None:
#         llm = ChatOpenAI(api_key=openai_api_key, temperature=0.5, model="gpt-4o", verbose=True)
#     retriever = MultiQueryRetriever.from_llm(retriever=vectorstore.as_retriever(), llm=llm)
#     memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
#     qa_instance = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)
#
# def setup_qa_system(pdf_file):
#     global chat_history
#     if pdf_file is None:
#         return [("Please upload a PDF file.", "")]
#     extracted_text = extract_text_from_pdf(pdf_file)
#     cleaned_text = clean_text(extracted_text)
#     initialize_chatbot(cleaned_text, openai_api_key)
#     chat_history = [("Chatbot initialized. Please ask a question.", "")]
#     return chat_history
#
# def answer_query(question):
#     global chat_history
#     if qa_instance is None:
#         return [("Please upload a PDF and initialize the system first.", "")]
#     if not question.strip():
#         return [("Please enter a question.", "")]
#     result = qa_instance({"question": question})
#     chat_history.append((question, result['answer']))
#     return chat_history
#
# with gr.Blocks() as demo:
#     upload = gr.File(label="Upload PDF", type="binary", file_types=["pdf"])
#     chatbot = gr.Chatbot(label="Chatbot")
#     question = gr.Textbox(label="Ask a question", placeholder="Type your question after uploading PDF...")
#     upload.change(setup_qa_system, inputs=[upload], outputs=[chatbot])
#     question.submit(answer_query, inputs=[question], outputs=[chatbot])
#
# if __name__ == "__main__":
#     demo.launch()

import gradio as gr
import json
import os
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_openai import ChatOpenAI
openai_api_key = "sk-proj-bxh8lX8T6EoQaDWm2cljT3BlbkFJylU5bVGc2eQxB8WCP1Ub"
vectorstore = None
llm = None
qa_instance = None
chat_history = []
def load_embeddings_from_json(json_file_path: str):
    """Load precomputed text chunks, embedding vectors, and ids from JSON."""
    with open(json_file_path, 'r') as f:
        data = json.load(f)
    chunks = [item['chunk'] for item in data]
    embeddings = [item['embeddings'] for item in data]
    # Fall back to the list index when an item carries no explicit id.
    ids = [item.get('id', str(index)) for index, item in enumerate(data)]
    return chunks, embeddings, ids

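# A sketch of the layout load_embeddings_from_json() expects, inferred from
# the loader above (the real embeddings.json is not included here): a JSON
# array of objects, each with a text "chunk", its precomputed "embeddings"
# vector, and an optional "id".
#
#   [
#     {"id": "0", "chunk": "First passage...",  "embeddings": [0.012, -0.034]},
#     {"id": "1", "chunk": "Second passage...", "embeddings": [0.056, 0.078]}
#   ]
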
def initialize_chatbot_from_json(json_file_path: str, openai_api_key: str):
    """Build the vector store, LLM, and retrieval chain once at startup."""
    global vectorstore, llm, qa_instance
    if vectorstore is None:
        chunks, embeddings, ids = load_embeddings_from_json(json_file_path)
        # The embedding function is only needed to embed incoming queries;
        # the document vectors themselves are supplied precomputed below.
        vectorstore = Chroma(
            collection_name="my_collection",
            persist_directory=None,  # in-memory; nothing is persisted to disk
            embedding_function=OpenAIEmbeddings(api_key=openai_api_key)
        )
        # Add the precomputed vectors through the underlying chromadb
        # collection rather than LangChain's add_texts(), which would
        # re-embed every chunk.
        vectorstore._collection.add(
            ids=ids,
            embeddings=embeddings,
            metadatas=[{"source": "json"} for _ in chunks],
            documents=chunks,
        )
    if llm is None:
        llm = ChatOpenAI(api_key=openai_api_key, temperature=0.5, model="gpt-4o", verbose=True)
    # MultiQueryRetriever asks the LLM to rephrase each question into several
    # variants and merges the retrieved documents, improving recall.
    retriever = MultiQueryRetriever.from_llm(retriever=vectorstore.as_retriever(), llm=llm)
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    qa_instance = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)

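# A minimal sketch (not part of this app) of how an embeddings.json in the
# format above could be produced offline; the chunk texts are placeholders:
#
#   from langchain_openai.embeddings import OpenAIEmbeddings
#   import json
#
#   chunks = ["First passage...", "Second passage..."]
#   vectors = OpenAIEmbeddings().embed_documents(chunks)
#   data = [{"id": str(i), "chunk": c, "embeddings": v}
#           for i, (c, v) in enumerate(zip(chunks, vectors))]
#   with open("embeddings.json", "w") as f:
#       json.dump(data, f)
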
def answer_query(question: str):
    global chat_history
    if qa_instance is None:
        return [("Please initialize the system first.", "")]
    if not question.strip():
        return [("Please enter a question.", "")]
    # invoke() is the non-deprecated way to run the chain; it returns the
    # same result dict as the older qa_instance({...}) call style.
    result = qa_instance.invoke({"question": question})
    chat_history.append((question, result['answer']))
    return chat_history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(label="Chatbot")
    question = gr.Textbox(label="Ask a question", placeholder="Type your question...")
    question.submit(answer_query, inputs=[question], outputs=[chatbot])

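# Build the vector store and QA chain once at startup so the first question
# does not pay the indexing cost; embeddings.json must ship alongside this file.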
initialize_chatbot_from_json("embeddings.json", openai_api_key)
if __name__ == "__main__":
    demo.launch()