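# Gradio chatbot that answers questions about a document using LangChain,
# Chroma, and OpenAI. The commented-out block below is the previous version
# of this app, which extracted text from an uploaded PDF and embedded it on
# the fly; it is kept for reference. The active version underneath loads
# precomputed embeddings from a JSON file instead.
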
# import gradio as gr
# import fitz  # PyMuPDF
# import re
# from langchain_openai.embeddings import OpenAIEmbeddings
# from langchain_chroma import Chroma
# from langchain.retrievers.multi_query import MultiQueryRetriever
# from langchain.chains import ConversationalRetrievalChain
# from langchain.memory import ConversationBufferMemory
# from langchain_openai import ChatOpenAI
# from langchain_experimental.text_splitter import SemanticChunker

# import os
# openai_api_key = os.getenv("OPENAI_API_KEY")

# vectorstore = None
# llm = None
# qa_instance = None
# chat_history = []  # Global chat history

# def extract_text_from_pdf(pdf_bytes):
#     document = fitz.open("pdf", pdf_bytes)
#     text = ""
#     for page_num in range(len(document)):
#         page = document.load_page(page_num)
#         text += page.get_text()
#     document.close()
#     return text

# def clean_text(text):
#     cleaned_text = re.sub(r'\s+', ' ', text)
#     cleaned_text = re.sub(r'(.)\1{2,}', r'\1', cleaned_text)
#     cleaned_text = re.sub(r'\b(\w+)\b(?:\s+\1\b)+', r'\1', cleaned_text)
#     return cleaned_text.strip()

# def initialize_chatbot(cleaned_text, openai_api_key):
#     global vectorstore, llm, qa_instance
#     if vectorstore is None:  # Only create embeddings and Chroma once
#         embeddings = OpenAIEmbeddings(api_key=openai_api_key)
#         text_splitter = SemanticChunker(embeddings)
#         docs = text_splitter.create_documents([cleaned_text])
#         vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)
#     if llm is None:
#         llm = ChatOpenAI(api_key=openai_api_key, temperature=0.5, model="gpt-4o", verbose=True)
#     retriever = MultiQueryRetriever.from_llm(retriever=vectorstore.as_retriever(), llm=llm)
#     memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
#     qa_instance = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)

# def setup_qa_system(pdf_file):
#     global chat_history
#     if pdf_file is None:
#         return [("Please upload a PDF file.", "")]
#     extracted_text = extract_text_from_pdf(pdf_file)
#     cleaned_text = clean_text(extracted_text)
#     initialize_chatbot(cleaned_text, openai_api_key)
#     chat_history = [("Chatbot initialized. Please ask a question.", "")]
#     return chat_history

# def answer_query(question):
#     global chat_history
#     if qa_instance is None:
#         return [("Please upload a PDF and initialize the system first.", "")]
#     if not question.strip():
#         return [("Please enter a question.", "")]
#     result = qa_instance({"question": question})
#     chat_history.append((question, result['answer']))
#     return chat_history

# with gr.Blocks() as demo:
#     upload = gr.File(label="Upload PDF", type="binary", file_types=["pdf"])
#     chatbot = gr.Chatbot(label="Chatbot")
#     question = gr.Textbox(label="Ask a question", placeholder="Type your question after uploading PDF...")

#     upload.change(setup_qa_system, inputs=[upload], outputs=[chatbot])
#     question.submit(answer_query, inputs=[question], outputs=[chatbot])

# if __name__ == "__main__":
#     demo.launch()



import os
import json

import gradio as gr
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_openai import ChatOpenAI

# Read the API key from the environment; never hard-code a real key in source.
openai_api_key = os.getenv("OPENAI_API_KEY")

# Module-level singletons so the vector store, LLM, and chain are built once
# per process and reused across requests.
vectorstore = None
llm = None
qa_instance = None
chat_history = []

def load_embeddings_from_json(json_file_path: str):
    """Load precomputed text chunks, embedding vectors, and ids from a JSON file."""
    with open(json_file_path, 'r') as f:
        data = json.load(f)
    chunks = [item['chunk'] for item in data]
    embeddings = [item['embeddings'] for item in data]
    # Fall back to the list index when an item carries no explicit id.
    ids = [item.get('id', str(index)) for index, item in enumerate(data)]
    return chunks, embeddings, ids
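
# The loader above assumes "embeddings.json" has roughly this shape (inferred
# from the keys it reads; adjust if your export differs):
#
# [
#   {"id": "0", "chunk": "first passage of text...", "embeddings": [0.012, -0.034]},
#   {"id": "1", "chunk": "second passage...", "embeddings": [0.056, 0.101]}
# ]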

def initialize_chatbot_from_json(json_file_path: str, openai_api_key: str):
    global vectorstore, llm, qa_instance
    if vectorstore is None:
        chunks, embeddings, ids = load_embeddings_from_json(json_file_path)
        vectorstore = Chroma(
            collection_name="my_collection",
            persist_directory=None,  # keep the collection in memory
            embedding_function=OpenAIEmbeddings(api_key=openai_api_key),
        )
        # The LangChain wrapper exposes no public method for inserting
        # precomputed vectors, so add them via the underlying chromadb collection.
        vectorstore._collection.add(
            ids=ids,
            embeddings=embeddings,
            metadatas=[{"source": "json"} for _ in chunks],
            documents=chunks,
        )
    if llm is None:
        llm = ChatOpenAI(api_key=openai_api_key, temperature=0.5, model="gpt-4o", verbose=True)
    # MultiQueryRetriever asks the LLM to rephrase each question several ways,
    # broadening recall over the vector store.
    retriever = MultiQueryRetriever.from_llm(retriever=vectorstore.as_retriever(), llm=llm)
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    qa_instance = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)
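
# A minimal sketch (not used by this app) of one way to produce the
# "embeddings.json" file consumed above. The function name and workflow are
# illustrative assumptions, not part of the original script.
def export_embeddings_to_json(chunks: list, json_file_path: str, api_key: str):
    embedder = OpenAIEmbeddings(api_key=api_key)
    vectors = embedder.embed_documents(chunks)  # batch-embed every chunk
    data = [
        {"id": str(i), "chunk": chunk, "embeddings": vector}
        for i, (chunk, vector) in enumerate(zip(chunks, vectors))
    ]
    with open(json_file_path, "w") as f:
        json.dump(data, f)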

def answer_query(question: str):
    global chat_history
    if qa_instance is None:
        return [("Please initialize the system first.", "")]
    if not question.strip():
        return [("Please enter a question.", "")]
    # Chains are run with .invoke(); calling the chain object directly is deprecated.
    result = qa_instance.invoke({"question": question})
    chat_history.append((question, result['answer']))
    return chat_history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(label="Chatbot")
    question = gr.Textbox(label="Ask a question", placeholder="Type your question...")

    question.submit(answer_query, inputs=[question], outputs=[chatbot])

if __name__ == "__main__":
    # Build the index and chain once at startup, before serving requests.
    initialize_chatbot_from_json("embeddings.json", openai_api_key)
    demo.launch()
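
# Assumed local workflow: set OPENAI_API_KEY in the environment, place
# embeddings.json next to this script, then run it with Python.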