Gaurav-2273 committed
Commit 2e4fe6f
1 Parent(s): 4d4a98c

Update app.py

Files changed (1)
  1. app.py +114 -43
app.py CHANGED
@@ -1,64 +1,135 @@
  import gradio as gr
- import fitz # PyMuPDF
- import re
  from langchain_openai.embeddings import OpenAIEmbeddings
  from langchain_chroma import Chroma
  from langchain.retrievers.multi_query import MultiQueryRetriever
  from langchain.chains import ConversationalRetrievalChain
  from langchain.memory import ConversationBufferMemory
  from langchain_openai import ChatOpenAI
- from langchain_experimental.text_splitter import SemanticChunker

- import os
- openai_api_key = os.getenv("OPENAI_API_KEY")

  vectorstore = None
  llm = None
  qa_instance = None
- chat_history = [] # Global chat history
-
- def extract_text_from_pdf(pdf_bytes):
-     document = fitz.open("pdf", pdf_bytes)
-     text = ""
-     for page_num in range(len(document)):
-         page = document.load_page(page_num)
-         text += page.get_text()
-     document.close()
-     return text
-
- def clean_text(text):
-     cleaned_text = re.sub(r'\s+', ' ', text)
-     cleaned_text = re.sub(r'(.)\1{2,}', r'\1', cleaned_text)
-     cleaned_text = re.sub(r'\b(\w+)\b(?:\s+\1\b)+', r'\1', cleaned_text)
-     return cleaned_text.strip()
-
- def initialize_chatbot(cleaned_text, openai_api_key):
      global vectorstore, llm, qa_instance
-     if vectorstore is None: # Only create embeddings and Chroma once
-         embeddings = OpenAIEmbeddings(api_key=openai_api_key)
-         text_splitter = SemanticChunker(embeddings)
-         docs = text_splitter.create_documents([cleaned_text])
-         vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)
      if llm is None:
          llm = ChatOpenAI(api_key=openai_api_key, temperature=0.5, model="gpt-4o", verbose=True)
          retriever = MultiQueryRetriever.from_llm(retriever=vectorstore.as_retriever(), llm=llm)
          memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
          qa_instance = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)

- def setup_qa_system(pdf_file):
-     global chat_history
-     if pdf_file is None:
-         return [("Please upload a PDF file.", "")]
-     extracted_text = extract_text_from_pdf(pdf_file)
-     cleaned_text = clean_text(extracted_text)
-     initialize_chatbot(cleaned_text, openai_api_key)
-     chat_history = [("Chatbot initialized. Please ask a question.", "")]
-     return chat_history
-
- def answer_query(question):
      global chat_history
      if qa_instance is None:
-         return [("Please upload a PDF and initialize the system first.", "")]
      if not question.strip():
          return [("Please enter a question.", "")]
      result = qa_instance({"question": question})
@@ -66,12 +137,12 @@ def answer_query(question):
      return chat_history

  with gr.Blocks() as demo:
-     upload = gr.File(label="Upload PDF", type="binary", file_types=["pdf"])
      chatbot = gr.Chatbot(label="Chatbot")
-     question = gr.Textbox(label="Ask a question", placeholder="Type your question after uploading PDF...")

-     upload.change(setup_qa_system, inputs=[upload], outputs=[chatbot])
      question.submit(answer_query, inputs=[question], outputs=[chatbot])

  if __name__ == "__main__":
      demo.launch()
+ # import gradio as gr
+ # import fitz # PyMuPDF
+ # import re
+ # from langchain_openai.embeddings import OpenAIEmbeddings
+ # from langchain_chroma import Chroma
+ # from langchain.retrievers.multi_query import MultiQueryRetriever
+ # from langchain.chains import ConversationalRetrievalChain
+ # from langchain.memory import ConversationBufferMemory
+ # from langchain_openai import ChatOpenAI
+ # from langchain_experimental.text_splitter import SemanticChunker
+
+ # import os
+ # openai_api_key = os.getenv("OPENAI_API_KEY")
+
+ # vectorstore = None
+ # llm = None
+ # qa_instance = None
+ # chat_history = [] # Global chat history
+
+ # def extract_text_from_pdf(pdf_bytes):
+ #     document = fitz.open("pdf", pdf_bytes)
+ #     text = ""
+ #     for page_num in range(len(document)):
+ #         page = document.load_page(page_num)
+ #         text += page.get_text()
+ #     document.close()
+ #     return text
+
+ # def clean_text(text):
+ #     cleaned_text = re.sub(r'\s+', ' ', text)
+ #     cleaned_text = re.sub(r'(.)\1{2,}', r'\1', cleaned_text)
+ #     cleaned_text = re.sub(r'\b(\w+)\b(?:\s+\1\b)+', r'\1', cleaned_text)
+ #     return cleaned_text.strip()
+
+ # def initialize_chatbot(cleaned_text, openai_api_key):
+ #     global vectorstore, llm, qa_instance
+ #     if vectorstore is None: # Only create embeddings and Chroma once
+ #         embeddings = OpenAIEmbeddings(api_key=openai_api_key)
+ #         text_splitter = SemanticChunker(embeddings)
+ #         docs = text_splitter.create_documents([cleaned_text])
+ #         vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)
+ #     if llm is None:
+ #         llm = ChatOpenAI(api_key=openai_api_key, temperature=0.5, model="gpt-4o", verbose=True)
+ #         retriever = MultiQueryRetriever.from_llm(retriever=vectorstore.as_retriever(), llm=llm)
+ #         memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+ #         qa_instance = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)
+
+ # def setup_qa_system(pdf_file):
+ #     global chat_history
+ #     if pdf_file is None:
+ #         return [("Please upload a PDF file.", "")]
+ #     extracted_text = extract_text_from_pdf(pdf_file)
+ #     cleaned_text = clean_text(extracted_text)
+ #     initialize_chatbot(cleaned_text, openai_api_key)
+ #     chat_history = [("Chatbot initialized. Please ask a question.", "")]
+ #     return chat_history
+
+ # def answer_query(question):
+ #     global chat_history
+ #     if qa_instance is None:
+ #         return [("Please upload a PDF and initialize the system first.", "")]
+ #     if not question.strip():
+ #         return [("Please enter a question.", "")]
+ #     result = qa_instance({"question": question})
+ #     chat_history.append((question, result['answer']))
+ #     return chat_history
+
+ # with gr.Blocks() as demo:
+ #     upload = gr.File(label="Upload PDF", type="binary", file_types=["pdf"])
+ #     chatbot = gr.Chatbot(label="Chatbot")
+ #     question = gr.Textbox(label="Ask a question", placeholder="Type your question after uploading PDF...")
+
+ #     upload.change(setup_qa_system, inputs=[upload], outputs=[chatbot])
+ #     question.submit(answer_query, inputs=[question], outputs=[chatbot])
+
+ # if __name__ == "__main__":
+ #     demo.launch()
+
+
+
  import gradio as gr
+ import json
+ from typing import List, Dict
  from langchain_openai.embeddings import OpenAIEmbeddings
  from langchain_chroma import Chroma
  from langchain.retrievers.multi_query import MultiQueryRetriever
  from langchain.chains import ConversationalRetrievalChain
  from langchain.memory import ConversationBufferMemory
  from langchain_openai import ChatOpenAI
+ from langchain.schema import Document

+ import os
+ openai_api_key = os.getenv("OPENAI_API_KEY")  # read the key from the environment; never hardcode secrets in source

  vectorstore = None
  llm = None
  qa_instance = None
+ chat_history = []
+
+ def load_embeddings_from_json(json_file_path: str):
+     with open(json_file_path, 'r') as f:
+         data = json.load(f)
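+         # Expected layout, inferred from the lookups below (illustrative values):
+         # [{"id": "0", "chunk": "some text", "embeddings": [0.01, -0.02, ...]}, ...]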
+     chunks = [item['chunk'] for item in data]
+     embeddings = [item['embeddings'] for item in data]
+     ids = [item.get('id', str(index)) for index, item in enumerate(data)]
+     return chunks, embeddings, ids
+
+ def initialize_chatbot_from_json(json_file_path: str, openai_api_key: str):
      global vectorstore, llm, qa_instance
+     if vectorstore is None:
+         chunks, embeddings, ids = load_embeddings_from_json(json_file_path)
+         vectorstore = Chroma(
+             collection_name="my_collection",
+             persist_directory=None,
+             embedding_function=OpenAIEmbeddings(api_key=openai_api_key)
+         )
+         vectorstore._client._add(
+             collection_id=vectorstore._collection.id,
+             ids=ids,
+             embeddings=embeddings,
+             metadatas=[{"source": "json"} for _ in chunks],
+             documents=chunks,
+         )
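+         # NOTE: _client._add is a private Chroma surface and may change between
+         # releases; the wrapped collection's public add() accepts the same
+         # precomputed data: vectorstore._collection.add(ids=ids,
+         #     embeddings=embeddings, metadatas=..., documents=chunks)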
      if llm is None:
          llm = ChatOpenAI(api_key=openai_api_key, temperature=0.5, model="gpt-4o", verbose=True)
          retriever = MultiQueryRetriever.from_llm(retriever=vectorstore.as_retriever(), llm=llm)
          memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
          qa_instance = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)

+ def answer_query(question: str):
      global chat_history
      if qa_instance is None:
+         return [("Please initialize the system first.", "")]
      if not question.strip():
          return [("Please enter a question.", "")]
      result = qa_instance({"question": question})
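      # NOTE: calling the chain object directly is deprecated in newer LangChain
      # releases; qa_instance.invoke({"question": question}) is the equivalent.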
      return chat_history

  with gr.Blocks() as demo:
      chatbot = gr.Chatbot(label="Chatbot")
+     question = gr.Textbox(label="Ask a question", placeholder="Type your question...")

      question.submit(answer_query, inputs=[question], outputs=[chatbot])
+     initialize_chatbot_from_json("embeddings.json", openai_api_key)
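+     # This runs once when the script starts (before demo.launch()), so
+     # embeddings.json must exist alongside app.py or start-up will fail.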

  if __name__ == "__main__":
      demo.launch()
+