th8m0z committed on
Commit 54f6539
1 Parent(s): 7dad24a

refactored the project

__pycache__/app.cpython-311.pyc CHANGED
Binary files a/__pycache__/app.cpython-311.pyc and b/__pycache__/app.cpython-311.pyc differ
 
__pycache__/semantic_search.cpython-311.pyc ADDED
Binary file (2.76 kB)
 
__pycache__/ui.cpython-311.pyc ADDED
Binary file (4.61 kB)
 
app.py CHANGED
@@ -1,12 +1,11 @@
 import urllib.request
 import fitz
 import re
-import numpy as np
-import tensorflow_hub as hub
 import openai
-import gradio as gr
 import os
-from sklearn.neighbors import NearestNeighbors
+from semantic_search import SemanticSearch
+
+recommender = SemanticSearch()
 
 def download_pdf(url, output_path):
     urllib.request.urlretrieve(url, output_path)
@@ -57,43 +56,6 @@ def text_to_chunks(texts, word_length=150, start_page=1, file_number=1):
     return chunks
 
 
-class SemanticSearch:
-
-    def __init__(self):
-        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
-        self.fitted = False
-
-
-    def fit(self, data, batch=1000, n_neighbors=5):
-        self.data = data
-        self.embeddings = self.get_text_embedding(data, batch=batch)
-        n_neighbors = min(n_neighbors, len(self.embeddings))
-        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
-        self.nn.fit(self.embeddings)
-        self.fitted = True
-
-
-    def __call__(self, text, return_data=True):
-        inp_emb = self.use([text])
-        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
-
-        if return_data:
-            return [self.data[i] for i in neighbors]
-        else:
-            return neighbors
-
-
-    def get_text_embedding(self, texts, batch=1000):
-        embeddings = []
-        for i in range(0, len(texts), batch):
-            text_batch = texts[i:(i+batch)]
-            emb_batch = self.use(text_batch)
-            embeddings.append(emb_batch)
-        embeddings = np.vstack(embeddings)
-        return embeddings
-
-
-
 def load_recommender(paths, start_page=1):
     global recommender
     texts = []
@@ -139,20 +101,18 @@ def generate_text(openAI_key, prompt, model="gpt-3.5-turbo"):
     return message
 
 
-def generate_answer(question, openAI_key, model):
+def construct_prompt(question):
     topn_chunks = recommender(question)
     prompt = 'search results:\n\n'
     for c in topn_chunks:
         prompt += c + '\n\n'
 
     prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
-              "Cite each reference using [ Page Number] notation. "\
+              "Cite each reference using [PDF Number][Page Number] notation. "\
              "Only answer what is asked. The answer should be short and concise. \n\nQuery: "
 
     prompt += f"{question}\nAnswer:"
-    answer = generate_text(openAI_key, prompt, model)
-    return answer
-
+    return prompt
 
 def question_answer(chat_history, url, files, question, openAI_key, model):
     try:
@@ -184,10 +144,8 @@ def question_answer(chat_history, url, files, question, openAI_key, model):
 
         if question.strip() == '':
             return '[ERROR]: Question field is empty'
-        if model == "text-davinci-003" or model == "gpt-4" or model == "gpt-4-32k":
-            answer = generate_answer_text_davinci_003(question, openAI_key)
-        else:
-            answer = generate_answer(question, openAI_key, model)
+        prompt = construct_prompt(question)
+        answer = generate_text(openAI_key, prompt, model)
         chat_history.append([question, answer])
         return chat_history
     except openai.error.InvalidRequestError as e:
@@ -195,110 +153,3 @@ def question_answer(chat_history, url, files, question, openAI_key, model):
 
 
 
-def generate_text_text_davinci_003(openAI_key, prompt, engine="text-davinci-003"):
-    openai.api_key = openAI_key
-    completions = openai.Completion.create(
-        engine=engine,
-        prompt=prompt,
-        max_tokens=512,
-        n=1,
-        stop=None,
-        temperature=0.7,
-    )
-    message = completions.choices[0].text
-    return message
-
-
-def generate_answer_text_davinci_003(question, openAI_key):
-    topn_chunks = recommender(question)
-    # print("topn chunks == " + str(topn_chunks))
-    prompt = ""
-    prompt += 'search results:\n\n'
-    for c in topn_chunks:
-        prompt += c + '\n\n'
-
-    prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
-              "Cite each reference using [PDF Number][Page Number] notation (every result has this number at the beginning). "\
-              "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
-              "with the same name, create separate answers for each. Only include information found in the results and "\
-              "don't add any additional information. Make sure the answer is correct and don't output false content. "\
-              "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "\
-              "search results which has nothing to do with the question. Only answer what is asked. The "\
-              "answer should be short and concise.\n\n"
-
-    prompt += f"Query: {question}\nAnswer:"
-    print("prompt == " + str(prompt))
-    # print("prompt == " + str(prompt))
-    answer = generate_text_text_davinci_003(openAI_key, prompt, "text-davinci-003")
-    return answer
-
-# pre-defined questions
-questions = [
-    "What did the study investigate?",
-    "Can you provide a summary of this paper?",
-    "what are the methodologies used in this study?",
-    "what are the data intervals used in this study? Give me the start dates and end dates?",
-    "what are the main limitations of this study?",
-    "what are the main shortcomings of this study?",
-    "what are the main findings of the study?",
-    "what are the main results of the study?",
-    "what are the main contributions of this study?",
-    "what is the conclusion of this paper?",
-    "what are the input features used in this study?",
-    "what is the dependent variable in this study?",
-]
-
-
-recommender = SemanticSearch()
-
-title = 'PDF GPT Turbo'
-description = """ PDF GPT Turbo allows you to chat with your PDF files. It uses Google's Universal Sentence Encoder with Deep averaging network (DAN) to give hallucination free response by improving the embedding quality of OpenAI. It cites the page number in square brackets([Page No.]) and shows where the information is located, adding credibility to the responses."""
-
-with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 1200; }""") as demo:
-
-    gr.Markdown(f'<center><h3>{title}</h3></center>')
-    gr.Markdown(description)
-
-    with gr.Row():
-
-        with gr.Group():
-            gr.Markdown(f'<p style="text-align:center">Get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a></p>')
-            with gr.Accordion("API Key"):
-                openAI_key = gr.Textbox(label='Enter your OpenAI API key here', password=True)
-            url = gr.Textbox(label='Enter PDF URL here (Example: https://arxiv.org/pdf/1706.03762.pdf )')
-            gr.Markdown("<center><h4>OR<h4></center>")
-            files = gr.File(label='Upload your PDF/ Research Paper / Book here', file_types=['.pdf'], file_count="multiple")
-            question = gr.Textbox(label='Enter your question here')
-            gr.Examples(
-                [[q] for q in questions],
-                inputs=[question],
-                label="PRE-DEFINED QUESTIONS: Click on a question to auto-fill the input box, then press Enter!",
-            )
-            model = gr.Radio([
-                'gpt-3.5-turbo',
-                'gpt-3.5-turbo-16k',
-                'gpt-3.5-turbo-0613',
-                'gpt-3.5-turbo-16k-0613',
-                'text-davinci-003',
-                'gpt-4',
-                'gpt-4-32k'
-            ], label='Select Model', default='gpt-3.5-turbo')
-            btn = gr.Button(value='Submit')
-
-            btn.style(full_width=True)
-
-        with gr.Group():
-            chatbot = gr.Chatbot(placeholder="Chat History", label="Chat History", lines=50, elem_id="chatbot")
-
-
-
-    # Bind the click event of the button to the question_answer function
-    btn.click(
-        question_answer,
-        inputs=[chatbot, url, files, question, openAI_key, model],
-        outputs=[chatbot],
-    )
-
-    demo.launch()
-
-
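With this change, answering a question in app.py reduces to two calls: construct_prompt() collects the top chunks from the recommender and appends the instructions, and generate_text() sends the finished prompt to the selected model. A minimal sketch of the new flow, assuming a PDF has already been downloaded; the file name, question, and API key are hypothetical placeholders:

    # Hypothetical driver for the refactored app.py pipeline (sketch, not part of the commit).
    import app

    app.load_recommender(['paper.pdf'])   # chunk the PDF and fit the SemanticSearch index
    prompt = app.construct_prompt('What did the study investigate?')
    answer = app.generate_text('sk-...', prompt, model='gpt-3.5-turbo')
    print(answer)
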
semantic_search.py ADDED
@@ -0,0 +1,39 @@
+import numpy as np
+import tensorflow_hub as hub
+from sklearn.neighbors import NearestNeighbors
+
+class SemanticSearch:
+
+    def __init__(self):
+        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
+        self.fitted = False
+
+
+    def fit(self, data, batch=1000, n_neighbors=5):
+        self.data = data
+        self.embeddings = self.get_text_embedding(data, batch=batch)
+        n_neighbors = min(n_neighbors, len(self.embeddings))
+        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
+        self.nn.fit(self.embeddings)
+        self.fitted = True
+
+
+    def __call__(self, text, return_data=True):
+        inp_emb = self.use([text])
+        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
+
+        if return_data:
+            return [self.data[i] for i in neighbors]
+        else:
+            return neighbors
+
+
+    def get_text_embedding(self, texts, batch=1000):
+        embeddings = []
+        for i in range(0, len(texts), batch):
+            text_batch = texts[i:(i+batch)]
+            emb_batch = self.use(text_batch)
+            embeddings.append(emb_batch)
+        embeddings = np.vstack(embeddings)
+        return embeddings
+
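Extracted into its own module, the retriever can also be exercised standalone. A minimal sketch, assuming tensorflow_hub and scikit-learn are installed; the first call downloads the Universal Sentence Encoder, and the chunk strings below are made up for illustration:

    from semantic_search import SemanticSearch

    chunks = [
        'Results: accuracy improved by 4% over the baseline.',   # hypothetical chunks
        'We trained on data collected between 2010 and 2020.',
        'Limitations include a small sample size.',
    ]

    searcher = SemanticSearch()
    searcher.fit(chunks, n_neighbors=2)          # embed all chunks, build the k-NN index
    print(searcher('What were the results?'))    # the 2 chunks nearest the query
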
ui.py ADDED
@@ -0,0 +1,70 @@
+import gradio as gr
+import app as app
+
+
+
+# pre-defined questions
+questions = [
+    "What did the study investigate?",
+    "Can you provide a summary of this paper?",
+    "what are the methodologies used in this study?",
+    "what are the data intervals used in this study? Give me the start dates and end dates?",
+    "what are the main limitations of this study?",
+    "what are the main shortcomings of this study?",
+    "what are the main findings of the study?",
+    "what are the main results of the study?",
+    "what are the main contributions of this study?",
+    "what is the conclusion of this paper?",
+    "what are the input features used in this study?",
+    "what is the dependent variable in this study?",
+]
+
+title = 'PDF GPT Turbo'
+description = """ PDF GPT Turbo allows you to chat with your PDF files. It uses Google's Universal Sentence Encoder with Deep averaging network (DAN) to give hallucination free response by improving the embedding quality of OpenAI. It cites the page number in square brackets([Page No.]) and shows where the information is located, adding credibility to the responses."""
+
+with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 1200; }""") as demo:
+
+    gr.Markdown(f'<center><h3>{title}</h3></center>')
+    gr.Markdown(description)
+
+    with gr.Row():
+
+        with gr.Group():
+            gr.Markdown(f'<p style="text-align:center">Get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a></p>')
+            with gr.Accordion("API Key"):
+                openAI_key = gr.Textbox(label='Enter your OpenAI API key here', password=True)
+            url = gr.Textbox(label='Enter PDF URL here (Example: https://arxiv.org/pdf/1706.03762.pdf )')
+            gr.Markdown("<center><h4>OR<h4></center>")
+            files = gr.File(label='Upload your PDF/ Research Paper / Book here', file_types=['.pdf'], file_count="multiple")
+            question = gr.Textbox(label='Enter your question here')
+            gr.Examples(
+                [[q] for q in questions],
+                inputs=[question],
+                label="PRE-DEFINED QUESTIONS: Click on a question to auto-fill the input box, then press Enter!",
+            )
+            model = gr.Radio([
+                'gpt-3.5-turbo',
+                'gpt-3.5-turbo-16k',
+                'gpt-3.5-turbo-0613',
+                'gpt-3.5-turbo-16k-0613',
+                'text-davinci-003',
+                'gpt-4',
+                'gpt-4-32k'
+            ], label='Select Model', default='gpt-3.5-turbo')
+            btn = gr.Button(value='Submit')
+
+            btn.style(full_width=True)
+
+        with gr.Group():
+            chatbot = gr.Chatbot(placeholder="Chat History", label="Chat History", lines=50, elem_id="chatbot")
+
+
+
+    # Bind the click event of the button to the question_answer function
+    btn.click(
+        app.question_answer,
+        inputs=[chatbot, url, files, question, openAI_key, model],
+        outputs=[chatbot],
+    )
+
+    demo.launch()
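
After the split, ui.py is the entry point: importing app constructs the module-level SemanticSearch() (so the Universal Sentence Encoder loads at import time), and the Gradio Blocks are built and launched at module scope, so the demo presumably starts with `python ui.py`. Note that text-davinci-003 is still offered in the model radio even though its dedicated completion path was removed from app.py; every selection now flows through the same construct_prompt/generate_text path.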