parvezalmuqtadir committed on
Commit
3168ecd
1 Parent(s): e70ec55

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -104
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import os
2
  from dotenv import load_dotenv
3
- import urllib.request
4
- import fitz
5
  import re
6
  import numpy as np
7
  import tensorflow_hub as hub
@@ -18,13 +18,11 @@ openAI_key = os.getenv('OPENAI_API_KEY')
18
  def download_pdf(url, output_path):
19
  urllib.request.urlretrieve(url, output_path)
20
 
21
-
22
  def preprocess(text):
23
  text = text.replace('\n', ' ')
24
  text = re.sub('\s+', ' ', text)
25
  return text
26
 
27
-
28
  def pdf_to_text(path, start_page=1, end_page=None):
29
  doc = fitz.open(path)
30
  total_pages = doc.page_count
@@ -34,7 +32,7 @@ def pdf_to_text(path, start_page=1, end_page=None):
34
 
35
  text_list = []
36
 
37
- for i in range(start_page-1, end_page):
38
  text = doc.load_page(i).get_text("text")
39
  text = preprocess(text)
40
  text_list.append(text)
@@ -42,32 +40,28 @@ def pdf_to_text(path, start_page=1, end_page=None):
42
  doc.close()
43
  return text_list
44
 
45
-
46
  def text_to_chunks(texts, word_length=150, start_page=1):
47
  text_toks = [t.split(' ') for t in texts]
48
- page_nums = []
49
  chunks = []
50
-
51
  for idx, words in enumerate(text_toks):
52
  for i in range(0, len(words), word_length):
53
  chunk = words[i:i+word_length]
54
- if (i+word_length) > len(words) and (len(chunk) < word_length) and (
55
- len(text_toks) != (idx+1)):
56
- text_toks[idx+1] = chunk + text_toks[idx+1]
57
  continue
58
  chunk = ' '.join(chunk).strip()
59
- chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
60
  chunks.append(chunk)
61
  return chunks
62
 
63
-
64
  class SemanticSearch:
65
-
66
  def __init__(self):
67
  self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
68
  self.fitted = False
69
-
70
-
71
  def fit(self, data, batch=1000, n_neighbors=5):
72
  self.data = data
73
  self.embeddings = self.get_text_embedding(data, batch=batch)
@@ -75,43 +69,40 @@ class SemanticSearch:
75
  self.nn = NearestNeighbors(n_neighbors=n_neighbors)
76
  self.nn.fit(self.embeddings)
77
  self.fitted = True
78
-
79
-
80
  def __call__(self, text, return_data=True):
81
  inp_emb = self.use([text])
82
  neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
83
-
84
  if return_data:
85
  return [self.data[i] for i in neighbors]
86
  else:
87
  return neighbors
88
-
89
-
90
  def get_text_embedding(self, texts, batch=1000):
91
  embeddings = []
92
  for i in range(0, len(texts), batch):
93
- text_batch = texts[i:(i+batch)]
94
  emb_batch = self.use(text_batch)
95
  embeddings.append(emb_batch)
96
  embeddings = np.vstack(embeddings)
97
  return embeddings
98
 
99
-
100
 
101
  def load_recommender(path, start_page=1):
102
- global recommender
103
  texts = pdf_to_text(path, start_page=start_page)
104
  chunks = text_to_chunks(texts, start_page=start_page)
105
  recommender.fit(chunks)
106
  return 'Corpus Loaded.'
107
 
108
- def generate_text(openAI_key, prompt, model="gpt-3.5-turbo"):
109
  openai.api_key = openAI_key
110
- temperature=0.7
111
- max_tokens=256
112
- top_p=1
113
- frequency_penalty=0
114
- presence_penalty=0
115
 
116
  if model == "text-davinci-003":
117
  completions = openai.Completion.create(
@@ -139,35 +130,16 @@ def generate_text(openAI_key, prompt, model="gpt-3.5-turbo"):
139
  ).choices[0].message['content']
140
  return message
141
 
142
-
143
- def generate_answer(question, openAI_key, model):
144
- topn_chunks = recommender(question)
145
- prompt = 'search results:\n\n'
146
- for c in topn_chunks:
147
- prompt += c + '\n\n'
148
-
149
- prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
150
- "Cite each reference using [ Page Number] notation. "\
151
- "Only answer what is asked. The answer should be short and concise. \n\nQuery: "
152
-
153
- prompt += f"{question}\nAnswer:"
154
- answer = generate_text(openAI_key, prompt, model)
155
- return answer
156
-
157
-
158
  def question_answer(chat_history, url, file, question, model):
159
  try:
160
- if openAI_key.strip()=='':
161
  return '[ERROR]: Please enter your Open AI Key. Get your key here : https://platform.openai.com/account/api-keys'
162
- if url.strip() == '' and file is None:
163
- return '[ERROR]: Both URL and PDF is empty. Provide at least one.'
164
  if url.strip() != '' and file is not None:
165
  return '[ERROR]: Both URL and PDF is provided. Please provide only one (either URL or PDF).'
166
- if model is None or model =='':
167
  return '[ERROR]: You have not selected any model. Please choose an LLM model.'
168
  if url.strip() != '':
169
- glob_url = url
170
- download_pdf(glob_url, 'corpus.pdf')
171
  load_recommender('corpus.pdf')
172
  else:
173
  old_file_name = file.name
@@ -177,74 +149,30 @@ def question_answer(chat_history, url, file, question, model):
177
  load_recommender(file_name)
178
  if question.strip() == '':
179
  return '[ERROR]: Question field is empty'
180
- if model == "text-davinci-003" or model == "gpt-4" or model == "gpt-4-32k":
181
- answer = generate_answer_text_davinci_003(question, openAI_key)
182
- else:
183
- answer = generate_answer(question, openAI_key, model)
184
  chat_history.append([question, answer])
185
  return chat_history
186
  except openai.error.InvalidRequestError as e:
187
  return f'[ERROR]: Either you do not have access to GPT4 or you have exhausted your quota!'
188
 
189
-
190
-
191
- def generate_text_text_davinci_003(openAI_key,prompt, engine="text-davinci-003"):
192
- openai.api_key = openAI_key
193
- completions = openai.Completion.create(
194
- engine=engine,
195
- prompt=prompt,
196
- max_tokens=512,
197
- n=1,
198
- stop=None,
199
- temperature=0.7,
200
- )
201
- message = completions.choices[0].text
202
- return message
203
-
204
-
205
- def generate_answer_text_davinci_003(question,openAI_key):
206
- topn_chunks = recommender(question)
207
- prompt = ""
208
- prompt += 'search results:\n\n'
209
- for c in topn_chunks:
210
- prompt += c + '\n\n'
211
-
212
- prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
213
- "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "\
214
- "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
215
- "with the same name, create separate answers for each. Only include information found in the results and "\
216
- "don't add any additional information. Make sure the answer is correct and don't output false content. "\
217
- "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "\
218
- "search results which has nothing to do with the question. Only answer what is asked. The "\
219
- "answer should be short and concise. \n\nQuery: {question}\nAnswer: "
220
-
221
- prompt += f"Query: {question}\nAnswer:"
222
- answer = generate_text_text_davinci_003(openAI_key, prompt,"text-davinci-003")
223
- return answer
224
-
225
- recommender = SemanticSearch()
226
-
227
  title = 'PDF GPT Turbo'
228
  description = """ PDF GPT Turbo allows you to chat with your PDF files. It uses Google's Universal Sentence Encoder with Deep averaging network (DAN) to give hallucination free response by improving the embedding quality of OpenAI. It cites the page number in square brackets([Page No.]) and shows where the information is located, adding credibility to the responses."""
229
 
230
- # Modify the interface setup to remove the OpenAI key input
231
  with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 1200; }""") as demo:
232
-
233
  gr.Markdown(f'<center><h3>{title}</h3></center>')
234
  gr.Markdown(description)
235
 
236
  with gr.Row():
237
  with gr.Group():
238
- # Remove the OpenAI key input setup from here
239
  url = gr.Textbox(label='Enter PDF URL here (Example: https://arxiv.org/pdf/1706.03762.pdf )')
240
  gr.Markdown("<center><h4>OR<h4></center>")
241
  file = gr.File(label='Upload your PDF/ Research Paper / Book here', file_types=['.pdf'])
242
  question = gr.Textbox(label='Enter your question here')
243
  model = gr.Radio([
244
- 'gpt-3.5-turbo',
245
- 'gpt-3.5-turbo-16k',
246
- 'gpt-3.5-turbo-0613',
247
- 'gpt-3.5-turbo-16k-0613',
248
  'text-davinci-003',
249
  'gpt-4',
250
  'gpt-4-32k'
@@ -255,7 +183,6 @@ with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 1200; }""") as dem
255
  with gr.Group():
256
  chatbot = gr.Chatbot(placeholder="Chat History", label="Chat History", lines=50, elem_id="chatbot")
257
 
258
- # Bind the click event of the button to the question_answer function
259
  btn.click(
260
  question_answer,
261
  inputs=[chatbot, url, file, question, model],
@@ -263,5 +190,3 @@ with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 1200; }""") as dem
263
  )
264
 
265
  demo.launch()
266
-
267
-
 
1
  import os
2
  from dotenv import load_dotenv
3
+ import urllib.request
4
+ import fitz # PyMuPDF
5
  import re
6
  import numpy as np
7
  import tensorflow_hub as hub
 
18
def download_pdf(url, output_path):
    """Download the file at `url` and write it to the local path `output_path`."""
    with urllib.request.urlopen(url) as response, open(output_path, 'wb') as out_file:
        # Stream in fixed-size chunks so large PDFs are not held in memory at once.
        while chunk := response.read(65536):
            out_file.write(chunk)
20
 
 
21
def preprocess(text):
    """Normalize raw PDF-extracted text.

    Flattens newlines and collapses every run of whitespace into a single
    space so downstream word-splitting behaves predictably.

    Args:
        text: raw text extracted from a PDF page.

    Returns:
        The cleaned string.
    """
    text = text.replace('\n', ' ')
    # Raw string: '\s' in a plain literal is an invalid escape sequence and
    # raises a SyntaxWarning/DeprecationWarning on modern Python.
    text = re.sub(r'\s+', ' ', text)
    return text
25
 
 
26
  def pdf_to_text(path, start_page=1, end_page=None):
27
  doc = fitz.open(path)
28
  total_pages = doc.page_count
 
32
 
33
  text_list = []
34
 
35
+ for i in range(start_page - 1, end_page):
36
  text = doc.load_page(i).get_text("text")
37
  text = preprocess(text)
38
  text_list.append(text)
 
40
  doc.close()
41
  return text_list
42
 
 
43
def text_to_chunks(texts, word_length=150, start_page=1):
    """Split per-page texts into ~`word_length`-word chunks tagged with page numbers.

    A short tail chunk at the end of a page is prepended to the next page's
    tokens instead of being emitted, so chunks stay close to `word_length`
    words; the final page's tail is emitted as-is.

    Args:
        texts: list of page strings (one entry per page).
        word_length: target number of words per chunk.
        start_page: page number assigned to the first entry of `texts`.

    Returns:
        List of strings shaped like '[Page no. N] "chunk text"'.
    """
    pages = [page_text.split(' ') for page_text in texts]
    chunks = []
    for page_idx, tokens in enumerate(pages):
        for offset in range(0, len(tokens), word_length):
            piece = tokens[offset:offset + word_length]
            is_tail = (offset + word_length) > len(tokens)
            is_short = len(piece) < word_length
            has_next_page = len(pages) != (page_idx + 1)
            if is_tail and is_short and has_next_page:
                # Carry the short tail into the next page's token list; the
                # enumerate above will see the mutated entry when it gets there.
                pages[page_idx + 1] = piece + pages[page_idx + 1]
                continue
            body = ' '.join(piece).strip()
            chunks.append(f'[Page no. {page_idx + start_page}] "{body}"')
    return chunks
58
 
 
59
  class SemanticSearch:
60
+
61
def __init__(self):
    """Load the Universal Sentence Encoder and mark the index as not yet fitted."""
    self.fitted = False
    # Downloads/caches the TF-Hub model on first use.
    self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
64
+
 
65
  def fit(self, data, batch=1000, n_neighbors=5):
66
  self.data = data
67
  self.embeddings = self.get_text_embedding(data, batch=batch)
 
69
  self.nn = NearestNeighbors(n_neighbors=n_neighbors)
70
  self.nn.fit(self.embeddings)
71
  self.fitted = True
72
+
 
73
def __call__(self, text, return_data=True):
    """Return the nearest stored chunks (or their indices) for a query string.

    Args:
        text: the query to embed and search for.
        return_data: when True return the matching chunk strings,
            otherwise return the raw neighbor indices.
    """
    query_embedding = self.use([text])
    neighbor_ids = self.nn.kneighbors(query_embedding, return_distance=False)[0]
    if not return_data:
        return neighbor_ids
    return [self.data[idx] for idx in neighbor_ids]
81
+
 
82
def get_text_embedding(self, texts, batch=1000):
    """Embed `texts` in batches of `batch` and stack the results into one array."""
    batched_embeddings = [
        self.use(texts[start:start + batch])
        for start in range(0, len(texts), batch)
    ]
    return np.vstack(batched_embeddings)
90
 
91
+ recommender = SemanticSearch()
92
 
93
def load_recommender(path, start_page=1):
    """(Re)build the module-level semantic index from the PDF at `path`.

    Extracts page texts, chunks them with page-number tags, and fits the
    shared `recommender` instance on the result.
    """
    page_texts = pdf_to_text(path, start_page=start_page)
    recommender.fit(text_to_chunks(page_texts, start_page=start_page))
    return 'Corpus Loaded.'
98
 
99
+ def generate_text(prompt, model="gpt-3.5-turbo"):
100
  openai.api_key = openAI_key
101
+ temperature = 0.7
102
+ max_tokens = 256
103
+ top_p = 1
104
+ frequency_penalty = 0
105
+ presence_penalty = 0
106
 
107
  if model == "text-davinci-003":
108
  completions = openai.Completion.create(
 
130
  ).choices[0].message['content']
131
  return message
132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  def question_answer(chat_history, url, file, question, model):
134
  try:
135
+ if openAI_key.strip() == '':
136
  return '[ERROR]: Please enter your Open AI Key. Get your key here : https://platform.openai.com/account/api-keys'
 
 
137
  if url.strip() != '' and file is not None:
138
  return '[ERROR]: Both URL and PDF is provided. Please provide only one (either URL or PDF).'
139
+ if model is None or model == '':
140
  return '[ERROR]: You have not selected any model. Please choose an LLM model.'
141
  if url.strip() != '':
142
+ download_pdf(url, 'corpus.pdf')
 
143
  load_recommender('corpus.pdf')
144
  else:
145
  old_file_name = file.name
 
149
  load_recommender(file_name)
150
  if question.strip() == '':
151
  return '[ERROR]: Question field is empty'
152
+ answer = generate_text(question, model)
 
 
 
153
  chat_history.append([question, answer])
154
  return chat_history
155
  except openai.error.InvalidRequestError as e:
156
  return f'[ERROR]: Either you do not have access to GPT4 or you have exhausted your quota!'
157
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  title = 'PDF GPT Turbo'
159
  description = """ PDF GPT Turbo allows you to chat with your PDF files. It uses Google's Universal Sentence Encoder with Deep averaging network (DAN) to give hallucination free response by improving the embedding quality of OpenAI. It cites the page number in square brackets([Page No.]) and shows where the information is located, adding credibility to the responses."""
160
 
 
161
  with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 1200; }""") as demo:
 
162
  gr.Markdown(f'<center><h3>{title}</h3></center>')
163
  gr.Markdown(description)
164
 
165
  with gr.Row():
166
  with gr.Group():
 
167
  url = gr.Textbox(label='Enter PDF URL here (Example: https://arxiv.org/pdf/1706.03762.pdf )')
168
  gr.Markdown("<center><h4>OR<h4></center>")
169
  file = gr.File(label='Upload your PDF/ Research Paper / Book here', file_types=['.pdf'])
170
  question = gr.Textbox(label='Enter your question here')
171
  model = gr.Radio([
172
+ 'gpt-3.5-turbo',
173
+ 'gpt-3.5-turbo-16k',
174
+ 'gpt-3.5-turbo-0613',
175
+ 'gpt-3.5-turbo-16k-0613',
176
  'text-davinci-003',
177
  'gpt-4',
178
  'gpt-4-32k'
 
183
  with gr.Group():
184
  chatbot = gr.Chatbot(placeholder="Chat History", label="Chat History", lines=50, elem_id="chatbot")
185
 
 
186
  btn.click(
187
  question_answer,
188
  inputs=[chatbot, url, file, question, model],
 
190
  )
191
 
192
  demo.launch()