th8m0z committed on
Commit
0826ebe
1 Parent(s): 47d3f11

working with multiple files + better prompt

Browse files
.gitignore CHANGED
@@ -1 +1,2 @@
1
  venv/
 
 
1
  venv/
2
+ Universal Sentence Encoder/
__pycache__/api.cpython-311.pyc ADDED
Binary file (12.4 kB). View file
 
__pycache__/app.cpython-311.pyc ADDED
Binary file (16.1 kB). View file
 
app.py CHANGED
@@ -18,6 +18,7 @@ def preprocess(text):
18
  return text
19
 
20
 
 
21
  def pdf_to_text(path, start_page=1, end_page=None):
22
  doc = fitz.open(path)
23
  total_pages = doc.page_count
@@ -35,10 +36,11 @@ def pdf_to_text(path, start_page=1, end_page=None):
35
  doc.close()
36
  return text_list
37
 
 
 
38
 
39
- def text_to_chunks(texts, word_length=150, start_page=1):
40
- text_toks = [t.split(' ') for t in texts]
41
- page_nums = []
42
  chunks = []
43
 
44
  for idx, words in enumerate(text_toks):
@@ -49,15 +51,16 @@ def text_to_chunks(texts, word_length=150, start_page=1):
49
  text_toks[idx+1] = chunk + text_toks[idx+1]
50
  continue
51
  chunk = ' '.join(chunk).strip()
52
- chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
53
  chunks.append(chunk)
 
54
  return chunks
55
 
56
 
57
  class SemanticSearch:
58
 
59
  def __init__(self):
60
- self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
61
  self.fitted = False
62
 
63
 
@@ -91,10 +94,13 @@ class SemanticSearch:
91
 
92
 
93
 
94
- def load_recommender(path, start_page=1):
95
  global recommender
96
- texts = pdf_to_text(path, start_page=start_page)
97
- chunks = text_to_chunks(texts, start_page=start_page)
 
 
 
98
  recommender.fit(chunks)
99
  return 'Corpus Loaded.'
100
 
@@ -148,13 +154,15 @@ def generate_answer(question, openAI_key, model):
148
  return answer
149
 
150
 
151
- def question_answer(chat_history, url, file, question, openAI_key, model):
152
  try:
 
 
153
  if openAI_key.strip()=='':
154
  return '[ERROR]: Please enter your Open AI Key. Get your key here : https://platform.openai.com/account/api-keys'
155
- if url.strip() == '' and file is None:
156
  return '[ERROR]: Both URL and PDF is empty. Provide at least one.'
157
- if url.strip() != '' and file is not None:
158
  return '[ERROR]: Both URL and PDF is provided. Please provide only one (either URL or PDF).'
159
  if model is None or model =='':
160
  return '[ERROR]: You have not selected any model. Please choose an LLM model.'
@@ -163,11 +171,17 @@ def question_answer(chat_history, url, file, question, openAI_key, model):
163
  download_pdf(glob_url, 'corpus.pdf')
164
  load_recommender('corpus.pdf')
165
  else:
166
- old_file_name = file.name
167
- file_name = file.name
168
- file_name = file_name[:-12] + file_name[-4:]
169
- os.rename(old_file_name, file_name)
170
- load_recommender(file_name)
 
 
 
 
 
 
171
  if question.strip() == '':
172
  return '[ERROR]: Question field is empty'
173
  if model == "text-davinci-003" or model == "gpt-4" or model == "gpt-4-32k":
@@ -197,21 +211,24 @@ def generate_text_text_davinci_003(openAI_key,prompt, engine="text-davinci-003")
197
 
198
  def generate_answer_text_davinci_003(question,openAI_key):
199
  topn_chunks = recommender(question)
 
200
  prompt = ""
201
  prompt += 'search results:\n\n'
202
  for c in topn_chunks:
203
  prompt += c + '\n\n'
204
 
205
  prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
206
- "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "\
207
  "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
208
  "with the same name, create separate answers for each. Only include information found in the results and "\
209
  "don't add any additional information. Make sure the answer is correct and don't output false content. "\
210
  "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "\
211
  "search results which has nothing to do with the question. Only answer what is asked. The "\
212
- "answer should be short and concise. \n\nQuery: {question}\nAnswer: "
213
 
214
  prompt += f"Query: {question}\nAnswer:"
 
 
215
  answer = generate_text_text_davinci_003(openAI_key, prompt,"text-davinci-003")
216
  return answer
217
 
@@ -248,9 +265,9 @@ with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 1200; }""") as dem
248
  gr.Markdown(f'<p style="text-align:center">Get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a></p>')
249
  with gr.Accordion("API Key"):
250
  openAI_key = gr.Textbox(label='Enter your OpenAI API key here', password=True)
251
- url = gr.Textbox(label='Enter PDF URL here (Example: https://arxiv.org/pdf/1706.03762.pdf )')
252
  gr.Markdown("<center><h4>OR<h4></center>")
253
- file = gr.File(label='Upload your PDF/ Research Paper / Book here', file_types=['.pdf'])
254
  question = gr.Textbox(label='Enter your question here')
255
  gr.Examples(
256
  [[q] for q in questions],
@@ -274,11 +291,11 @@ with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 1200; }""") as dem
274
  chatbot = gr.Chatbot(placeholder="Chat History", label="Chat History", lines=50, elem_id="chatbot")
275
 
276
 
277
- #
278
  # Bind the click event of the button to the question_answer function
279
  btn.click(
280
  question_answer,
281
- inputs=[chatbot, url, file, question, openAI_key, model],
282
  outputs=[chatbot],
283
  )
284
 
 
18
  return text
19
 
20
 
21
+ # converts pdf to text
22
  def pdf_to_text(path, start_page=1, end_page=None):
23
  doc = fitz.open(path)
24
  total_pages = doc.page_count
 
36
  doc.close()
37
  return text_list
38
 
39
+ # converts a list of texts into a list of word chunks, each tagged with its PDF number and page number
40
+ def text_to_chunks(texts, word_length=150, start_page=1, file_number=1):
41
 
42
+ filtered_texts = [''.join(char for char in text if ord(char) < 128) for text in texts]
43
+ text_toks = [t.split(' ') for t in filtered_texts]
 
44
  chunks = []
45
 
46
  for idx, words in enumerate(text_toks):
 
51
  text_toks[idx+1] = chunk + text_toks[idx+1]
52
  continue
53
  chunk = ' '.join(chunk).strip()
54
+ chunk = f'[PDF no. {file_number}] [Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
55
  chunks.append(chunk)
56
+ # print("chunks == " + str(chunks))
57
  return chunks
58
 
59
 
60
  class SemanticSearch:
61
 
62
  def __init__(self):
63
+ self.use = hub.load('./Universal Sentence Encoder/')
64
  self.fitted = False
65
 
66
 
 
94
 
95
 
96
 
97
+ def load_recommender(paths, start_page=1):
98
  global recommender
99
+ texts = []
100
+ chunks = []
101
+ for idx, path in enumerate(paths):
102
+ chunks += text_to_chunks(pdf_to_text(path, start_page=start_page), start_page=start_page, file_number=idx+1)
103
+ # print("chunks == " + str(chunks))
104
  recommender.fit(chunks)
105
  return 'Corpus Loaded.'
106
 
 
154
  return answer
155
 
156
 
157
+ def question_answer(chat_history, url, files, question, openAI_key, model):
158
  try:
159
+ if files == None:
160
+ files = []
161
  if openAI_key.strip()=='':
162
  return '[ERROR]: Please enter your Open AI Key. Get your key here : https://platform.openai.com/account/api-keys'
163
+ if url.strip() == '' and files == []:
164
  return '[ERROR]: Both URL and PDF is empty. Provide at least one.'
165
+ if url.strip() != '' and files is not []:
166
  return '[ERROR]: Both URL and PDF is provided. Please provide only one (either URL or PDF).'
167
  if model is None or model =='':
168
  return '[ERROR]: You have not selected any model. Please choose an LLM model.'
 
171
  download_pdf(glob_url, 'corpus.pdf')
172
  load_recommender('corpus.pdf')
173
  else:
174
+ print(files)
175
+ filenames = []
176
+ for file in files:
177
+ old_file_name = file.name
178
+ file_name = file.name
179
+ file_name = file_name[:-12] + file_name[-4:]
180
+ os.rename(old_file_name, file_name)
181
+ filenames.append(file_name)
182
+ load_recommender(filenames)
183
+
184
+
185
  if question.strip() == '':
186
  return '[ERROR]: Question field is empty'
187
  if model == "text-davinci-003" or model == "gpt-4" or model == "gpt-4-32k":
 
211
 
212
  def generate_answer_text_davinci_003(question,openAI_key):
213
  topn_chunks = recommender(question)
214
+ # print("topn chunks == " + str(topn_chunks))
215
  prompt = ""
216
  prompt += 'search results:\n\n'
217
  for c in topn_chunks:
218
  prompt += c + '\n\n'
219
 
220
  prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
221
+ "Cite each reference using [PDF Number][Page Number] notation (every result has this number at the beginning). "\
222
  "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
223
  "with the same name, create separate answers for each. Only include information found in the results and "\
224
  "don't add any additional information. Make sure the answer is correct and don't output false content. "\
225
  "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "\
226
  "search results which has nothing to do with the question. Only answer what is asked. The "\
227
+ "answer should be short and concise.\n\n"
228
 
229
  prompt += f"Query: {question}\nAnswer:"
230
+ print("prompt == " + str(prompt))
231
+ # print("prompt == " + str(prompt))
232
  answer = generate_text_text_davinci_003(openAI_key, prompt,"text-davinci-003")
233
  return answer
234
 
 
265
  gr.Markdown(f'<p style="text-align:center">Get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a></p>')
266
  with gr.Accordion("API Key"):
267
  openAI_key = gr.Textbox(label='Enter your OpenAI API key here', password=True)
268
+ url = gr.Textbox(label='Enter PDF URL here (Example: https://arxiv.org/pdf/1706.03762.pdf )')
269
  gr.Markdown("<center><h4>OR<h4></center>")
270
+ files = gr.File(label='Upload your PDF/ Research Paper / Book here', file_types=['.pdf'], file_count="multiple")
271
  question = gr.Textbox(label='Enter your question here')
272
  gr.Examples(
273
  [[q] for q in questions],
 
291
  chatbot = gr.Chatbot(placeholder="Chat History", label="Chat History", lines=50, elem_id="chatbot")
292
 
293
 
294
+
295
  # Bind the click event of the button to the question_answer function
296
  btn.click(
297
  question_answer,
298
+ inputs=[chatbot, url, files, question, openAI_key, model],
299
  outputs=[chatbot],
300
  )
301