thomasmz1 committed on
Commit
45fcfc3
1 Parent(s): b454c71

Implementation of multi-pdf chat

Browse files

I have added a simple implementation of multi-PDF chat by making it possible to upload multiple PDFs via file upload. It is not yet possible to use multiple URLs.

Changes
- Making sure chunks get added from the text of all files
- Added a [file number] parameter to each chunk
- Adapted the prompts to also cite the file number

Files changed (1) hide show
  1. app.py +33 -26
app.py CHANGED
@@ -18,6 +18,7 @@ def preprocess(text):
18
  return text
19
 
20
 
 
21
  def pdf_to_text(path, start_page=1, end_page=None):
22
  doc = fitz.open(path)
23
  total_pages = doc.page_count
@@ -35,8 +36,8 @@ def pdf_to_text(path, start_page=1, end_page=None):
35
  doc.close()
36
  return text_list
37
 
38
-
39
- def text_to_chunks(texts, word_length=150, start_page=1):
40
  text_toks = [t.split(' ') for t in texts]
41
  page_nums = []
42
  chunks = []
@@ -49,7 +50,7 @@ def text_to_chunks(texts, word_length=150, start_page=1):
49
  text_toks[idx+1] = chunk + text_toks[idx+1]
50
  continue
51
  chunk = ' '.join(chunk).strip()
52
- chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
53
  chunks.append(chunk)
54
  return chunks
55
 
@@ -91,10 +92,12 @@ class SemanticSearch:
91
 
92
 
93
 
94
- def load_recommender(path, start_page=1):
95
  global recommender
96
- texts = pdf_to_text(path, start_page=start_page)
97
- chunks = text_to_chunks(texts, start_page=start_page)
 
 
98
  recommender.fit(chunks)
99
  return 'Corpus Loaded.'
100
 
@@ -140,7 +143,7 @@ def generate_answer(question, openAI_key, model):
140
  prompt += c + '\n\n'
141
 
142
  prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
143
- "Cite each reference using [ Page Number] notation. "\
144
  "Only answer what is asked. The answer should be short and concise. \n\nQuery: "
145
 
146
  prompt += f"{question}\nAnswer:"
@@ -148,13 +151,15 @@ def generate_answer(question, openAI_key, model):
148
  return answer
149
 
150
 
151
- def question_answer(chat_history, url, file, question, openAI_key, model):
152
  try:
 
 
153
  if openAI_key.strip()=='':
154
  return '[ERROR]: Please enter your Open AI Key. Get your key here : https://platform.openai.com/account/api-keys'
155
- if url.strip() == '' and file is None:
156
  return '[ERROR]: Both URL and PDF is empty. Provide at least one.'
157
- if url.strip() != '' and file is not None:
158
  return '[ERROR]: Both URL and PDF is provided. Please provide only one (either URL or PDF).'
159
  if model is None or model =='':
160
  return '[ERROR]: You have not selected any model. Please choose an LLM model.'
@@ -163,11 +168,16 @@ def question_answer(chat_history, url, file, question, openAI_key, model):
163
  download_pdf(glob_url, 'corpus.pdf')
164
  load_recommender('corpus.pdf')
165
  else:
166
- old_file_name = file.name
167
- file_name = file.name
168
- file_name = file_name[:-12] + file_name[-4:]
169
- os.rename(old_file_name, file_name)
170
- load_recommender(file_name)
 
 
 
 
 
171
  if question.strip() == '':
172
  return '[ERROR]: Question field is empty'
173
  if model == "text-davinci-003" or model == "gpt-4" or model == "gpt-4-32k":
@@ -203,7 +213,7 @@ def generate_answer_text_davinci_003(question,openAI_key):
203
  prompt += c + '\n\n'
204
 
205
  prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
206
- "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "\
207
  "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
208
  "with the same name, create separate answers for each. Only include information found in the results and "\
209
  "don't add any additional information. Make sure the answer is correct and don't output false content. "\
@@ -212,6 +222,7 @@ def generate_answer_text_davinci_003(question,openAI_key):
212
  "answer should be short and concise. \n\nQuery: {question}\nAnswer: "
213
 
214
  prompt += f"Query: {question}\nAnswer:"
 
215
  answer = generate_text_text_davinci_003(openAI_key, prompt,"text-davinci-003")
216
  return answer
217
 
@@ -242,15 +253,14 @@ with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 1200; }""") as dem
242
  gr.Markdown(f'<center><h3>{title}</h3></center>')
243
  gr.Markdown(description)
244
 
245
- with gr.Row():
246
-
247
  with gr.Group():
248
  gr.Markdown(f'<p style="text-align:center">Get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a></p>')
249
  with gr.Accordion("API Key"):
250
  openAI_key = gr.Textbox(label='Enter your OpenAI API key here', password=True)
251
- url = gr.Textbox(label='Enter PDF URL here (Example: https://arxiv.org/pdf/1706.03762.pdf )')
252
  gr.Markdown("<center><h4>OR<h4></center>")
253
- file = gr.File(label='Upload your PDF/ Research Paper / Book here', file_types=['.pdf'])
254
  question = gr.Textbox(label='Enter your question here')
255
  gr.Examples(
256
  [[q] for q in questions],
@@ -273,15 +283,12 @@ with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 1200; }""") as dem
273
  with gr.Group():
274
  chatbot = gr.Chatbot(placeholder="Chat History", label="Chat History", lines=50, elem_id="chatbot")
275
 
276
-
277
- #
278
  # Bind the click event of the button to the question_answer function
279
  btn.click(
280
  question_answer,
281
- inputs=[chatbot, url, file, question, openAI_key, model],
282
  outputs=[chatbot],
283
  )
284
 
285
- demo.launch()
286
-
287
-
 
18
  return text
19
 
20
 
21
+ # converts pdf to text
22
  def pdf_to_text(path, start_page=1, end_page=None):
23
  doc = fitz.open(path)
24
  total_pages = doc.page_count
 
36
  doc.close()
37
  return text_list
38
 
39
+ # one text converts a list of chunks
40
+ def text_to_chunks(texts, word_length=150, start_page=1, file_number=1):
41
  text_toks = [t.split(' ') for t in texts]
42
  page_nums = []
43
  chunks = []
 
50
  text_toks[idx+1] = chunk + text_toks[idx+1]
51
  continue
52
  chunk = ' '.join(chunk).strip()
53
+ chunk = f'[File no. {file_number}] [Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
54
  chunks.append(chunk)
55
  return chunks
56
 
 
92
 
93
 
94
 
95
+ def load_recommender(paths, start_page=1):
96
  global recommender
97
+ texts = []
98
+ chunks = []
99
+ for idx, path in enumerate(paths):
100
+ chunks += text_to_chunks(pdf_to_text(path, start_page=start_page), start_page=start_page, file_number=idx+1)
101
  recommender.fit(chunks)
102
  return 'Corpus Loaded.'
103
 
 
143
  prompt += c + '\n\n'
144
 
145
  prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
146
+ "Cite each reference using [File number][ Page Number] notation. "\
147
  "Only answer what is asked. The answer should be short and concise. \n\nQuery: "
148
 
149
  prompt += f"{question}\nAnswer:"
 
151
  return answer
152
 
153
 
154
+ def question_answer(chat_history, url, files, question, openAI_key, model):
155
  try:
156
+ if files == None:
157
+ files = []
158
  if openAI_key.strip()=='':
159
  return '[ERROR]: Please enter your Open AI Key. Get your key here : https://platform.openai.com/account/api-keys'
160
+ if url.strip() == '' and files == []:
161
  return '[ERROR]: Both URL and PDF is empty. Provide at least one.'
162
+ if url.strip() != '' and files is not []:
163
  return '[ERROR]: Both URL and PDF is provided. Please provide only one (either URL or PDF).'
164
  if model is None or model =='':
165
  return '[ERROR]: You have not selected any model. Please choose an LLM model.'
 
168
  download_pdf(glob_url, 'corpus.pdf')
169
  load_recommender('corpus.pdf')
170
  else:
171
+ filenames = []
172
+ for file in files:
173
+ old_file_name = file.name
174
+ file_name = file.name
175
+ file_name = file_name[:-12] + file_name[-4:]
176
+ os.rename(old_file_name, file_name)
177
+ filenames.append(file_name)
178
+ load_recommender(filenames)
179
+
180
+
181
  if question.strip() == '':
182
  return '[ERROR]: Question field is empty'
183
  if model == "text-davinci-003" or model == "gpt-4" or model == "gpt-4-32k":
 
213
  prompt += c + '\n\n'
214
 
215
  prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
216
+ "Cite each reference using [File number] [ Page Number] notation (every result has this number at the beginning). "\
217
  "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
218
  "with the same name, create separate answers for each. Only include information found in the results and "\
219
  "don't add any additional information. Make sure the answer is correct and don't output false content. "\
 
222
  "answer should be short and concise. \n\nQuery: {question}\nAnswer: "
223
 
224
  prompt += f"Query: {question}\nAnswer:"
225
+ # print("prompt == " + str(prompt))
226
  answer = generate_text_text_davinci_003(openAI_key, prompt,"text-davinci-003")
227
  return answer
228
 
 
253
  gr.Markdown(f'<center><h3>{title}</h3></center>')
254
  gr.Markdown(description)
255
 
256
+ with gr.Row():
 
257
  with gr.Group():
258
  gr.Markdown(f'<p style="text-align:center">Get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a></p>')
259
  with gr.Accordion("API Key"):
260
  openAI_key = gr.Textbox(label='Enter your OpenAI API key here', password=True)
261
+ url = gr.Textbox(label='Enter PDF URL here (Example: https://arxiv.org/pdf/1706.03762.pdf )')
262
  gr.Markdown("<center><h4>OR<h4></center>")
263
+ files = gr.File(label='Upload your PDF/ Research Paper / Book here', file_types=['.pdf'], file_count="multiple")
264
  question = gr.Textbox(label='Enter your question here')
265
  gr.Examples(
266
  [[q] for q in questions],
 
283
  with gr.Group():
284
  chatbot = gr.Chatbot(placeholder="Chat History", label="Chat History", lines=50, elem_id="chatbot")
285
 
286
+
 
287
  # Bind the click event of the button to the question_answer function
288
  btn.click(
289
  question_answer,
290
+ inputs=[chatbot, url, files, question, openAI_key, model],
291
  outputs=[chatbot],
292
  )
293
 
294
+ demo.launch()