bhaskartripathi committed on
Commit
49182e8
·
1 Parent(s): 93c2f3b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +186 -65
app.py CHANGED
@@ -1,34 +1,109 @@
 
 
1
  import urllib.request
2
  import fitz
3
  import re
4
  import numpy as np
5
  import tensorflow_hub as hub
6
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
7
- import torch
8
- import gradio as gr
9
- import os
10
  from sklearn.neighbors import NearestNeighbors
 
 
 
11
 
12
- model_name = "tiiuae/falcon-40b-instruct"
13
-
14
- tokenizer = AutoTokenizer.from_pretrained(model_name)
15
- text_gen = pipeline(
16
  "text-generation",
17
- model=model_name,
18
  tokenizer=tokenizer,
19
  torch_dtype=torch.bfloat16,
 
20
  device_map="auto",
21
  )
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  def download_pdf(url, output_path):
 
24
  urllib.request.urlretrieve(url, output_path)
25
 
26
  def preprocess(text):
 
27
  text = text.replace('\n', ' ')
28
  text = re.sub('\s+', ' ', text)
29
  return text
30
 
31
  def pdf_to_text(path, start_page=1, end_page=None):
 
32
  doc = fitz.open(path)
33
  total_pages = doc.page_count
34
 
@@ -46,14 +121,14 @@ def pdf_to_text(path, start_page=1, end_page=None):
46
  return text_list
47
 
48
  def text_to_chunks(texts, word_length=150, start_page=1):
 
49
  text_toks = [t.split(' ') for t in texts]
50
  chunks = []
51
-
52
  for idx, words in enumerate(text_toks):
53
  for i in range(0, len(words), word_length):
54
  chunk = words[i:i+word_length]
55
- if (i+word_length) > len(words) and (len(chunk) < word_length) and (
56
- len(text_toks) != (idx+1)):
57
  text_toks[idx+1] = chunk + text_toks[idx+1]
58
  continue
59
  chunk = ' '.join(chunk).strip()
@@ -62,28 +137,32 @@ def text_to_chunks(texts, word_length=150, start_page=1):
62
  return chunks
63
 
64
  class SemanticSearch:
 
65
  def __init__(self):
66
  self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
67
  self.fitted = False
68
-
69
  def fit(self, data, batch=1000, n_neighbors=5):
 
70
  self.data = data
71
  self.embeddings = self.get_text_embedding(data, batch=batch)
72
  n_neighbors = min(n_neighbors, len(self.embeddings))
73
  self.nn = NearestNeighbors(n_neighbors=n_neighbors)
74
  self.nn.fit(self.embeddings)
75
  self.fitted = True
76
-
77
  def __call__(self, text, return_data=True):
 
78
  inp_emb = self.use([text])
79
  neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
80
-
81
  if return_data:
82
  return [self.data[i] for i in neighbors]
83
  else:
84
  return neighbors
85
-
86
  def get_text_embedding(self, texts, batch=1000):
 
87
  embeddings = []
88
  for i in range(0, len(texts), batch):
89
  text_batch = texts[i:(i+batch)]
@@ -93,23 +172,13 @@ class SemanticSearch:
93
  return embeddings
94
 
95
  def load_recommender(path, start_page=1):
 
96
  global recommender
97
  texts = pdf_to_text(path, start_page=start_page)
98
  chunks = text_to_chunks(texts, start_page=start_page)
99
  recommender.fit(chunks)
100
  return 'Corpus Loaded.'
101
 
102
- def generate_text(prompt, max_length=512):
103
- sequences = text_gen(
104
- prompt,
105
- max_length=max_length,
106
- do_sample=True,
107
- top_k=10,
108
- num_return_sequences=1,
109
- eos_token_id=tokenizer.eos_token_id,
110
- )
111
- message = sequences[0]['generated_text']
112
- return message
113
 
114
  def generate_answer(question):
115
  topn_chunks = recommender(question)
@@ -118,49 +187,88 @@ def generate_answer(question):
118
  for c in topn_chunks:
119
  prompt += c + '\n\n'
120
 
121
- prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
122
- "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "\
123
- "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
124
- "with the same name, create separate answers for each. Only include information found in the results and "\
125
- "don't add any additional information. Make sure the answer is correct and don't output false content. "\
126
- "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "\
127
- "search results which has nothing to do with the question. Only answer what is asked. The "\
128
- "answer should be short and concise. \n\nQuery: {question}\nAnswer: "
129
-
130
  prompt += f"Query: {question}\nAnswer:"
131
- answer = generate_text(prompt, 512)
132
- return answer
133
-
134
- def question_answer(url, file, question):
135
- if url.strip() == '' and file == None:
136
- return '[ERROR]: Both URL and PDF is empty. Provide at least one.'
137
 
138
- if url.strip() != '' and file != None:
139
- return '[ERROR]: Both URL and PDF is provided. Please provide only one (either URL or PDF).'
140
-
141
- if url.strip() != '':
142
- glob_url = url
143
- download_pdf(glob_url, 'corpus.pdf')
144
- load_recommender('corpus.pdf')
 
145
 
146
- else:
147
- old_file_name = file.name
148
- file_name = file.name
149
- file_name = file_name[:-12] + file_name[-4:]
150
- os.rename(old_file_name, file_name)
151
- load_recommender(file_name)
152
 
153
- if question.strip() == '':
154
- return '[ERROR]: Question field is empty'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
- return generate_answer(question)
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
- recommender = SemanticSearch()
159
 
160
- title = 'PDF GPT'
161
- description = """ PDF GPT allows you to chat with your PDF file using Universal Sentence Encoder and Falcon. It gives hallucination free response than other tools as the embeddings are better than GPT-3. The returned response can even cite the page number in square brackets([]) where the information is located, adding credibility to the responses and helping to locate pertinent information quickly."""
162
 
163
- with gr.Blocks() as demo:
164
 
165
  gr.Markdown(f'<center><h1>{title}</h1></center>')
166
  gr.Markdown(description)
@@ -168,16 +276,29 @@ with gr.Blocks() as demo:
168
  with gr.Row():
169
 
170
  with gr.Group():
171
- url = gr.Textbox(label='Enter PDF URL here')
172
  gr.Markdown("<center><h4>OR<h4></center>")
173
  file = gr.File(label='Upload your PDF/ Research Paper / Book here', file_types=['.pdf'])
174
  question = gr.Textbox(label='Enter your question here')
 
 
 
 
 
175
  btn = gr.Button(value='Submit')
 
 
176
  btn.style(full_width=True)
177
 
178
  with gr.Group():
179
- answer = gr.Textbox(label='The answer to your question is :')
 
180
 
181
- btn.click(question_answer, inputs=[url, file, question], outputs=[answer])
 
 
 
 
 
182
 
183
  demo.launch()
 
1
+ # Import required modules
2
+ import gradio as gr
3
  import urllib.request
4
  import fitz
5
  import re
6
  import numpy as np
7
  import tensorflow_hub as hub
 
 
 
 
8
  from sklearn.neighbors import NearestNeighbors
9
+ from transformers import AutoTokenizer, AutoModelForCausalLM
10
+ import transformers
11
+ import torch
12
 
13
+ # Load the Falcon model
14
+ model = "tiiuae/falcon-40b-instruct"
15
+ tokenizer = AutoTokenizer.from_pretrained(model)
16
+ pipeline = transformers.pipeline(
17
  "text-generation",
18
+ model=model,
19
  tokenizer=tokenizer,
20
  torch_dtype=torch.bfloat16,
21
+ trust_remote_code=True,
22
  device_map="auto",
23
  )
24
 
25
+ # Load the PDF-GPT model
26
+ recommender = SemanticSearch()
27
+
28
+ # Define chat function
29
+ def chat():
30
+ with gr.Interface(
31
+ question_answer,
32
+ [
33
+ gr.inputs.Textbox(placeholder="Chat History", type="text", label="Chat History", lines=20),
34
+ gr.inputs.Textbox(placeholder="Enter PDF URL here", type="text", label="URL"),
35
+ gr.inputs.File(label="Or upload your PDF here"),
36
+ gr.inputs.Textbox(placeholder="Enter your question here", type="text", label="Question"),
37
+ ],
38
+ gr.outputs.Textbox(placeholder="Chat History", type="text", label="Chat History", lines=20),
39
+ title="Falcon-PDF Chatbot",
40
+ description="A chatbot that can read and answer questions about a PDF document using the Falcon model",
41
+ layout="vertical",
42
+ ) as interface:
43
+ with gr.Row():
44
+ chatbot = gr.Chatbot(placeholder="Chat History", lines=20)
45
+ with gr.Row():
46
+ inputs = gr.Textbox(placeholder="Hello Falcon !!", label="Type an input and press Enter", max_lines=3)
47
+ url = gr.Textbox(placeholder="Enter PDF URL here", label="URL")
48
+ file = gr.File(label="Or upload your PDF here")
49
+ question = gr.Textbox(placeholder="Enter your question here", label="Question")
50
+ chat_button = gr.Button(label="Chat")
51
+
52
+ chat_button.on_click(question_answer, [chatbot, url, file, question])
53
+
54
+ with gr.Row():
55
+ retry_button = gr.Button("♻️ Retry last turn")
56
+ delete_turn_button = gr.Button("🧽 Delete last turn")
57
+ clear_chat_button = gr.Button("✨ Delete all history")
58
+
59
+ retry_button.on_click(retry_last_turn, [chatbot])
60
+ delete_turn_button.on_click(delete_last_turn, [chatbot])
61
+ clear_chat_button.on_click(clear_chat_history, [chatbot])
62
+
63
+ # Launch the Gradio interface
64
+ interface.launch()
65
+
66
+ def retry_last_turn(chat_history):
67
+ """Handles retrying the last turn."""
68
+ if len(chat_history) > 0:
69
+ # Get the last question from the chat history
70
+ last_question = chat_history[-1][0]
71
+ # Remove the last turn from the chat history
72
+ chat_history = chat_history[:-1]
73
+ # Retry the last question
74
+ question_answer(chat_history, last_question)
75
+ else:
76
+ print("Chat history is empty.")
77
+ return chat_history
78
+
79
+ def delete_last_turn(chat_history):
80
+ """Handles deleting the last turn."""
81
+ if len(chat_history) > 0:
82
+ # Remove the last turn from the chat history
83
+ chat_history = chat_history[:-1]
84
+ else:
85
+ print("Chat history is empty.")
86
+ return chat_history
87
+
88
+ def clear_chat_history(chat_history):
89
+ """Handles clearing the chat history."""
90
+ # Clear the chat history
91
+ chat_history = []
92
+ return chat_history
93
+
94
+
95
  def download_pdf(url, output_path):
96
+ """Download a PDF from a URL and save it to the specified output path."""
97
  urllib.request.urlretrieve(url, output_path)
98
 
99
  def preprocess(text):
100
+ """Preprocess a text by replacing newline characters with spaces and reducing multiple spaces to single spaces."""
101
  text = text.replace('\n', ' ')
102
  text = re.sub('\s+', ' ', text)
103
  return text
104
 
105
  def pdf_to_text(path, start_page=1, end_page=None):
106
+ """Extract text from a PDF file from the specified start page to the end page."""
107
  doc = fitz.open(path)
108
  total_pages = doc.page_count
109
 
 
121
  return text_list
122
 
123
  def text_to_chunks(texts, word_length=150, start_page=1):
124
+ """Split a list of texts into chunks with the specified word length."""
125
  text_toks = [t.split(' ') for t in texts]
126
  chunks = []
127
+
128
  for idx, words in enumerate(text_toks):
129
  for i in range(0, len(words), word_length):
130
  chunk = words[i:i+word_length]
131
+ if (i+word_length) > len(words) and (len(chunk) < word_length) and (len(text_toks) != (idx+1)):
 
132
  text_toks[idx+1] = chunk + text_toks[idx+1]
133
  continue
134
  chunk = ' '.join(chunk).strip()
 
137
  return chunks
138
 
139
  class SemanticSearch:
140
+ """A class for performing semantic search using the Universal Sentence Encoder."""
141
  def __init__(self):
142
  self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
143
  self.fitted = False
144
+
145
  def fit(self, data, batch=1000, n_neighbors=5):
146
+ """Fit the model to the data."""
147
  self.data = data
148
  self.embeddings = self.get_text_embedding(data, batch=batch)
149
  n_neighbors = min(n_neighbors, len(self.embeddings))
150
  self.nn = NearestNeighbors(n_neighbors=n_neighbors)
151
  self.nn.fit(self.embeddings)
152
  self.fitted = True
153
+
154
  def __call__(self, text, return_data=True):
155
+ """Find the nearest neighbors to a text."""
156
  inp_emb = self.use([text])
157
  neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
158
+
159
  if return_data:
160
  return [self.data[i] for i in neighbors]
161
  else:
162
  return neighbors
163
+
164
  def get_text_embedding(self, texts, batch=1000):
165
+ """Get the embeddings of a list of texts."""
166
  embeddings = []
167
  for i in range(0, len(texts), batch):
168
  text_batch = texts[i:(i+batch)]
 
172
  return embeddings
173
 
174
  def load_recommender(path, start_page=1):
175
+ """Load a recommender model with a PDF file."""
176
  global recommender
177
  texts = pdf_to_text(path, start_page=start_page)
178
  chunks = text_to_chunks(texts, start_page=start_page)
179
  recommender.fit(chunks)
180
  return 'Corpus Loaded.'
181
 
 
 
 
 
 
 
 
 
 
 
 
182
 
183
  def generate_answer(question):
184
  topn_chunks = recommender(question)
 
187
  for c in topn_chunks:
188
  prompt += c + '\n\n'
189
 
 
 
 
 
 
 
 
 
 
190
  prompt += f"Query: {question}\nAnswer:"
 
 
 
 
 
 
191
 
192
+ sequences = pipeline(
193
+ prompt,
194
+ max_length=200,
195
+ do_sample=True,
196
+ top_k=10,
197
+ num_return_sequences=1,
198
+ eos_token_id=tokenizer.eos_token_id,
199
+ )
200
 
201
+ return sequences[0]['generated_text']
 
 
 
 
 
202
 
203
+ def question_answer(chat_history, url, file, question):
204
+ try:
205
+ if url.strip() == '' and file is None:
206
+ return '[ERROR]: Both URL and PDF is empty. Provide at least one.'
207
+ if url.strip() != '' and file is not None:
208
+ return '[ERROR]: Both URL and PDF is provided. Please provide only one (either URL or PDF).'
209
+ if url.strip() != '':
210
+ glob_url = url
211
+ download_pdf(glob_url, 'corpus.pdf')
212
+ load_recommender('corpus.pdf')
213
+ else:
214
+ old_file_name = file.name
215
+ file_name = file.name
216
+ file_name = file_name[:-12] + file_name[-4:]
217
+ os.rename(old_file_name, file_name)
218
+ load_recommender(file_name)
219
+ if question.strip() == '':
220
+ return '[ERROR]: Question field is empty'
221
+ topn_chunks = recommender(question)
222
+ prompt = ""
223
+ prompt += 'search results:\n\n'
224
+ for c in topn_chunks:
225
+ prompt += c + '\n\n'
226
+
227
+ prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
228
+ "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "\
229
+ "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
230
+ "with the same name, create separate answers for each. Only include information found in the results and "\
231
+ "don't add any additional information. Make sure the answer is correct and don't output false content. "\
232
+ "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "\
233
+ "search results which has nothing to do with the question. Only answer what is asked. The "\
234
+ "answer should be short and concise. \n\nQuery: {question}\nAnswer: "
235
+
236
+ prompt += f"Query: {question}\nAnswer:"
237
+
238
+ sequences = pipeline(
239
+ prompt,
240
+ max_length=200,
241
+ do_sample=True,
242
+ top_k=10,
243
+ num_return_sequences=1,
244
+ eos_token_id=tokenizer.eos_token_id,
245
+ )
246
+ answer = sequences[0]['generated_text']
247
+ chat_history.append([question, answer])
248
+ return chat_history
249
+ except Exception as e:
250
+ return f'[ERROR]: {str(e)}'
251
 
252
+ questions = [
253
+ "What did the study investigate?",
254
+ "Can you provide a summary of this document?",
255
+ "What are the methodologies used in this study?",
256
+ "What are the data intervals used in this study? Give me the start dates and end dates.",
257
+ "What are the main limitations of this study?",
258
+ "What are the main shortcomings of this study?",
259
+ "What are the main findings of the study?",
260
+ "What are the main results of the study?",
261
+ "What are the main contributions of this study?",
262
+ "What is the conclusion of this paper?",
263
+ "What are the input features used in this study?",
264
+ "What is the dependent variable in this study?",
265
+ ]
266
 
 
267
 
268
+ title = 'PDF GPT Turbo'
269
+ description = """ PDF GPT Turbo allows you to chat with your PDF file using Universal Sentence Encoder and Falcon. It gives hallucination free response than other tools as the embeddings are better than OpenAI. The returned response can even cite the page number in square brackets([]) where the information is located, adding credibility to the responses and helping to locate pertinent information quickly."""
270
 
271
+ with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 800px; }""") as demo:
272
 
273
  gr.Markdown(f'<center><h1>{title}</h1></center>')
274
  gr.Markdown(description)
 
276
  with gr.Row():
277
 
278
  with gr.Group():
279
+ url = gr.Textbox(label='Enter PDF URL here (Example: https://arxiv.org/pdf/1706.03762.pdf )')
280
  gr.Markdown("<center><h4>OR<h4></center>")
281
  file = gr.File(label='Upload your PDF/ Research Paper / Book here', file_types=['.pdf'])
282
  question = gr.Textbox(label='Enter your question here')
283
+ gr.Examples(
284
+ [[q] for q in questions],
285
+ inputs=[question],
286
+ label="PRE-DEFINED QUESTIONS: Click on a question to auto-fill the input box, then press Enter!",
287
+ )
288
  btn = gr.Button(value='Submit')
289
+
290
+
291
  btn.style(full_width=True)
292
 
293
  with gr.Group():
294
+ chatbot = gr.Chatbot(placeholder="Chat History", label="Chat History", lines=20, elem_id="chatbot")
295
+
296
 
297
+ # Bind the click event of the button to the question_answer function
298
+ btn.click(
299
+ question_answer,
300
+ inputs=[chatbot, url, file, question],
301
+ outputs=[chatbot],
302
+ )
303
 
304
  demo.launch()