thomasmz1 committed on
Commit
961a746
Parent: b454c71

Refactor project


With this update, the following changes were implemented:

- Refactored the project into multiple files and cleaned up the codebase
- Removed non-ASCII characters from text chunks (see the sketch below)
- Added support for uploading and chatting with multiple PDFs
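
For reference, the non-ASCII stripping now applied in text_to_chunks (functions.py) boils down to a one-line codepoint filter. A minimal standalone sketch of the same idea (the helper name is illustrative; the app inlines this as a comprehension):

def strip_non_ascii(text):
    # keep only codepoints below 128, i.e. plain ASCII
    return ''.join(char for char in text if ord(char) < 128)

print(strip_non_ascii('naïve café'))  # -> 'nave caf'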

Files changed (3)
  1. app.py +6 -225
  2. functions.py +154 -0
  3. semantic_search.py +41 -0
app.py CHANGED
@@ -1,219 +1,5 @@
-import urllib.request
-import fitz
-import re
-import numpy as np
-import tensorflow_hub as hub
-import openai
 import gradio as gr
-import os
-from sklearn.neighbors import NearestNeighbors
-
-def download_pdf(url, output_path):
-    urllib.request.urlretrieve(url, output_path)
-
-
-def preprocess(text):
-    text = text.replace('\n', ' ')
-    text = re.sub('\s+', ' ', text)
-    return text
-
-
-def pdf_to_text(path, start_page=1, end_page=None):
-    doc = fitz.open(path)
-    total_pages = doc.page_count
-
-    if end_page is None:
-        end_page = total_pages
-
-    text_list = []
-
-    for i in range(start_page-1, end_page):
-        text = doc.load_page(i).get_text("text")
-        text = preprocess(text)
-        text_list.append(text)
-
-    doc.close()
-    return text_list
-
-
-def text_to_chunks(texts, word_length=150, start_page=1):
-    text_toks = [t.split(' ') for t in texts]
-    page_nums = []
-    chunks = []
-
-    for idx, words in enumerate(text_toks):
-        for i in range(0, len(words), word_length):
-            chunk = words[i:i+word_length]
-            if (i+word_length) > len(words) and (len(chunk) < word_length) and (
-                len(text_toks) != (idx+1)):
-                text_toks[idx+1] = chunk + text_toks[idx+1]
-                continue
-            chunk = ' '.join(chunk).strip()
-            chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
-            chunks.append(chunk)
-    return chunks
-
-
-class SemanticSearch:
-
-    def __init__(self):
-        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
-        self.fitted = False
-
-
-    def fit(self, data, batch=1000, n_neighbors=5):
-        self.data = data
-        self.embeddings = self.get_text_embedding(data, batch=batch)
-        n_neighbors = min(n_neighbors, len(self.embeddings))
-        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
-        self.nn.fit(self.embeddings)
-        self.fitted = True
-
-
-    def __call__(self, text, return_data=True):
-        inp_emb = self.use([text])
-        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
-
-        if return_data:
-            return [self.data[i] for i in neighbors]
-        else:
-            return neighbors
-
-
-    def get_text_embedding(self, texts, batch=1000):
-        embeddings = []
-        for i in range(0, len(texts), batch):
-            text_batch = texts[i:(i+batch)]
-            emb_batch = self.use(text_batch)
-            embeddings.append(emb_batch)
-        embeddings = np.vstack(embeddings)
-        return embeddings
-
-
-def load_recommender(path, start_page=1):
-    global recommender
-    texts = pdf_to_text(path, start_page=start_page)
-    chunks = text_to_chunks(texts, start_page=start_page)
-    recommender.fit(chunks)
-    return 'Corpus Loaded.'
-
-def generate_text(openAI_key, prompt, model="gpt-3.5-turbo"):
-    openai.api_key = openAI_key
-    temperature=0.7
-    max_tokens=256
-    top_p=1
-    frequency_penalty=0
-    presence_penalty=0
-
-    if model == "text-davinci-003":
-        completions = openai.Completion.create(
-            engine=model,
-            prompt=prompt,
-            max_tokens=max_tokens,
-            n=1,
-            stop=None,
-            temperature=temperature,
-        )
-        message = completions.choices[0].text
-    else:
-        message = openai.ChatCompletion.create(
-            model=model,
-            messages=[
-                {"role": "system", "content": "You are a helpful assistant."},
-                {"role": "assistant", "content": "Here is some initial assistant message."},
-                {"role": "user", "content": prompt}
-            ],
-            temperature=.3,
-            max_tokens=max_tokens,
-            top_p=top_p,
-            frequency_penalty=frequency_penalty,
-            presence_penalty=presence_penalty,
-        ).choices[0].message['content']
-    return message
-
-
-def generate_answer(question, openAI_key, model):
-    topn_chunks = recommender(question)
-    prompt = 'search results:\n\n'
-    for c in topn_chunks:
-        prompt += c + '\n\n'
-
-    prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
-              "Cite each reference using [ Page Number] notation. "\
-              "Only answer what is asked. The answer should be short and concise. \n\nQuery: "
-
-    prompt += f"{question}\nAnswer:"
-    answer = generate_text(openAI_key, prompt, model)
-    return answer
-
-
-def question_answer(chat_history, url, file, question, openAI_key, model):
-    try:
-        if openAI_key.strip()=='':
-            return '[ERROR]: Please enter your Open AI Key. Get your key here : https://platform.openai.com/account/api-keys'
-        if url.strip() == '' and file is None:
-            return '[ERROR]: Both URL and PDF is empty. Provide at least one.'
-        if url.strip() != '' and file is not None:
-            return '[ERROR]: Both URL and PDF is provided. Please provide only one (either URL or PDF).'
-        if model is None or model =='':
-            return '[ERROR]: You have not selected any model. Please choose an LLM model.'
-        if url.strip() != '':
-            glob_url = url
-            download_pdf(glob_url, 'corpus.pdf')
-            load_recommender('corpus.pdf')
-        else:
-            old_file_name = file.name
-            file_name = file.name
-            file_name = file_name[:-12] + file_name[-4:]
-            os.rename(old_file_name, file_name)
-            load_recommender(file_name)
-        if question.strip() == '':
-            return '[ERROR]: Question field is empty'
-        if model == "text-davinci-003" or model == "gpt-4" or model == "gpt-4-32k":
-            answer = generate_answer_text_davinci_003(question, openAI_key)
-        else:
-            answer = generate_answer(question, openAI_key, model)
-        chat_history.append([question, answer])
-        return chat_history
-    except openai.error.InvalidRequestError as e:
-        return f'[ERROR]: Either you do not have access to GPT4 or you have exhausted your quota!'
-
-
-def generate_text_text_davinci_003(openAI_key,prompt, engine="text-davinci-003"):
-    openai.api_key = openAI_key
-    completions = openai.Completion.create(
-        engine=engine,
-        prompt=prompt,
-        max_tokens=512,
-        n=1,
-        stop=None,
-        temperature=0.7,
-    )
-    message = completions.choices[0].text
-    return message
-
-
-def generate_answer_text_davinci_003(question,openAI_key):
-    topn_chunks = recommender(question)
-    prompt = ""
-    prompt += 'search results:\n\n'
-    for c in topn_chunks:
-        prompt += c + '\n\n'
-
-    prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
-              "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "\
-              "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
-              "with the same name, create separate answers for each. Only include information found in the results and "\
-              "don't add any additional information. Make sure the answer is correct and don't output false content. "\
-              "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "\
-              "search results which has nothing to do with the question. Only answer what is asked. The "\
-              "answer should be short and concise. \n\nQuery: {question}\nAnswer: "
-
-    prompt += f"Query: {question}\nAnswer:"
-    answer = generate_text_text_davinci_003(openAI_key, prompt,"text-davinci-003")
-    return answer
+import functions as functions
 
 # pre-defined questions
 questions = [
@@ -231,9 +17,6 @@ questions = [
     "what is the dependent variable in this study?",
 ]
 
-
-recommender = SemanticSearch()
-
 title = 'PDF GPT Turbo'
 description = """ PDF GPT Turbo allows you to chat with your PDF files. It uses Google's Universal Sentence Encoder with a Deep Averaging Network (DAN) to give hallucination-free responses by improving on the embedding quality of OpenAI. It cites the page number in square brackets ([Page No.]) and shows where the information is located, adding credibility to the responses."""
 
@@ -248,9 +31,9 @@ with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 1200; }""") as demo:
         gr.Markdown(f'<p style="text-align:center">Get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a></p>')
         with gr.Accordion("API Key"):
             openAI_key = gr.Textbox(label='Enter your OpenAI API key here', password=True)
-        url = gr.Textbox(label='Enter PDF URL here (Example: https://arxiv.org/pdf/1706.03762.pdf )')
+        url = gr.Textbox(label='Enter PDF URL here (Example: https://arxiv.org/pdf/1706.03762.pdf )')
         gr.Markdown("<center><h4>OR<h4></center>")
-        file = gr.File(label='Upload your PDF/ Research Paper / Book here', file_types=['.pdf'])
+        files = gr.File(label='Upload your PDF/ Research Paper / Book here', file_types=['.pdf'], file_count="multiple")
         question = gr.Textbox(label='Enter your question here')
         gr.Examples(
             [[q] for q in questions],
@@ -274,14 +57,12 @@ with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 1200; }""") as demo:
         chatbot = gr.Chatbot(placeholder="Chat History", label="Chat History", lines=50, elem_id="chatbot")
 
 
-    #
+
    # Bind the click event of the button to the question_answer function
    btn.click(
-        question_answer,
-        inputs=[chatbot, url, file, question, openAI_key, model],
+        functions.question_answer,
+        inputs=[chatbot, url, files, question, openAI_key, model],
        outputs=[chatbot],
    )
 
 demo.launch()
-
-
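Side note on the UI change above: with file_count="multiple", the Gradio File component hands the callback a list of tempfile wrappers (or None when nothing is uploaded), each exposing a .name path, which is why question_answer now iterates over files. A small sketch of the renaming step, faking the wrappers with a namedtuple and hypothetical paths (the app's code assumes Gradio appends 8 random characters before the extension, hence the fixed 12-character strip):

from collections import namedtuple

# stand-in for the tempfile wrapper Gradio passes when file_count="multiple"
FakeUpload = namedtuple('FakeUpload', 'name')
files = [FakeUpload('/tmp/attention9q8w7e6r.pdf'), FakeUpload('/tmp/bert4t5y6u7i.pdf')]

# mirrors functions.question_answer: drop the last 12 characters
# (8 random + '.pdf') and re-attach the extension
for f in files:
    print(f.name[:-12] + f.name[-4:])  # /tmp/attention.pdf, /tmp/bert.pdf
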
functions.py ADDED
@@ -0,0 +1,154 @@
+import urllib.request
+import fitz
+import re
+import openai
+import os
+from semantic_search import SemanticSearch
+
+recommender = SemanticSearch()
+
+# downloads a pdf from a url to output_path
+def download_pdf(url, output_path):
+    urllib.request.urlretrieve(url, output_path)
+
+
+# collapses newlines and repeated whitespace
+def preprocess(text):
+    text = text.replace('\n', ' ')
+    text = re.sub(r'\s+', ' ', text)
+    return text
+
+
+# converts a pdf to a list of per-page text strings
+def pdf_to_text(path, start_page=1, end_page=None):
+    doc = fitz.open(path)
+    total_pages = doc.page_count
+
+    if end_page is None:
+        end_page = total_pages
+
+    text_list = []
+
+    for i in range(start_page-1, end_page):
+        text = doc.load_page(i).get_text("text")
+        text = preprocess(text)
+        text_list.append(text)
+
+    doc.close()
+    return text_list
+
+
+# converts a list of page texts into a list of word chunks tagged with PDF and page numbers
+def text_to_chunks(texts, word_length=150, start_page=1, file_number=1):
+    # drop non-ASCII characters from the page text
+    filtered_texts = [''.join(char for char in text if ord(char) < 128) for text in texts]
+    text_toks = [t.split(' ') for t in filtered_texts]
+    chunks = []
+
+    for idx, words in enumerate(text_toks):
+        for i in range(0, len(words), word_length):
+            chunk = words[i:i+word_length]
+            if (i+word_length) > len(words) and (len(chunk) < word_length) and (
+                len(text_toks) != (idx+1)):
+                # carry a short trailing chunk over to the next page instead of emitting it
+                text_toks[idx+1] = chunk + text_toks[idx+1]
+                continue
+            chunk = ' '.join(chunk).strip()
+            chunk = f'[PDF no. {file_number}] [Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
+            chunks.append(chunk)
+    return chunks
+
+
+# merges a list of pdfs into a list of chunks and fits the recommender
+def load_recommender(paths, start_page=1):
+    global recommender
+    chunks = []
+    for idx, path in enumerate(paths):
+        chunks += text_to_chunks(pdf_to_text(path, start_page=start_page), start_page=start_page, file_number=idx+1)
+    recommender.fit(chunks)
+    return 'Corpus Loaded.'
+
+
+# calls the OpenAI API to generate a response for the given prompt
+def generate_text(openAI_key, prompt, model="gpt-3.5-turbo"):
+    openai.api_key = openAI_key
+    temperature = 0.7
+    max_tokens = 256
+    top_p = 1
+    frequency_penalty = 0
+    presence_penalty = 0
+
+    if model == "text-davinci-003":
+        completions = openai.Completion.create(
+            engine=model,
+            prompt=prompt,
+            max_tokens=max_tokens,
+            n=1,
+            stop=None,
+            temperature=temperature,
+        )
+        message = completions.choices[0].text
+    else:
+        message = openai.ChatCompletion.create(
+            model=model,
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "assistant", "content": "Here is some initial assistant message."},
+                {"role": "user", "content": prompt}
+            ],
+            temperature=.3,
+            max_tokens=max_tokens,
+            top_p=top_p,
+            frequency_penalty=frequency_penalty,
+            presence_penalty=presence_penalty,
+        ).choices[0].message['content']
+    return message
+
+
+# constructs the prompt for the given query
+def construct_prompt(question):
+    topn_chunks = recommender(question)
+    prompt = 'search results:\n\n'
+    for c in topn_chunks:
+        prompt += c + '\n\n'
+
+    prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
+              "Cite each reference using [PDF Number][Page Number] notation. "\
+              "Only answer what is asked. The answer should be short and concise. \n\nQuery: "
+
+    prompt += f"{question}\nAnswer:"
+    return prompt
+
+
+# main function called when the user clicks the submit button; generates an answer for the query
+def question_answer(chat_history, url, files, question, openAI_key, model):
+    try:
+        if files is None:
+            files = []
+        if openAI_key.strip() == '':
+            return '[ERROR]: Please enter your OpenAI key. Get your key here: https://platform.openai.com/account/api-keys'
+        if url.strip() == '' and files == []:
+            return '[ERROR]: Both URL and PDF are empty. Provide at least one.'
+        if url.strip() != '' and files != []:
+            return '[ERROR]: Both URL and PDF are provided. Please provide only one (either URL or PDF).'
+        if model is None or model == '':
+            return '[ERROR]: You have not selected any model. Please choose an LLM model.'
+        if url.strip() != '':
+            glob_url = url
+            download_pdf(glob_url, 'corpus.pdf')
+            load_recommender(['corpus.pdf'])
+        else:
+            filenames = []
+            for file in files:
+                old_file_name = file.name
+                # strip the random suffix Gradio appends to uploaded temp files
+                file_name = old_file_name[:-12] + old_file_name[-4:]
+                os.rename(old_file_name, file_name)
+                filenames.append(file_name)
+            load_recommender(filenames)
+
+        if question.strip() == '':
+            return '[ERROR]: Question field is empty'
+        prompt = construct_prompt(question)
+        answer = generate_text(openAI_key, prompt, model)
+        chat_history.append([question, answer])
+        return chat_history
+    except openai.error.InvalidRequestError:
+        return '[ERROR]: Either you do not have access to GPT-4 or you have exhausted your quota!'
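
To make the new chunk tagging concrete, here is a worked example with hypothetical inputs (note that importing functions instantiates SemanticSearch at module level, which loads the USE model once). Short trailing chunks are carried over to the next page instead of being emitted on their own:

import functions

chunks = functions.text_to_chunks(
    ["one two three four five six", "seven eight"],
    word_length=5, file_number=2)
for c in chunks:
    print(c)
# [PDF no. 2] [Page no. 1] "one two three four five"
# [PDF no. 2] [Page no. 2] "six seven eight"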
semantic_search.py ADDED
@@ -0,0 +1,41 @@
+import numpy as np
+import tensorflow_hub as hub
+from sklearn.neighbors import NearestNeighbors
+
+
+class SemanticSearch:
+    def __init__(self):
+        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
+        self.fitted = False
+
+    # embeds the chunks and fits a nearest-neighbour index over the embeddings
+    def fit(self, data, batch=1000, n_neighbors=5):
+        self.data = data
+        self.embeddings = self.get_text_embedding(data, batch=batch)
+        n_neighbors = min(n_neighbors, len(self.embeddings))
+        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
+        self.nn.fit(self.embeddings)
+        self.fitted = True
+
+    # returns the nearest chunks (or their indices) for a query string
+    def __call__(self, text, return_data=True):
+        inp_emb = self.use([text])
+        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
+
+        if return_data:
+            return [self.data[i] for i in neighbors]
+        else:
+            return neighbors
+
+    # embeds texts in batches with the Universal Sentence Encoder
+    def get_text_embedding(self, texts, batch=1000):
+        embeddings = []
+        for i in range(0, len(texts), batch):
+            text_batch = texts[i:(i+batch)]
+            emb_batch = self.use(text_batch)
+            embeddings.append(emb_batch)
+        embeddings = np.vstack(embeddings)
+        return embeddings
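
Finally, a minimal usage sketch of the class (the chunk strings are hypothetical; the encoder is downloaded from TF Hub once on construction):

from semantic_search import SemanticSearch

searcher = SemanticSearch()  # loads the Universal Sentence Encoder
searcher.fit([
    '[PDF no. 1] [Page no. 1] "the dependent variable was reaction time"',
    '[PDF no. 1] [Page no. 2] "participants were 40 undergraduates"',
    '[PDF no. 1] [Page no. 3] "results were analysed with an ANOVA"',
], n_neighbors=2)

# returns the two chunks nearest to the query in embedding space
print(searcher("what is the dependent variable in this study?"))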