th8m0z committed on
Commit
2dae07c
1 Parent(s): 71b98c7

increased batch/nn sizes + added summary feature

Browse files
Files changed (3) hide show
  1. app.py +2 -1
  2. functions.py +20 -3
  3. semantic_search.py +1 -1
app.py CHANGED
@@ -47,7 +47,8 @@ with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 1200; }""") as dem
47
  'gpt-3.5-turbo-16k-0613',
48
  'text-davinci-003',
49
  'gpt-4',
50
- 'gpt-4-32k'
 
51
  ], label='Select Model', default='gpt-3.5-turbo')
52
  btn = gr.Button(value='Submit')
53
 
 
47
  'gpt-3.5-turbo-16k-0613',
48
  'text-davinci-003',
49
  'gpt-4',
50
+ 'gpt-4-32k',
51
+ 'gpt-4-1106-preview'
52
  ], label='Select Model', default='gpt-3.5-turbo')
53
  btn = gr.Button(value='Submit')
54
 
functions.py CHANGED
@@ -36,7 +36,7 @@ def pdf_to_text(path, start_page=1, end_page=None):
36
  return text_list
37
 
38
  # converts a text into a list of chunks
39
- def text_to_chunks(texts, word_length=150, start_page=1, file_number=1):
40
 
41
  filtered_texts = [''.join(char for char in text if ord(char) < 128) for text in texts]
42
  text_toks = [t.split(' ') for t in filtered_texts]
@@ -102,17 +102,23 @@ def generate_text(openAI_key, prompt, model="gpt-3.5-turbo"):
102
 
103
 
104
  # constructs the prompt for the given query
105
def construct_prompt(question):
    """Assemble the LLM prompt for *question*.

    Retrieves the top-ranked chunks from the global ``recommender`` and
    concatenates them under a "search results:" header, followed by the
    citation/answer instructions and the question itself.
    """
    retrieved = recommender(question)

    # Build the prompt in pieces and join once, rather than += in a loop.
    pieces = ['search results:\n\n']
    pieces.extend(chunk + '\n\n' for chunk in retrieved)
    pieces.append(
        "Instructions: Compose a comprehensive reply to the query using the search results given. "
        "Cite each reference using [PDF Number][Page Number] notation. "
        "Only answer what is asked. The answer should be short and concise. \n\nQuery: "
    )
    pieces.append(f"{question}\nAnswer:")
    return ''.join(pieces)
117
 
118
  # main function that is called when the user clicks the submit button, generates an answer for the query
@@ -146,9 +152,20 @@ def question_answer(chat_history, url, files, question, openAI_key, model):
146
 
147
  if question.strip() == '':
148
  return '[ERROR]: Question field is empty'
149
- prompt = construct_prompt(question)
 
150
  answer = generate_text(openAI_key, prompt, model)
151
  chat_history.append([question, answer])
152
  return chat_history
153
  except openai.error.InvalidRequestError as e:
154
  return f'[ERROR]: Either you do not have access to GPT4 or you have exhausted your quota!'
 
 
 
 
 
 
 
 
 
 
 
36
  return text_list
37
 
38
  # converts a text into a list of chunks
39
+ def text_to_chunks(texts, word_length=300, start_page=1, file_number=1):
40
 
41
  filtered_texts = [''.join(char for char in text if ord(char) < 128) for text in texts]
42
  text_toks = [t.split(' ') for t in filtered_texts]
 
102
 
103
 
104
  # constructs the prompt for the given query
105
def construct_prompt(question, openAI_key):
    """Assemble the LLM prompt for *question*.

    Retrieves the top-ranked chunks from the global ``recommender``,
    optionally compressing them via ``summarize_ss_results_if_needed``
    (which may call the OpenAI API with *openAI_key*), then concatenates
    them under a "search results:" header followed by the citation/answer
    instructions and the question itself.
    """
    topn_chunks = recommender(question)

    topn_chunks = summarize_ss_results_if_needed(openAI_key, topn_chunks, model="gpt-4")
    # Bug guard: the summarizer may return a single string instead of a list;
    # iterating a string directly would append one character per "chunk".
    if isinstance(topn_chunks, str):
        topn_chunks = [topn_chunks]

    prompt = 'search results:\n\n'
    for c in topn_chunks:
        prompt += c + '\n\n'

    prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
              "Cite each reference using [PDF Number][Page Number] notation. "\
              "Only answer what is asked. The answer should be short and concise. \n\nQuery: "

    prompt += f"{question}\nAnswer:"
    # Removed leftover debug print of the full prompt (leaked retrieved text to stdout).
    return prompt
123
 
124
  # main function that is called when the user clicks the submit button, generates an answer for the query
 
152
 
153
  if question.strip() == '':
154
  return '[ERROR]: Question field is empty'
155
+ prompt = construct_prompt(question, openAI_key)
156
+
157
  answer = generate_text(openAI_key, prompt, model)
158
  chat_history.append([question, answer])
159
  return chat_history
160
  except openai.error.InvalidRequestError as e:
161
  return f'[ERROR]: Either you do not have access to GPT4 or you have exhausted your quota!'
162
+
163
+
164
def summarize_ss_results_if_needed(openAI_key, chunks, model, token_limit=2000):
    """Summarize *chunks* with the LLM when their combined size exceeds *token_limit*.

    Parameters:
        openAI_key: API key forwarded to ``generate_text``.
        chunks: list of retrieved text chunks.
        model: model name passed to ``generate_text``.
        token_limit: threshold above which summarization kicks in.

    Returns:
        The original ``chunks`` list when under the limit; otherwise a
        one-element list containing the LLM-generated summary, so callers
        can always iterate the result as a list of chunks.
    """
    # NOTE: this counts whitespace-separated words as a proxy for tokens,
    # so the effective limit is approximate.
    total_tokens = sum(len(chunk.split()) for chunk in chunks)
    if total_tokens <= token_limit:
        return chunks
    print("has to summarize")
    summary_prompt = "Summarize the following text, while keeping important information, facts and figures. It is also very important to keep the [PDF Number][Page number] notation intact!\n\n" + " ".join(chunks)
    # Bug fix: wrap the summary in a list — previously a bare string was
    # returned, which callers would iterate character-by-character.
    return [generate_text(openAI_key, summary_prompt, model=model)]
semantic_search.py CHANGED
@@ -10,7 +10,7 @@ class SemanticSearch:
10
  self.fitted = False
11
 
12
  # fits the recommender
13
- def fit(self, data, batch=1000, n_neighbors=5):
14
  self.data = data
15
  self.embeddings = self.get_text_embedding(data, batch=batch)
16
  n_neighbors = min(n_neighbors, len(self.embeddings))
 
10
  self.fitted = False
11
 
12
  # fits the recommender
13
+ def fit(self, data, batch=1000, n_neighbors=10):
14
  self.data = data
15
  self.embeddings = self.get_text_embedding(data, batch=batch)
16
  n_neighbors = min(n_neighbors, len(self.embeddings))