th8m0z committed
Commit
f48c06b
1 Parent(s): 0a443bb

decreased chunk sizes + improved summaries

Files changed (2):
  1. functions.py +7 -4
  2. semantic_search.py +1 -1
functions.py CHANGED
@@ -36,7 +36,7 @@ def pdf_to_text(path, start_page=1, end_page=None):
     return text_list
 
 # converts a text into a list of chunks
-def text_to_chunks(texts, word_length=300, start_page=1, file_number=1):
+def text_to_chunks(texts, word_length=150, start_page=1, file_number=1):
 
     filtered_texts = [''.join(char for char in text if ord(char) < 128) for text in texts]
     text_toks = [t.split(' ') for t in filtered_texts]
@@ -106,7 +106,7 @@ def generate_text(openAI_key, prompt, model="gpt-3.5-turbo"):
 def construct_prompt(question, openAI_key):
     topn_chunks = recommender(question)
 
-    topn_chunks = summarize_ss_results_if_needed(openAI_key, topn_chunks, model="gpt-4")
+    topn_chunks = summarize_ss_results_if_needed(openAI_key, topn_chunks, model="gpt-3.5-turbo")
 
     prompt = 'search results:\n\n'
     for c in topn_chunks:
@@ -162,11 +162,14 @@ def question_answer(chat_history, url, files, question, openAI_key, model):
         return f'[ERROR]: Either you do not have access to GPT4 or you have exhausted your quota!'
 
 
-def summarize_ss_results_if_needed(openAI_key, chunks, model, token_limit=2000):
+def summarize_ss_results_if_needed(openAI_key, chunks, model, token_limit=8000):
     total_tokens = sum(len(chunk.split()) for chunk in chunks)
     if total_tokens > token_limit:
         print("has to summarize")
-        summary_prompt = "Summarize the following text, while keeping important information, facts and figures. It is also very important to keep the [PDF Number][Page number] notation intact!\n\n" + " ".join(chunks)
+        summary_prompt = "Summarize the following text, while keeping important information, facts and figures. It is also very important to keep the [PDF Number][Page number] notation intact!\n\n"
+        for c in chunks:
+            summary_prompt += c + '\n\n'
+        print(summary_prompt)
         return generate_text(openAI_key, summary_prompt, model=model)
     else:
         return chunks
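
The first hunk halves word_length from 300 to 150, so each retrieved chunk carries half as many words. For context, a minimal sketch of the fixed-size word chunking that word_length controls (chunk_words is a stand-in name; the repo's text_to_chunks additionally tags chunks with the [PDF Number][Page number] markers referenced by the summary prompt):

def chunk_words(words, word_length=150):
    # Group the flat word stream into consecutive chunks of word_length words.
    return [' '.join(words[i:i + word_length])
            for i in range(0, len(words), word_length)]

print(chunk_words("one two three four five six".split(), word_length=2))
# -> ['one two', 'three four', 'five six']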
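
The summarization path changes in three ways: the gate's token_limit rises from 2000 to 8000 words, the summarizer model switches from gpt-4 to gpt-3.5-turbo, and the prompt now appends each chunk as its own blank-line-separated paragraph instead of joining them with single spaces. A standalone sketch of the resulting control flow, with generate_text stubbed out (in the repo it wraps the OpenAI chat completion API):

def generate_text(openAI_key, prompt, model="gpt-3.5-turbo"):
    return "<model summary>"  # stub standing in for the OpenAI API call

def summarize_ss_results_if_needed(openAI_key, chunks, model, token_limit=8000):
    # Whitespace word count serves as a cheap proxy for model tokens.
    total_tokens = sum(len(chunk.split()) for chunk in chunks)
    if total_tokens <= token_limit:
        return chunks  # small enough: pass the retrieved chunks through as-is
    # Too large: compress the chunks while keeping the
    # [PDF Number][Page number] citations the answer prompt relies on.
    summary_prompt = ("Summarize the following text, while keeping important "
                      "information, facts and figures. It is also very important to "
                      "keep the [PDF Number][Page number] notation intact!\n\n")
    for c in chunks:
        summary_prompt += c + '\n\n'
    return generate_text(openAI_key, summary_prompt, model=model)

One caveat under these assumptions: on the summarization branch the function returns a single string, while construct_prompt's for c in topn_chunks loop expects a list of chunks, so wrapping the return value in a one-element list would keep the two branches type-consistent.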
semantic_search.py CHANGED
@@ -10,7 +10,7 @@ class SemanticSearch:
         self.fitted = False
 
     # fits the recommender
-    def fit(self, data, batch=1000, n_neighbors=10):
+    def fit(self, data, batch=1000, n_neighbors=5):
         self.data = data
         self.embeddings = self.get_text_embedding(data, batch=batch)
         n_neighbors = min(n_neighbors, len(self.embeddings))
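
n_neighbors also halves, from 10 to 5, so each question is answered from the five closest chunks rather than ten; together with the shorter chunks and the raised summarization ceiling, this keeps the final prompt compact. A minimal sketch of the fit pattern, assuming a scikit-learn NearestNeighbors index over the chunk embeddings (embed is a stand-in for the class's get_text_embedding):

import numpy as np
from sklearn.neighbors import NearestNeighbors

def embed(texts):
    # Stub embedding: one fixed-seed random 8-dim vector per text.
    return np.random.default_rng(0).random((len(texts), 8))

def fit(data, n_neighbors=5):
    embeddings = embed(data)
    # Guard: never ask for more neighbors than there are chunks,
    # otherwise kneighbors() would raise at query time.
    n_neighbors = min(n_neighbors, len(embeddings))
    nn = NearestNeighbors(n_neighbors=n_neighbors)
    nn.fit(embeddings)
    return nn

index = fit(["chunk one", "chunk two", "chunk three"])
distances, indices = index.kneighbors(embed(["some query"]))
print(indices)  # indices of the nearest chunks, here min(5, 3) = 3 of them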