th8m0z committed
Commit
f48c06b
1 Parent(s): 0a443bb

decreased chunk sizes + improved summaries

Files changed (2):
  1. functions.py +7 -4
  2. semantic_search.py +1 -1
functions.py CHANGED
@@ -36,7 +36,7 @@ def pdf_to_text(path, start_page=1, end_page=None):
     return text_list
 
 # converts a text into a list of chunks
-def text_to_chunks(texts, word_length=300, start_page=1, file_number=1):
+def text_to_chunks(texts, word_length=150, start_page=1, file_number=1):
 
     filtered_texts = [''.join(char for char in text if ord(char) < 128) for text in texts]
     text_toks = [t.split(' ') for t in filtered_texts]
@@ -106,7 +106,7 @@ def generate_text(openAI_key, prompt, model="gpt-3.5-turbo"):
 def construct_prompt(question, openAI_key):
     topn_chunks = recommender(question)
 
-    topn_chunks = summarize_ss_results_if_needed(openAI_key, topn_chunks, model="gpt-4")
+    topn_chunks = summarize_ss_results_if_needed(openAI_key, topn_chunks, model="gpt-3.5-turbo")
 
     prompt = 'search results:\n\n'
     for c in topn_chunks:
@@ -162,11 +162,14 @@ def question_answer(chat_history, url, files, question, openAI_key, model):
         return f'[ERROR]: Either you do not have access to GPT4 or you have exhausted your quota!'
 
 
-def summarize_ss_results_if_needed(openAI_key, chunks, model, token_limit=2000):
+def summarize_ss_results_if_needed(openAI_key, chunks, model, token_limit=8000):
     total_tokens = sum(len(chunk.split()) for chunk in chunks)
     if total_tokens > token_limit:
         print("has to summarize")
-        summary_prompt = "Summarize the following text, while keeping important information, facts and figures. It is also very important to keep the [PDF Number][Page number] notation intact!\n\n" + " ".join(chunks)
+        summary_prompt = "Summarize the following text, while keeping important information, facts and figures. It is also very important to keep the [PDF Number][Page number] notation intact!\n\n"
+        for c in chunks:
+            summary_prompt += c + '\n\n'
+        print(summary_prompt)
         return generate_text(openAI_key, summary_prompt, model=model)
     else:
         return chunks
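
The first hunk halves word_length from 300 to 150, so each retrieved chunk carries half as many words. For context, a minimal sketch of the fixed-size word chunking that word_length controls (chunk_words is a stand-in name; the repo's text_to_chunks additionally tags chunks with the [PDF Number][Page number] markers referenced by the summary prompt):

def chunk_words(words, word_length=150):
    # Group the flat word stream into consecutive chunks of word_length words.
    return [' '.join(words[i:i + word_length])
            for i in range(0, len(words), word_length)]

print(chunk_words("one two three four five six".split(), word_length=2))
# -> ['one two', 'three four', 'five six']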
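
The summarization path changes in three ways: the gate's token_limit rises from 2000 to 8000 words, the summarizer model switches from gpt-4 to gpt-3.5-turbo, and the prompt now appends each chunk as its own blank-line-separated paragraph instead of joining them with single spaces. A standalone sketch of the resulting control flow, with generate_text stubbed out (in the repo it wraps the OpenAI chat completion API):

def generate_text(openAI_key, prompt, model="gpt-3.5-turbo"):
    return "<model summary>"  # stub standing in for the OpenAI API call

def summarize_ss_results_if_needed(openAI_key, chunks, model, token_limit=8000):
    # Whitespace word count serves as a cheap proxy for model tokens.
    total_tokens = sum(len(chunk.split()) for chunk in chunks)
    if total_tokens <= token_limit:
        return chunks  # small enough: pass the retrieved chunks through as-is
    # Too large: compress the chunks while keeping the
    # [PDF Number][Page number] citations the answer prompt relies on.
    summary_prompt = ("Summarize the following text, while keeping important "
                      "information, facts and figures. It is also very important to "
                      "keep the [PDF Number][Page number] notation intact!\n\n")
    for c in chunks:
        summary_prompt += c + '\n\n'
    return generate_text(openAI_key, summary_prompt, model=model)

One caveat under these assumptions: on the summarization branch the function returns a single string, while construct_prompt's for c in topn_chunks loop expects a list of chunks, so wrapping the return value in a one-element list would keep the two branches type-consistent.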
semantic_search.py CHANGED
@@ -10,7 +10,7 @@ class SemanticSearch:
         self.fitted = False
 
     # fits the recommender
-    def fit(self, data, batch=1000, n_neighbors=10):
+    def fit(self, data, batch=1000, n_neighbors=5):
         self.data = data
         self.embeddings = self.get_text_embedding(data, batch=batch)
         n_neighbors = min(n_neighbors, len(self.embeddings))
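
n_neighbors also halves, from 10 to 5, so each question is answered from the five closest chunks rather than ten; together with the shorter chunks and the raised summarization ceiling, this keeps the final prompt compact. A minimal sketch of the fit pattern, assuming a scikit-learn NearestNeighbors index over the chunk embeddings (embed is a stand-in for the class's get_text_embedding):

import numpy as np
from sklearn.neighbors import NearestNeighbors

def embed(texts):
    # Stub embedding: one fixed-seed random 8-dim vector per text.
    return np.random.default_rng(0).random((len(texts), 8))

def fit(data, n_neighbors=5):
    embeddings = embed(data)
    # Guard: never ask for more neighbors than there are chunks,
    # otherwise kneighbors() would raise at query time.
    n_neighbors = min(n_neighbors, len(embeddings))
    nn = NearestNeighbors(n_neighbors=n_neighbors)
    nn.fit(embeddings)
    return nn

index = fit(["chunk one", "chunk two", "chunk three"])
distances, indices = index.kneighbors(embed(["some query"]))
print(indices)  # indices of the nearest chunks, here min(5, 3) = 3 of them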