jerpint committed on
Commit
24f8c1c
·
1 Parent(s): 463196d

add new formatters

Browse files
Files changed (1) hide show
  1. buster/formatters/prompts.py +41 -0
buster/formatters/prompts.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ import logging
3
+
4
+ import pandas as pd
5
+
6
+ logger = logging.getLogger(__name__)  # module-level logger named after this module
7
+ logging.basicConfig(level=logging.INFO)  # NOTE(review): runs at import time and configures logging at INFO level; confirm this is intended for a library module
8
+
9
@dataclass
class SystemPromptFormatter:
    """Assemble a system prompt from fixed text and a set of matched documents.

    Attributes:
        text_before_docs: Text placed before the formatted documents.
        text_after_docs: Text placed after the formatted documents.
        max_words: Word budget that ``format`` passes to ``format_documents``.
    """

    text_before_docs: str = ""
    text_after_docs: str = ""
    max_words: int = 4000

    def format_documents(self, matched_documents: pd.DataFrame, max_words: int) -> str:
        """Concatenate the documents into one tagged string, truncated to a word budget.

        Args:
            matched_documents: DataFrame whose ``content`` column holds the
                document texts.
            max_words: Maximum number of space-separated words to keep.

        Returns:
            Every document wrapped as ``<DOCUMENT> ... <\\DOCUMENT>`` and joined
            without a separator, cut to the first ``max_words`` words when the
            joined text is longer than that.
        """
        # gather the documents in one large plaintext variable
        # The closing tag deliberately keeps its literal backslash; it is written
        # as "\\" so the emitted text is unchanged while avoiding the invalid
        # "\D" escape sequence.
        documents_str = "".join(
            f"<DOCUMENT> {doc} <\\DOCUMENT>"
            for doc in matched_documents.content.to_list()
        )

        # truncate the documents to fit
        # TODO: increase to actual token count
        words = documents_str.split(" ")
        if len(words) > max_words:
            logger.warning("truncating documents to fit...")
            documents_str = " ".join(words[:max_words])
            logger.warning("Documents after truncation: %s", documents_str)

        return documents_str

    def format(
        self,
        matched_documents: pd.DataFrame,
    ) -> str:
        """
        Prepare the system prompt with prompt engineering.

        Args:
            matched_documents: DataFrame whose ``content`` column holds the
                document texts to embed in the prompt.

        Returns:
            ``text_before_docs``, the formatted documents (limited to
            ``self.max_words`` words), and ``text_after_docs``, concatenated
            in that order.
        """
        documents = self.format_documents(matched_documents, max_words=self.max_words)
        return self.text_before_docs + documents + self.text_after_docs