File size: 1,650 Bytes
24f8c1c
2a13c73
24f8c1c
 
 
 
 
 
6a4ac5a
24f8c1c
 
 
 
 
 
 
 
 
 
 
2a23253
24f8c1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6a4ac5a
d16a006
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import logging
from dataclasses import dataclass
from typing import Optional

import pandas as pd

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Configures the root logger at INFO level as an import-time side effect.
# NOTE: basicConfig is a no-op if the root logger already has handlers.
logging.basicConfig(level=logging.INFO)


@dataclass
class SystemPromptFormatter:
    """Assemble a system prompt from fixed text and matched documents.

    The prompt is ``text_before_docs``, followed by the formatted documents,
    followed by ``text_after_docs``. The documents section is limited to a
    number of space-separated words.
    """

    text_before_docs: str = ""  # text placed before the documents section
    text_after_docs: str = ""  # text placed after the documents section
    max_words: int = 4000  # default word budget for the documents section

    def format_documents(
        self,
        matched_documents: pd.DataFrame,
        max_words: Optional[int] = None,
    ) -> str:
        """Concatenate the documents into one string and truncate it.

        Args:
            matched_documents: DataFrame whose ``content`` column holds the
                document texts; each one is wrapped in document markers.
            max_words: Maximum number of space-separated words to keep.
                Defaults to ``self.max_words`` when omitted. Negative values
                are treated as 0.

        Returns:
            The wrapped documents joined without a separator, cut to at most
            ``max_words`` words. Truncation may cut through a closing marker.
        """
        if max_words is None:
            max_words = self.max_words
        # A negative limit would make the slice below count from the end
        # and silently keep almost everything; clamp it instead.
        max_words = max(max_words, 0)

        # NOTE(review): the closing marker is emitted with a backslash
        # ("<\DOCUMENT>"), not a forward slash. Kept byte-identical because
        # it is part of the prompt text -- confirm whether this is intended.
        documents_str = "".join(
            f"<DOCUMENT> {doc} <\\DOCUMENT>"
            for doc in matched_documents.content.to_list()
        )

        # truncate the documents to fit
        # TODO: increase to actual token count
        # Split on single spaces (not arbitrary whitespace) so that joining
        # the kept words reproduces the original text exactly.
        words = documents_str.split(" ")
        if len(words) > max_words:
            logger.warning("truncating documents to fit...")
            documents_str = " ".join(words[:max_words])
            logger.warning("Documents after truncation: %s", documents_str)

        return documents_str

    def format(
        self,
        matched_documents: pd.DataFrame,
    ) -> str:
        """Prepare the system prompt with prompt engineering.

        Args:
            matched_documents: DataFrame with a ``content`` column, passed
                through to ``format_documents``.

        Returns:
            ``text_before_docs`` + formatted documents + ``text_after_docs``,
            with the documents limited to ``self.max_words`` words.
        """
        documents = self.format_documents(matched_documents, max_words=self.max_words)
        return self.text_before_docs + documents + self.text_after_docs


def prompt_formatter_factory(prompt_cfg):
    """Build a SystemPromptFormatter from a prompt configuration.

    ``prompt_cfg`` is indexed with the keys ``"text_before_documents"``,
    ``"text_before_prompt"`` and ``"max_words"``; a missing key propagates
    whatever error the lookup raises.
    """
    before_docs = prompt_cfg["text_before_documents"]
    # The config's "text_before_prompt" entry is the text placed after the docs.
    after_docs = prompt_cfg["text_before_prompt"]
    word_limit = prompt_cfg["max_words"]

    formatter = SystemPromptFormatter(
        text_before_docs=before_docs,
        text_after_docs=after_docs,
        max_words=word_limit,
    )
    return formatter