File size: 6,064 Bytes
87c3140
e91ac58
9d06861
87c3140
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e91ac58
87c3140
 
 
 
 
 
 
 
 
e91ac58
87c3140
 
 
 
e91ac58
87c3140
e91ac58
 
 
 
87c3140
 
 
 
 
 
 
e91ac58
 
87c3140
 
 
 
 
 
 
 
 
 
 
 
e91ac58
 
87c3140
 
 
e91ac58
87c3140
e91ac58
 
87c3140
9d06861
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87c3140
 
 
 
 
 
 
 
 
e91ac58
87c3140
e91ac58
 
 
87c3140
 
e91ac58
 
87c3140
e91ac58
 
 
 
 
 
 
 
 
87c3140
e91ac58
 
 
 
87c3140
 
 
 
 
 
 
 
e91ac58
87c3140
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from dataclasses import dataclass
from langchain_core.pydantic_v1 import Field, create_model
import yaml, json, os, shutil

@dataclass
class PromptCatalog:
    """Build LLM prompts that turn OCR text into a JSON dictionary.

    The output keys are not fixed: they are read from a YAML rules file (see
    ``prompt_SLTP``), so a prompt can be generated for any number of columns.

    Besides the dataclass fields below, ``prompt_SLTP`` stores its working
    state on the instance: ``rules_config_path``, ``rules_config``,
    ``instructions``, ``json_formatting_instructions``, ``rules_list``,
    ``rules``, ``structure`` and ``dictionary_structure``.
    """

    # Neither field is used by the methods shown in this class.
    domain_knowledge_example: str = ""
    similarity: str = ""
    # OCR text most recently passed to prompt_SLTP.
    OCR: str = ""
    # Number of entries under 'rules' in the most recently loaded config.
    n_fields: int = 0

    # The methods below dynamically create prompts with n columns.

    def prompt_SLTP(self, rules_config_path, OCR=None, is_palm=False):
        """Assemble the parsing prompt described by a YAML rules file.

        Args:
            rules_config_path: Path to a YAML file whose top level is a
                mapping with the keys ``instructions``,
                ``json_formatting_instructions`` and ``rules`` (``rules`` is
                itself a mapping of output key -> instruction for that key).
            OCR: Unstructured OCR text to embed in the prompt.
            is_palm: When true, the empty JSON template is written three
                times at the end of the prompt instead of once.

        Returns:
            ``(prompt, dictionary_structure)``: the prompt string and a dict
            mapping every rule key to ``''``.

        Raises:
            TypeError: If the rules file is empty or could not be parsed
                (``load_rules_config`` returned ``None``).
            KeyError: If a required top-level key is missing.
        """
        self.OCR = OCR

        self.rules_config_path = rules_config_path
        self.rules_config = self.load_rules_config()
        if self.rules_config is None:
            # Subscripting None below would raise the same exception type,
            # but with a message that does not point at the rules file.
            raise TypeError(
                f"Rules config {rules_config_path!r} is empty or could not be parsed as YAML"
            )

        self.instructions = self.rules_config['instructions']
        self.json_formatting_instructions = self.rules_config['json_formatting_instructions']

        self.rules_list = self.rules_config['rules']
        self.n_fields = len(self.rules_list)

        # Per-key instructions, serialized as one JSON line.
        self.rules = self.create_rules(is_palm)

        # Empty JSON template (as text) and the matching dict.
        self.structure, self.dictionary_structure = self.create_structure(is_palm)

        # An earlier variant also placed the OCR text between the instructions
        # and the JSON formatting instructions. It was dropped because it made
        # the prompt too long and performance was better without it.

        header = (
            "Please help me complete this text parsing task given the following rules and unstructured OCR text. "
            "Your task is to refactor the OCR text into a structured JSON dictionary that matches the structure specified in the following rules. "
            "Please follow the rules strictly."
        )
        # Every line after the header carries this fixed 16-space indent; it is
        # part of the prompt text and is kept so the prompt stays unchanged.
        pad = " " * 16
        template_copies = 3 if is_palm else 1

        prompt_lines = [
            header,
            f"{pad}The rules are:",
            f"{pad}{self.instructions}",
            f"{pad}{self.json_formatting_instructions}",
            f"{pad}This is the JSON template that includes instructions for each key:",
            f"{pad}{self.rules}",
            f"{pad}The unstructured OCR text is:",
            f"{pad}{self.OCR}",
            f"{pad}Please populate the following JSON dictionary based on the rules and the unformatted OCR text:",
        ]
        prompt_lines.extend([f"{pad}{self.structure}"] * template_copies)
        # The prompt ends with a newline followed by the indent alone.
        prompt_lines.append(pad)
        prompt = "\n".join(prompt_lines)

        return prompt, self.dictionary_structure

    def copy_prompt_template_to_new_dir(self, new_directory_path, rules_config_path):
        """Copy a rules file into a directory, creating the directory if needed.

        The copy is best effort: success or failure is reported with
        ``print`` and a failed copy does not raise.

        Args:
            new_directory_path: Destination directory.
            rules_config_path: Path of the file to copy; its base name is
                kept in the destination.
        """
        if not os.path.exists(new_directory_path):
            # exist_ok tolerates another process creating the directory
            # between the check above and this call.
            os.makedirs(new_directory_path, exist_ok=True)

        file_name = os.path.basename(rules_config_path)
        new_file_path = os.path.join(new_directory_path, file_name)

        try:
            shutil.copy(rules_config_path, new_file_path)
        except OSError as exc:
            print(f"Error copying [{file_name}] file: {exc}")
        else:
            print(f"Prompt [{file_name}] copied successfully to {new_file_path}")

    def load_rules_config(self):
        """Parse the YAML file at ``self.rules_config_path``.

        Returns:
            The parsed document, or ``None`` when the YAML is malformed (the
            parser error is printed in that case).

        Raises:
            OSError: If the file cannot be opened.
        """
        # Decode as UTF-8 explicitly so non-ASCII rules text is not mangled
        # by the platform's default encoding.
        with open(self.rules_config_path, 'r', encoding='utf-8') as stream:
            try:
                return yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print(exc)
                return None

    def create_rules(self, is_palm=False):
        """Serialize ``self.rules_list`` to a single-line JSON string.

        Key order is preserved. ``is_palm`` is accepted but not used.
        """
        return json.dumps(dict(self.rules_list), sort_keys=False)

    def create_structure(self, is_palm=False):
        """Build the empty JSON template for the current rules.

        Every key of ``self.rules_list`` is mapped to an empty string, in the
        same order. ``is_palm`` is accepted but not used.

        Returns:
            ``(structure_json_str, dictionary_structure)``: the template
            pretty-printed with a 4-space indent, and the template as a dict.
        """
        # A plain dict is used so every rule key appears in the template,
        # whatever its name.
        dictionary_structure = {key: '' for key in self.rules_list}
        structure_json_str = json.dumps(dictionary_structure, sort_keys=False, indent=4)
        return structure_json_str, dictionary_structure

    def generate_xlsx_headers(self, is_palm):
        """Return the rule keys, in order, as a list of column headers.

        ``is_palm`` is accepted but does not affect the result.
        """
        return list(self.rules_list)