Spaces:
Running
Running
File size: 5,290 Bytes
87c3140 e91ac58 87c3140 e91ac58 87c3140 e91ac58 87c3140 e91ac58 87c3140 e91ac58 87c3140 e91ac58 87c3140 e91ac58 87c3140 e91ac58 87c3140 e91ac58 87c3140 e91ac58 87c3140 e91ac58 87c3140 e91ac58 87c3140 e91ac58 87c3140 e91ac58 87c3140 e91ac58 87c3140 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
from dataclasses import dataclass
from langchain_core.pydantic_v1 import Field, create_model
import yaml, json
@dataclass
class PromptCatalog:
    """Build text-parsing prompts from a YAML rules file.

    The main entry point is :meth:`prompt_SLTP`, which loads a rules file,
    derives a JSON template from it and assembles a prompt asking for OCR
    text to be refactored into that JSON structure.

    Besides the dataclass fields below, :meth:`prompt_SLTP` stores several
    working attributes on the instance (``rules_config_path``,
    ``rules_config``, ``instructions``, ``json_formatting_instructions``,
    ``rules_list``, ``rules``, ``structure``, ``dictionary_structure``).
    The helper methods read ``rules_list`` / ``rules_config_path`` and
    therefore expect those attributes to have been set first.
    """

    # Not referenced by any method in this class body.
    domain_knowledge_example: str = ""
    # Not referenced by any method in this class body.
    similarity: str = ""
    # OCR text interpolated into the prompt; overwritten by prompt_SLTP().
    OCR: str = ""
    # Number of entries under the 'rules' key of the loaded config.
    n_fields: int = 0

    ###########################################################################
    # These are for dynamically creating your own prompts with n-columns
    ###########################################################################
    def prompt_SLTP(self, rules_config_path, OCR=None, is_palm=False):
        """Assemble the parsing prompt for the given rules file and OCR text.

        Args:
            rules_config_path: Path of the YAML rules file. The file must
                provide the keys ``instructions``,
                ``json_formatting_instructions`` and ``rules``.
            OCR: The unstructured OCR text. It is interpolated with normal
                string formatting, so ``None`` appears as the text ``None``.
            is_palm: When true, the empty JSON template is written three
                times at the end of the prompt instead of once.

        Returns:
            A ``(prompt, dictionary_structure)`` tuple: the prompt string
            and a dict mapping every rule key to an empty string.

        Raises:
            TypeError: If the rules file could not be parsed, because
                :meth:`load_rules_config` then returns ``None``.
            KeyError: If one of the required top-level keys is missing.
        """
        self.OCR = OCR
        self.rules_config_path = rules_config_path
        self.rules_config = self.load_rules_config()

        self.instructions = self.rules_config['instructions']
        self.json_formatting_instructions = self.rules_config['json_formatting_instructions']
        self.rules_list = self.rules_config['rules']
        self.n_fields = len(self.rules_list)

        # Set the rules for processing OCR into JSON format
        self.rules = self.create_rules(is_palm)
        self.structure, self.dictionary_structure = self.create_structure(is_palm)

        # An earlier prompt layout also placed
        #     The unstructured OCR text is:
        #     {self.OCR}
        # between instructions and json_formatting_instructions. That made
        # the prompt too long; performance was better without it.

        # The two prompt variants differ only in how often the empty JSON
        # template is repeated at the end.
        repeat = 3 if is_palm else 1
        structure_block = "\n".join([self.structure] * repeat)

        prompt = (
            "Please help me complete this text parsing task given the following rules and unstructured OCR text. Your task is to refactor the OCR text into a structured JSON dictionary that matches the structure specified in the following rules. Please follow the rules strictly.\n"
            "The rules are:\n"
            f"{self.instructions}\n"
            f"{self.json_formatting_instructions}\n"
            "This is the JSON template that includes instructions for each key:\n"
            f"{self.rules}\n"
            "The unstructured OCR text is:\n"
            f"{self.OCR}\n"
            "Please populate the following JSON dictionary based on the rules and the unformatted OCR text:\n"
            f"{structure_block}\n"
        )

        # xlsx_headers = self.generate_xlsx_headers(is_palm)
        # return prompt, self.PromptJSONModel, self.n_fields, xlsx_headers
        return prompt, self.dictionary_structure

    def load_rules_config(self):
        """Parse the YAML file at ``self.rules_config_path``.

        Returns:
            The parsed YAML document, or ``None`` when the file is not valid
            YAML (the parser error is printed in that case).
        """
        # Decode explicitly as UTF-8 so that non-ASCII rule text is read the
        # same way on every platform instead of using the locale default.
        with open(self.rules_config_path, 'r', encoding='utf-8') as stream:
            try:
                return yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print(exc)
                return None

    def create_rules(self, is_palm=False):
        """Return ``self.rules_list`` as a JSON string without indentation.

        Args:
            is_palm: Accepted for call compatibility; not used.

        Returns:
            The rules mapping serialized with ``json.dumps``, keys kept in
            their original order.
        """
        return json.dumps(dict(self.rules_list), sort_keys=False)

    def create_structure(self, is_palm=False):
        """Create the empty JSON template for the configured rule keys.

        A Pydantic model named ``PromptJSONModel`` is created dynamically
        with one ``str`` field per rule key, each defaulting to ``''``.

        Args:
            is_palm: Accepted for call compatibility; not used.

        Returns:
            A ``(structure_json_str, dictionary_structure)`` tuple: the
            template dict and its JSON rendering indented by four spaces.
        """
        # Define the structure for the "Dictionary" section
        blank_fields = {
            key: (str, Field(default='', description=""))
            for key in self.rules_list
        }
        # Dynamically create the "Dictionary" Pydantic model
        PromptJSONModel = create_model('PromptJSONModel', **blank_fields)

        dictionary_structure = PromptJSONModel().dict()
        structure_json_str = json.dumps(dictionary_structure, sort_keys=False, indent=4)
        return structure_json_str, dictionary_structure

    def generate_xlsx_headers(self, is_palm):
        """Return the rule keys, in order, for use as spreadsheet headers.

        Args:
            is_palm: Accepted for call compatibility; the result does not
                depend on it.

        Returns:
            A new list containing the keys of ``self.rules_list``.
        """
        return list(self.rules_list)
|