File size: 5,145 Bytes
3dd785b
 
 
18626e5
3dd785b
18626e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3dd785b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18626e5
 
 
 
 
 
3dd785b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18626e5
 
 
 
 
 
 
3dd785b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18626e5
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import ast
import json
import logging
import re
import sys

from settings import char_remove

class Logger:
    """Stream stand-in that mirrors every write into a log file.

    An instance captures whatever ``sys.stdout`` is at construction time and
    forwards each ``write``/``flush`` both to that stream and to ``filename``.
    The file handle stays open for the lifetime of the instance.
    """

    def __init__(self, filename):
        """Capture the current ``sys.stdout`` and start an empty log file.

        Args:
            filename: Path of the log file. Existing content is discarded.
        """
        self.terminal = sys.stdout
        # Start from an empty file, exactly as a plain "w" open would.
        with open(filename, "w", encoding="utf-8"):
            pass
        # Hold the long-lived handle in append mode: every write then lands at
        # the current end of the file. With a "w" handle, truncating the file
        # through another handle (as clear_logs does) leaves this handle's
        # offset stale and the next write pads the file with NUL bytes.
        # UTF-8 matches the encoding read_logs uses to read the file back.
        self.log = open(filename, "a", encoding="utf-8")

    def write(self, message):
        """Write ``message`` to the captured stream, then to the log file."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both the captured stream and the log file."""
        self.terminal.flush()
        self.log.flush()

    def isatty(self):
        """Always report that this stream is not a terminal."""
        return False
    
# Import-time side effect: replace sys.stdout with a Logger so that everything
# written to stdout from here on is also written to "output.log".
sys.stdout = Logger("output.log")
# Route records from the logging module (INFO and above) to "app.log",
# prefixed with timestamp and level name.
logging.basicConfig(filename='app.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def remove_na(string):
    """Return ``string`` with every entry of ``char_remove`` deleted.

    ``char_remove`` is imported from the ``settings`` module; its entries are
    removed one after another, in iteration order.
    """
    cleaned = string
    for unwanted in char_remove:
        cleaned = cleaned.replace(unwanted, "")
    return cleaned

def save_json(text, filename):
    """Serialize ``text`` as JSON into ``<filename>.json``.

    Args:
        text: JSON-serializable object to store.
        filename: Target path without the ``.json`` extension.

    Returns:
        The path that was written, including the ``.json`` suffix.
    """
    json_path = filename + ".json"
    # ensure_ascii=False stores non-ASCII characters as-is in the UTF-8 file
    # instead of \uXXXX escapes.
    with open(json_path, mode="w", encoding="utf-8") as handle:
        json.dump(text, handle, ensure_ascii=False)
    return json_path

def format_polygon(polygon):
    """Render a polygon as a ``"[x, y], [x, y], ..."`` string.

    Args:
        polygon: Iterable of points exposing ``x`` and ``y`` attributes, or a
            falsy value (``None``, empty list) when there is no polygon.

    Returns:
        The comma-separated coordinate pairs, or ``"N/A"`` for a falsy input.
    """
    if polygon:
        return ", ".join(f"[{point.x}, {point.y}]" for point in polygon)
    return "N/A"

def filter_tables(input_string, table_numbers):
    """Pick tables out of a textual table dump and return them as JSON.

    ``input_string`` is split on ``Table # <n>`` headers. Within each selected
    section, every ``Cell[<row>][<col>] has content '<text>'`` entry is placed
    into a row-major grid sized by the largest row/column index seen;
    positions without an entry stay ``""``.

    Args:
        input_string: Text containing ``Table # <n>`` headers, each followed
            by its cell entries.
        table_numbers: Iterable of 0-based positions into the list of table
            sections, counted in order of appearance (the number written in
            the header itself is not consulted).

    Returns:
        A JSON object string mapping ``"table_1"``, ``"table_2"``, ...
        (numbered in the order of ``table_numbers``) to a list of rows.
        A section without any cell entries maps to ``[]``.

    Raises:
        IndexError: If a requested position does not exist. The error is
            logged and printed before being re-raised.
    """
    # Everything before the first header is not a table, hence the [1:].
    tables = re.split(r"Table # \d+", input_string)[1:]
    cell_pattern = re.compile(r"Cell\[(\d+)\]\[(\d+)\] has content '(.*?)'")

    json_tables = {}
    for table_counter, table_number in enumerate(table_numbers, start=1):
        try:
            table_str = tables[table_number]
        except IndexError as e:
            logging.error(f"Error: {e}, Please check document configuration or document type")
            print(f"Error: {e}, Please check document configuration or document type")
            raise

        cells = [
            (int(row), int(col), content)
            for row, col, content in cell_pattern.findall(table_str)
        ]

        if cells:
            num_rows = max(row for row, _, _ in cells) + 1
            num_cols = max(col for _, col, _ in cells) + 1
            table = [["" for _ in range(num_cols)] for _ in range(num_rows)]
            for row, col, content in cells:
                table[row][col] = content
        else:
            # No cell entries: max() over an empty sequence would raise
            # ValueError, so represent the section as an empty table instead.
            table = []

        json_tables[f"table_{table_counter}"] = table

    return json.dumps(json_tables)

def extract_text_within_range(input_string, x_range, y_range):
    """Collect the text of every line whose bounding polygon touches a box.

    Args:
        input_string: Text containing entries of the form
            ``Line # <n> text '<text>' within bounding polygon
            '[x, y], [x, y], [x, y], [x, y]'``. Entries whose text contains
            an apostrophe do not match the pattern and are skipped.
        x_range: ``(low, high)`` inclusive bounds on the x coordinate.
        y_range: ``(low, high)`` inclusive bounds on the y coordinate.

    Returns:
        The texts, in order of appearance, of all entries that have at least
        one polygon vertex inside both ranges.
    """
    pattern = r"Line # \d+ text '([^']*)' within bounding polygon '(\[[\d.]+, [\d.]+\], \[[\d.]+, [\d.]+\], \[[\d.]+, [\d.]+\], \[[\d.]+, [\d.]+\])'"

    output = []
    for text, polygon_str in re.findall(pattern, input_string):
        # The pattern only admits digits, dots, brackets, commas and spaces,
        # so the capture is a plain literal; literal_eval parses it into a
        # tuple of [x, y] lists without executing document-derived text.
        polygon = ast.literal_eval(polygon_str)
        # One vertex inside the box is enough to keep the line.
        if any(
            x_range[0] <= x <= x_range[1] and y_range[0] <= y <= y_range[1]
            for x, y in polygon
        ):
            output.append(text)

    return output

def _parse_key_value_lines(lines):
    """Turn ``"key: value"`` lines into a dict, splitting on the first ``": "`` only."""
    pairs = {}
    for line in lines:
        parts = line.split(": ", 1)
        # A line without ": " has a single part; indexing raises IndexError.
        pairs[parts[0]] = parts[1]
    return pairs


def merge_strings(input_string, input_coords, extract_coords):
    """Overlay ``key: value`` lines with overrides and resolve coordinate values.

    Steps:
      1. Parse ``input_string`` and ``input_coords`` as ``key: value`` lines.
      2. For keys present in both, take the value from ``input_coords``.
      3. For every value starting with ``(``, evaluate it as
         ``((x0, y0), (x1, y1))`` and replace it with the texts that
         ``extract_text_within_range`` finds in ``extract_coords`` for
         x in ``(x0, x1)`` and y in ``(y0, y1)``, joined by ``", "``
         (or ``'-||-'`` when nothing is found).

    Args:
        input_string: Newline-separated ``key: value`` lines. Every line,
            including blank ones, must contain ``": "``.
        input_coords: Newline-separated ``key: value`` overrides. Blank lines
            are skipped and surrounding whitespace is stripped.
        extract_coords: Text handed to ``extract_text_within_range``.

    Returns:
        The keys of ``input_string`` in their original order, re-rendered as
        newline-separated ``key: value`` lines with the updated values.

    Raises:
        IndexError: If a line lacks the ``": "`` separator. The error is
            logged and printed before being re-raised.
    """
    lines1 = input_string.split('\n')
    # Filter out empty lines and strip leading/trailing whitespace
    lines2 = [line.strip() for line in input_coords.split('\n') if line.strip()]

    try:
        # Splitting on the first ": " only keeps values that themselves
        # contain ": " intact instead of cutting them at the second separator.
        dict1 = _parse_key_value_lines(lines1)
        dict2 = _parse_key_value_lines(lines2)
    except IndexError as e:
        logging.error(f"Error: {e}, Please check document configuration or document type")
        print(f"Error: {e}, Please check document configuration or document type")
        raise

    # Values from dict2 win for keys both inputs share; keys only in dict2
    # are ignored.
    for key in dict1:
        if key in dict2:
            dict1[key] = dict2[key]

    # Only existing keys are reassigned, so iterating while updating is safe.
    for key, coord_str in dict1.items():
        if coord_str.startswith('('):  # value looks like a coordinate tuple
            # NOTE(review): eval() executes whatever expression the value
            # holds. If input_string/input_coords can be influenced by
            # untrusted input, switch to ast.literal_eval. Left unchanged
            # because the accepted coordinate syntax is not visible here.
            coords = eval(coord_str)
            x_range = (coords[0][0], coords[1][0])
            y_range = (coords[0][1], coords[1][1])
            text = extract_text_within_range(extract_coords, x_range, y_range)
            dict1[key] = ', '.join(text) if text else '-||-'

    return '\n'.join(f"{key}: {value}" for key, value in dict1.items())

def read_logs():
    """Return the last 100 lines of ``output.log`` as one string.

    ``sys.stdout`` is flushed first; when it is the ``Logger`` installed at
    import time, that pushes pending output into ``output.log`` before the
    file is read.
    """
    sys.stdout.flush()
    with open("output.log", "r", encoding="utf-8") as log_file:
        tail = list(log_file)[-100:]
    return "".join(tail)
    
def clear_logs():
    """Empty ``output.log``, creating the file if it does not exist."""
    # Opening in "w" mode already truncates the file; nothing is written.
    with open("output.log", mode="w", encoding="utf-8"):
        pass