File size: 3,974 Bytes
3dd785b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import ast
import json
import logging
import re

from settings import char_remove
# Configure the root logger at import time: records at INFO level and above
# are written to "app.log" as "<timestamp> - <level name> - <message>".
logging.basicConfig(filename='app.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def remove_na(string):
    """Return ``string`` with every entry of ``char_remove`` deleted.

    The entries of the ``char_remove`` setting are processed one after the
    other; all occurrences of each entry are replaced by the empty string.
    """
    cleaned = string
    for unwanted in char_remove:
        cleaned = cleaned.replace(unwanted, "")
    return cleaned

def save_json(text, filename):
    """Write ``text`` as JSON to ``<filename>.json`` and return that path.

    The ``.json`` suffix is always appended to ``filename``.  The file is
    encoded as UTF-8 and non-ASCII characters are stored verbatim instead of
    being escaped.
    """
    json_path = filename + ".json"
    with open(json_path, "w", encoding="utf-8") as handle:
        json.dump(text, handle, ensure_ascii=False)
    return json_path

def format_polygon(polygon):
    """Render a sequence of points as ``"[x, y], [x, y], ..."``.

    Every point must expose ``x`` and ``y`` attributes.  A falsy polygon
    (``None`` or empty) is rendered as ``"N/A"``.
    """
    if polygon:
        return ", ".join(f"[{point.x}, {point.y}]" for point in polygon)
    return "N/A"

def filter_tables(input_string, table_numbers):
    """Convert selected tables of a textual table dump into a JSON string.

    ``input_string`` is split on ``"Table # <digits>"`` headers; whatever
    precedes the first header is discarded.  Inside each table, lines of the
    form ``Cell[<row>][<col>] has content '<text>'`` describe the cells.

    Args:
        input_string: The textual dump containing the table sections.
        table_numbers: Iterable of zero-based positions into the list of
            table sections (the position in the text, not the number printed
            in the header).

    Returns:
        A JSON object string mapping ``"table_1"``, ``"table_2"``, ... (numbered
        in the order of ``table_numbers``) to row-major lists of cell strings.
        Cells never mentioned in the dump are ``""``.  A table section without
        any cell line becomes an empty list instead of raising.

    Raises:
        IndexError: If an entry of ``table_numbers`` is out of range.
    """
    # Drop element 0: it is the text before the first "Table # n" header.
    tables = re.split(r"Table # \d+", input_string)[1:]
    cell_pattern = re.compile(r"Cell\[(\d+)\]\[(\d+)\] has content '(.*?)'")

    json_tables = {}
    for table_counter, table_number in enumerate(table_numbers, start=1):
        cells = [
            (int(row), int(col), content)
            for row, col, content in cell_pattern.findall(tables[table_number])
        ]

        # The grid size is derived from the largest indices seen; max() on an
        # empty sequence raises, so a table without cells gets a 0x0 grid.
        if cells:
            num_rows = max(row for row, _, _ in cells) + 1
            num_cols = max(col for _, col, _ in cells) + 1
        else:
            num_rows = num_cols = 0

        table = [[""] * num_cols for _ in range(num_rows)]
        for row, col, content in cells:
            table[row][col] = content

        json_tables[f"table_{table_counter}"] = table

    return json.dumps(json_tables)

def extract_text_within_range(input_string, x_range, y_range):
    """Return the text of every line that has a polygon vertex in a rectangle.

    ``input_string`` is scanned for entries of the form
    ``Line # <n> text '<text>' within bounding polygon '[x, y], [x, y], [x, y], [x, y]'``.
    The pattern only recognises text without apostrophes and unsigned
    decimal coordinates.

    Args:
        input_string: Text containing the line entries.
        x_range: ``(min, max)`` bounds for x, both inclusive.
        y_range: ``(min, max)`` bounds for y, both inclusive.

    Returns:
        The texts, in order of appearance, of all entries for which at least
        one of the four vertices lies inside both ranges.  Entries whose
        coordinates are not valid numbers (e.g. ``1.2.3``) are skipped.
    """
    pattern = r"Line # \d+ text '([^']*)' within bounding polygon '(\[[\d.]+, [\d.]+\], \[[\d.]+, [\d.]+\], \[[\d.]+, [\d.]+\], \[[\d.]+, [\d.]+\])'"
    vertex_pattern = re.compile(r"\[([\d.]+), ([\d.]+)\]")

    output = []
    for text, polygon_str in re.findall(pattern, input_string):
        # Parse the vertices with float() rather than eval(): no code is ever
        # executed, and "[\d.]+" also matches non-numbers such as "1.2.3",
        # which should drop this one entry instead of aborting the whole scan.
        try:
            polygon = [
                (float(x), float(y))
                for x, y in vertex_pattern.findall(polygon_str)
            ]
        except ValueError:
            continue

        # One vertex inside the rectangle is enough to keep the line.
        if any(
            x_range[0] <= x <= x_range[1] and y_range[0] <= y <= y_range[1]
            for x, y in polygon
        ):
            output.append(text)

    return output

def _parse_key_value_lines(lines):
    """Build a dict from ``"key: value"`` lines, splitting at the first ``": "``."""
    pairs = {}
    for line in lines:
        key, separator, value = line.partition(": ")
        if not separator:
            logging.warning("Skipping line without a 'key: value' separator: %r", line)
            continue
        pairs[key] = value
    return pairs


def merge_strings(input_string, input_coords, extract_coords):
    """Merge coordinate overrides into a ``key: value`` listing and resolve them.

    Both ``input_string`` and ``input_coords`` hold one ``"key: value"`` entry
    per line.  For every key of ``input_string`` that also appears in
    ``input_coords``, the value from ``input_coords`` wins.  Afterwards each
    value starting with ``(`` is parsed as a pair of points
    ``((x0, y0), (x1, y1))`` and replaced by the text that
    ``extract_text_within_range`` finds in ``extract_coords`` for the x range
    ``(x0, x1)`` and y range ``(y0, y1)``.

    Args:
        input_string: Base ``key: value`` lines; defines the keys and their
            order in the result.
        input_coords: Override ``key: value`` lines; keys that do not occur in
            ``input_string`` are ignored.
        extract_coords: Text handed to ``extract_text_within_range``.

    Returns:
        The merged entries as ``"key: value"`` lines joined by newlines.  A
        coordinate value becomes the found texts joined by ``", "``, or
        ``"-||-"`` when nothing was found.  Blank lines and lines without a
        ``": "`` separator are dropped; a value that starts with ``(`` but is
        not a valid coordinate pair is kept unchanged.
    """
    # Blank lines (e.g. a trailing newline) carry no entry and used to make
    # the key/value split fail, so they are filtered from both inputs.
    base_lines = [line for line in input_string.split('\n') if line.strip()]
    override_lines = [line.strip() for line in input_coords.split('\n') if line.strip()]

    logging.info(override_lines)

    merged = _parse_key_value_lines(base_lines)
    overrides = _parse_key_value_lines(override_lines)

    # Only keys already present in the base listing are overridden.
    for key in merged:
        if key in overrides:
            merged[key] = overrides[key]

    for key, coord_str in list(merged.items()):
        if not coord_str.startswith('('):
            continue
        try:
            # literal_eval instead of eval: the value comes from input text
            # and must never be executed as code.
            coords = ast.literal_eval(coord_str)
            x_range = (coords[0][0], coords[1][0])
            y_range = (coords[0][1], coords[1][1])
        except (ValueError, SyntaxError, TypeError, IndexError):
            # Ordinary text in parentheses, or a tuple of the wrong shape.
            logging.warning("Value for %r is not a coordinate pair: %r", key, coord_str)
            continue

        text = extract_text_within_range(extract_coords, x_range, y_range)
        merged[key] = ', '.join(text) if text else '-||-'

    return '\n'.join(f"{key}: {value}" for key, value in merged.items())