File size: 5,145 Bytes
3dd785b 18626e5 3dd785b 18626e5 3dd785b 18626e5 3dd785b 18626e5 3dd785b 18626e5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
from settings import char_remove
import re
import json
import sys
import logging
class Logger:
    """Tee-style stream that forwards every write to the terminal and a file.

    Provides the ``write``/``flush``/``isatty`` methods so an instance can be
    assigned to ``sys.stdout``.
    """

    def __init__(self, filename):
        """Capture the current ``sys.stdout`` and open *filename* for logging.

        The file is emptied on construction.  It is then opened in append
        mode with an explicit UTF-8 encoding so that:

        * non-ASCII output does not depend on the platform default encoding
          (the log is read back as UTF-8 elsewhere in this file);
        * if the file is truncated through another handle while this one is
          open, later writes go to the new end of the file rather than to the
          old offset, which would typically leave NUL padding at the start.

        Args:
            filename: Path of the log file to (re)create.
        """
        self.terminal = sys.stdout
        # Truncate first to keep the "start with an empty log" behaviour,
        # then reopen for appending.
        with open(filename, "w", encoding="utf-8"):
            pass
        self.log = open(filename, "a", encoding="utf-8")

    def write(self, message):
        """Write *message* to the captured terminal stream and to the log."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both the captured terminal stream and the log file."""
        self.terminal.flush()
        self.log.flush()

    def isatty(self):
        """Always report that this stream is not an interactive terminal."""
        return False
# Replace stdout with a Logger so that everything written to it is also
# written to output.log.
sys.stdout = Logger("output.log")

# Send records from the logging module (INFO and above) to app.log,
# formatted as "<time> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='app.log',
)
def remove_na(string):
    """Return *string* with every entry of ``char_remove`` deleted.

    ``char_remove`` is imported from the project's ``settings`` module; each
    item it yields is removed from the text with ``str.replace``, in
    iteration order.
    """
    cleaned = string
    for unwanted in char_remove:
        cleaned = cleaned.replace(unwanted, "")
    return cleaned
def save_json(text, filename):
    """Dump *text* as JSON to ``<filename>.json`` and return that path.

    The file is written as UTF-8 and non-ASCII characters are stored as-is
    rather than as ``\\uXXXX`` escapes.

    Args:
        text: Any JSON-serialisable object.
        filename: Target path without the ``.json`` suffix.

    Returns:
        The path that was written, including the ``.json`` suffix.
    """
    json_path = filename + ".json"
    with open(json_path, mode="w", encoding='utf-8') as handle:
        json.dump(text, handle, ensure_ascii=False)
    return json_path
def format_polygon(polygon):
    """Render a sequence of points as ``"[x, y], [x, y], ..."``.

    Args:
        polygon: Iterable of objects exposing ``x`` and ``y`` attributes, or
            a falsy value (``None``, empty list).

    Returns:
        The formatted string, or ``"N/A"`` when *polygon* is falsy.
    """
    if polygon:
        return ", ".join(f"[{point.x}, {point.y}]" for point in polygon)
    return "N/A"
def filter_tables(input_string, table_numbers):
    """Extract selected tables from a text dump and return them as JSON.

    The input is split on ``Table # <n>`` headers.  Within each section,
    lines of the form ``Cell[<row>][<col>] has content '<text>'`` are
    collected into a rectangular grid; positions without a cell are left as
    empty strings.

    Args:
        input_string: Text containing the ``Table # <n>`` sections.
        table_numbers: Iterable of indexes into the list of sections, in
            order of appearance (0 is the first section).  The number that
            appears in the header itself is not used for the lookup.

    Returns:
        A JSON string mapping ``"table_1"``, ``"table_2"``, ... (numbered by
        position in *table_numbers*) to a list of rows.  A selected section
        that contains no cells is emitted as an empty list instead of
        failing on ``max()`` of an empty sequence.

    Raises:
        IndexError: If an entry of *table_numbers* is out of range.  The
            error is logged and printed before being re-raised.
    """
    # [1:] drops whatever precedes the first "Table # <n>" header.
    tables = re.split(r"Table # \d+", input_string)[1:]
    json_tables = {}

    for table_counter, table_number in enumerate(table_numbers, start=1):
        try:
            table_str = tables[table_number]
        except IndexError as e:
            logging.error(f"Error: {e}, Please check document configuration or document type")
            print(f"Error: {e}, Please check document configuration or document type")
            raise

        # NOTE: the non-greedy '(.*?)' stops at the first apostrophe, so cell
        # text that itself contains an apostrophe is cut short there.
        cells = [
            (int(row), int(col), content)
            for row, col, content in re.findall(
                r"Cell\[(\d+)\]\[(\d+)\] has content '(.*?)'", table_str
            )
        ]

        if not cells:
            # No cells in this section: record an empty table rather than
            # letting max() raise ValueError below.
            json_tables[f"table_{table_counter}"] = []
            continue

        # Grid size is derived from the largest row/column index seen.
        num_rows = max(row for row, _, _ in cells) + 1
        num_cols = max(col for _, col, _ in cells) + 1

        table = [["" for _ in range(num_cols)] for _ in range(num_rows)]
        for row, col, content in cells:
            table[row][col] = content

        json_tables[f"table_{table_counter}"] = table

    return json.dumps(json_tables)
def extract_text_within_range(input_string, x_range, y_range):
    """Return the text of every line whose polygon touches a rectangle.

    Lines are recognised by the pattern
    ``Line # <n> text '<text>' within bounding polygon '[x, y], [x, y],
    [x, y], [x, y]'``.  A line is selected as soon as any one of its four
    vertices lies inside the rectangle, bounds included.

    Args:
        input_string: Text containing the ``Line # ...`` records.
        x_range: ``(low, high)`` bounds for the x coordinate.
        y_range: ``(low, high)`` bounds for the y coordinate.

    Returns:
        List of matching line texts, in order of appearance.  Because the
        pattern uses ``[^']*`` for the text, records whose text contains an
        apostrophe do not match and are therefore not returned.
    """
    # Function-scope import: ast is not among the module-level imports
    # visible in this file.
    import ast

    pattern = r"Line # \d+ text '([^']*)' within bounding polygon '(\[[\d.]+, [\d.]+\], \[[\d.]+, [\d.]+\], \[[\d.]+, [\d.]+\], \[[\d.]+, [\d.]+\])'"

    output = []
    for text, polygon_str in re.findall(pattern, input_string):
        # SECURITY: this used to be eval().  The pattern already restricts
        # the captured text to digits, dots, brackets, commas and spaces,
        # but literal_eval parses the coordinate list without being able to
        # execute code, so safety no longer depends on the regex.
        polygon = ast.literal_eval(polygon_str)
        if any(
            x_range[0] <= x <= x_range[1] and y_range[0] <= y <= y_range[1]
            for x, y in polygon
        ):
            output.append(text)
    return output
def _parse_key_value_lines(lines):
    """Map the text before the first ``": "`` of each line to the rest."""
    pairs = {}
    for line in lines:
        # maxsplit=1 keeps values that themselves contain ": " intact.
        parts = line.split(": ", 1)
        # Indexing (not unpacking) keeps IndexError as the failure mode for
        # a line that has no ": " separator.
        pairs[parts[0]] = parts[1]
    return pairs


def merge_strings(input_string, input_coords, extract_coords):
    """Overlay configured values on ``key: value`` text and resolve regions.

    Both *input_string* and *input_coords* are parsed as ``key: value``
    lines.  For every key of *input_string* that also appears in
    *input_coords*, the value from *input_coords* wins.  Afterwards, any
    value starting with ``(`` is parsed as a pair of points
    ``((x0, y0), (x1, y1))`` and replaced by the text found inside that
    region via ``extract_text_within_range`` (joined with ``", "``), or by
    ``'-||-'`` when nothing is found.

    Blank lines in either input are skipped; lines of *input_coords* are
    also stripped of surrounding whitespace.  An *input_string* with no
    non-blank lines yields an empty result.

    Args:
        input_string: ``key: value`` lines, one per line.
        input_coords: ``key: value`` lines whose values override those of
            *input_string* for matching keys.
        extract_coords: Text passed to ``extract_text_within_range`` as the
            source of line records.

    Returns:
        The merged ``key: value`` lines joined with newlines.

    Raises:
        IndexError: If a non-blank line has no ``": "`` separator.  The
            error is logged and printed before being re-raised.
    """
    # Function-scope import: ast is not among the module-level imports
    # visible in this file.
    import ast

    # Blank lines carry no key/value pair; skip them here just as they are
    # skipped for input_coords, so a trailing newline is not an error.
    lines1 = [line for line in input_string.split('\n') if line.strip()]
    lines2 = [line.strip() for line in input_coords.split('\n') if line.strip()]

    try:
        dict1 = _parse_key_value_lines(lines1)
        dict2 = _parse_key_value_lines(lines2)
    except IndexError as e:
        logging.error(f"Error: {e}, Please check document configuration or document type")
        print(f"Error: {e}, Please check document configuration or document type")
        raise

    # Values from input_coords take precedence for keys present in both.
    for key in dict1:
        if key in dict2:
            dict1[key] = dict2[key]

    # Iterate over a snapshot because values are replaced inside the loop.
    for key, coord_str in list(dict1.items()):
        if coord_str.startswith('('):
            # SECURITY: this used to be eval().  literal_eval accepts the
            # same tuple literals but cannot execute arbitrary expressions.
            coords = ast.literal_eval(coord_str)
            # The first point supplies the lower bounds and the second the
            # upper bounds of the search rectangle.
            x_range = (coords[0][0], coords[1][0])
            y_range = (coords[0][1], coords[1][1])
            text = extract_text_within_range(extract_coords, x_range, y_range)
            dict1[key] = ', '.join(text) if text else '-||-'

    return '\n'.join(f"{key}: {value}" for key, value in dict1.items())
def read_logs():
    """Return the last 100 lines of ``output.log`` as a single string."""
    # Flush stdout first; in this file stdout is replaced by a Logger whose
    # flush() also flushes the output.log handle.
    sys.stdout.flush()
    with open("output.log", mode="r", encoding="utf-8") as log_file:
        all_lines = list(log_file)
    tail = all_lines[-100:]
    return ''.join(tail)
def clear_logs():
    """Empty ``output.log``, creating it if it does not exist."""
    # Opening in "w" mode truncates the file; nothing needs to be written.
    with open("output.log", mode="w", encoding="utf-8"):
        pass