import ast
import json
import logging
import re
import sys

from settings import char_remove


class Logger:
    """File-like object that duplicates everything written to it.

    Each message is written both to the stream that was ``sys.stdout`` at
    construction time and to a log file, which is truncated on creation.
    """

    def __init__(self, filename):
        """Capture the current ``sys.stdout`` and open *filename* for writing."""
        self.terminal = sys.stdout
        # Explicit UTF-8 so the file matches what read_logs()/clear_logs()
        # use; the platform default encoding may differ (e.g. on Windows).
        # The handle is deliberately kept open for the life of the object.
        self.log = open(filename, "w", encoding="utf-8")

    def write(self, message):
        """Write *message* to both the original stream and the log file."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both underlying streams."""
        self.terminal.flush()
        self.log.flush()

    def isatty(self):
        """Always report a non-interactive stream."""
        return False


# Module-level side effects: from import time on, everything printed is also
# written to output.log, and the logging module writes to app.log.
sys.stdout = Logger("output.log")
logging.basicConfig(filename='app.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def remove_na(string):
    """Return *string* with every entry of ``char_remove`` deleted.

    ``char_remove`` comes from the project's ``settings`` module; it is
    presumably an iterable of substrings -- confirm against settings.
    """
    for char in char_remove:
        string = string.replace(char, "")
    return string


def save_json(text, filename):
    """Serialise *text* as JSON into ``<filename>.json`` and return that path.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) in UTF-8.
    """
    filename = filename + ".json"
    with open(filename, "w", encoding='utf-8') as outfile:
        json.dump(text, outfile, ensure_ascii=False)
    return filename


def format_polygon(polygon):
    """Render a sequence of points as ``"[x, y], [x, y], ..."``.

    Each point must expose ``.x`` and ``.y`` attributes. A falsy *polygon*
    (``None`` or empty) yields the string ``"N/A"``.
    """
    if not polygon:
        return "N/A"
    return ", ".join(["[{}, {}]".format(p.x, p.y) for p in polygon])


def filter_tables(input_string, table_numbers):
    """Extract selected tables from a textual dump and return them as JSON.

    *input_string* is split on ``"Table # <n>"`` headers. Each entry of
    *table_numbers* is a 0-based index into the resulting list of table
    sections (order of appearance, not the number printed in the header).
    Cells are read from ``Cell[r][c] has content '...'`` fragments.

    Returns a JSON string mapping ``"table_1"``, ``"table_2"``, ... (in the
    order given by *table_numbers*) to a row-major list of lists of strings.
    A selected table without any cell yields an empty list.

    Raises:
        IndexError: if a requested table index does not exist; the error is
            logged and printed before being re-raised.
    """
    # [1:] drops whatever precedes the first "Table # n" header.
    tables = re.split(r"Table # \d+", input_string)[1:]
    json_tables = {}

    for table_counter, table_number in enumerate(table_numbers, start=1):
        # Picking the specific table
        try:
            table_str = tables[table_number]
        except (IndexError, UnboundLocalError) as e:
            logging.error(f"Error: {e}, Please check document configuration or document type")
            print(f"Error: {e}, Please check document configuration or document type")
            raise

        # Extracting cell coordinates and contents
        cells = re.findall(r"Cell\[(\d+)\]\[(\d+)\] has content '(.*?)'", table_str)

        # Table dimensions are the largest row/column index plus one.
        # default=-1 gives a 0x0 table instead of a ValueError when the
        # section contains no cells at all.
        num_rows = max((int(cell[0]) for cell in cells), default=-1) + 1
        num_cols = max((int(cell[1]) for cell in cells), default=-1) + 1

        # Initialize table with empty strings, then fill the known cells.
        table = [["" for _ in range(num_cols)] for _ in range(num_rows)]
        for row, col, content in cells:
            table[int(row)][int(col)] = content

        json_tables[f"table_{table_counter}"] = table

    # Converting the dictionary to a JSON string
    return json.dumps(json_tables)


def extract_text_within_range(input_string, x_range, y_range):
    """Return the text of every line whose polygon touches the given box.

    *input_string* is scanned for fragments of the form
    ``Line # n text '...' within bounding polygon '[x, y], [x, y], [x, y], [x, y]'``.
    A line is selected when at least one of its four vertices satisfies
    ``x_range[0] <= x <= x_range[1]`` and ``y_range[0] <= y <= y_range[1]``.

    Returns the matching texts as a list, in order of appearance.
    """
    pattern = r"Line # \d+ text '([^']*)' within bounding polygon '(\[[\d.]+, [\d.]+\], \[[\d.]+, [\d.]+\], \[[\d.]+, [\d.]+\], \[[\d.]+, [\d.]+\])'"
    matches = re.findall(pattern, input_string)

    x_min, x_max = x_range[0], x_range[1]
    y_min, y_max = y_range[0], y_range[1]

    output = []
    for text, polygon_str in matches:
        # The regex only admits digits, dots, brackets, commas and spaces,
        # so a literal parser is sufficient and avoids eval().
        polygon = ast.literal_eval(polygon_str)
        # If any coordinate is within range, add the text to the output.
        if any(x_min <= x <= x_max and y_min <= y <= y_max for (x, y) in polygon):
            output.append(text)
    return output


def _parse_key_value_lines(lines):
    """Build a dict from ``"key: value"`` lines, splitting on the first ``": "``.

    Raises IndexError for a line that contains no ``": "`` separator.
    """
    pairs = {}
    for line in lines:
        parts = line.split(": ", 1)
        pairs[parts[0]] = parts[1]
    return pairs


def merge_strings(input_string, input_coords, extract_coords):
    """Overlay coordinate-based values onto a ``key: value`` text block.

    Steps:
      1. Parse *input_string* and *input_coords* as newline-separated
         ``key: value`` lines (blank lines are ignored).
      2. For keys present in both, take the value from *input_coords*.
      3. For every value that starts with ``(``, evaluate it as a pair of
         points ``((x0, y0), (x1, y1))``, look up the text lines of
         *extract_coords* that fall inside that box (see
         ``extract_text_within_range``) and replace the value with the
         comma-joined texts, or ``'-||-'`` when nothing was found.

    Returns the updated block as newline-joined ``key: value`` lines.

    Raises:
        IndexError: if a non-blank line lacks the ``": "`` separator; the
            error is logged and printed before being re-raised.
    """
    # Skip blank lines on both inputs; only the coordinate lines are
    # stripped, so keys/values of input_string keep their exact spacing.
    lines1 = [line for line in input_string.split('\n') if line.strip()]
    lines2 = [line.strip() for line in input_coords.split('\n') if line.strip()]

    # Creating dictionaries to store the key-value pairs
    try:
        dict1 = _parse_key_value_lines(lines1)
        dict2 = _parse_key_value_lines(lines2)
    except (IndexError, UnboundLocalError) as e:
        logging.error(f"Error: {e}, Please check document configuration or document type")
        print(f"Error: {e}, Please check document configuration or document type")
        raise

    # Updating the values in dict1 with the ones from dict2 if they share the same key
    for key in dict1:
        if key in dict2:
            dict1[key] = dict2[key]

    for key, coord_str in dict1.items():
        if not coord_str.startswith('('):  # only tuple-like values are coordinates
            continue

        # SECURITY NOTE(review): eval() executes arbitrary expressions. This
        # is only acceptable while input_coords comes from trusted
        # configuration; if the values are always plain literals, switch to
        # ast.literal_eval().
        coords = eval(coord_str)

        # Convert the two corner points into x and y ranges
        x_range = (coords[0][0], coords[1][0])
        y_range = (coords[0][1], coords[1][1])

        text = extract_text_within_range(extract_coords, x_range, y_range)

        # Update the dictionary with the extracted text or '-||-' if empty
        dict1[key] = ', '.join(text) if text else '-||-'

    # Constructing the updated string
    return '\n'.join([f"{key}: {value}" for key, value in dict1.items()])


def read_logs():
    """Flush ``sys.stdout`` and return the last 100 lines of ``output.log``."""
    sys.stdout.flush()
    with open("output.log", "r", encoding="utf-8") as f:
        lines = f.readlines()
    return ''.join(lines[-100:])


def clear_logs():
    """Truncate ``output.log`` to an empty file."""
    with open("output.log", "w", encoding="utf-8") as f:
        f.write("")