import re  # regex

import html_to_json
import numpy as np
import pandas as pd
import PIL.Image as Image
from paddleocr import PPStructure


class TableEx:
    """Extract table content from image regions with PaddleOCR's PPStructure."""

    def __init__(self):
        """Build the PPStructure engine once so every call can reuse it."""
        self.table_engine = PPStructure(
            lang='en',
            layout=False,
            show_log=True,
            use_gpu=False,
            download_models=True,
            rec=True,
        )

    def extract_table_information(self, pil_image: np.ndarray):
        """Run the table engine on an image and return the table content.

        Args:
            pil_image: Image handed unchanged to the engine. Despite the
                name, this class calls it with a NumPy array.

        Returns:
            On success, the structure produced by
            ``html_to_json.convert_tables`` from the first result's HTML,
            with empty strings and empty sub-lists removed. If that step
            fails for any reason, a space-joined string of the ``'text'``
            fields found in the first result (possibly empty).
        """
        result = self.table_engine(pil_image)
        try:
            tables = html_to_json.convert_tables(result[0]['res']['html'])
            return self.remove_empty_elements(tables)
        except Exception:
            # Deliberate best effort: any failure in the structured path
            # degrades to plain text instead of propagating.
            print('Structure extraction Failed, using fallback plain text.')
            return self._plain_text_fallback(result)

    @staticmethod
    def _plain_text_fallback(result):
        """Join the ``'text'`` fields of the first result entry, if any.

        Returns an empty string when the result is empty or its ``'res'``
        payload is not a sequence of dicts, so the fallback itself can
        never raise.
        """
        if not result or not isinstance(result[0], dict):
            return ''
        entries = result[0].get('res')
        # NOTE(review): presumably 'res' is a list of OCR line dicts in the
        # plain-text case and a dict in the table case - confirm against the
        # installed paddleocr version.
        if not isinstance(entries, (list, tuple)):
            return ''
        return ' '.join(
            str(entry['text'])
            for entry in entries
            if isinstance(entry, dict) and 'text' in entry
        )

    def remove_empty_elements(self, nested_list):
        """Recursively remove empty elements from a nested list.

        Empty strings are dropped, and so are sub-lists that end up empty
        after cleaning. Only ``list`` instances are recursed into; every
        other non-``''`` item is kept unchanged.
        """
        cleaned_list = []
        for item in nested_list:
            if isinstance(item, list):
                cleaned_sublist = self.remove_empty_elements(item)
                if cleaned_sublist:
                    cleaned_list.append(cleaned_sublist)
            elif item != '':
                cleaned_list.append(item)
        return cleaned_list

    @staticmethod
    def _clamp(value, upper):
        """Clamp a coordinate to the inclusive range ``[0, upper]`` as an int."""
        return max(0, min(int(value), upper))

    def extract_table_data(self, img_array, x1, y1, x2, y2):
        """Crop a table region out of an image and extract its content.

        Args:
            img_array: Image array indexed as ``[row, column, ...]``.
            x1, y1: Top-left corner of the region (column, row).
            x2, y2: Bottom-right corner of the region (column, row).

        Returns:
            A ``(table_image, table_data)`` tuple. ``table_image`` is the
            cropped region as a PIL image, or ``None`` if it could not be
            created. ``table_data`` is the first extracted table, or the
            fallback plain text, or a dict with ``"region"``, ``"error"``
            and ``"data"`` keys when extraction failed or the crop is empty.
        """
        height, width = img_array.shape[0], img_array.shape[1]
        # Clamp both ends: a negative end coordinate would otherwise be
        # interpreted by the slice as an offset from the far edge.
        top, bottom = self._clamp(y1, height), self._clamp(y2, height)
        left, right = self._clamp(x1, width), self._clamp(x2, width)
        table_region = img_array[top:bottom, left:right]

        region_label = f"({x1}, {y1}) to ({x2}, {y2})"
        table_images = None

        if table_region.size == 0:
            # Nothing to analyse; still hand back a well-formed pair.
            return table_images, {
                "region": region_label,
                "error": "Empty table region",
                "data": None,
            }

        try:
            # Keep the cropped image so callers can display it.
            table_images = Image.fromarray(table_region)
            extracted_info = self.extract_table_information(table_region)
            if not extracted_info:
                raise ValueError("No table content extracted")
            if isinstance(extracted_info, str):
                # Fallback plain text: return it whole, not its first char.
                table_data = extracted_info
            else:
                table_data = extracted_info[0]
        except Exception as e:
            print(f"Error extracting table data: {e}")
            table_data = {
                "region": region_label,
                "error": str(e),
                "data": None,
            }
        return table_images, table_data