import cv2
import pytesseract
import numpy as np
import pandas as pd
from PIL import Image, ImageFile
from typing import List, Dict, Any

ImageFile.LOAD_TRUNCATED_IMAGES = True


def load_local_image(path: str) -> np.ndarray:
    """Load an image from a local path and return it as a BGR OpenCV array."""
    img = Image.open(path).convert("RGB")
    return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)


def sort_contours(cnts, method="top-to-bottom"):
    """Sort contours based on the specified method."""
    reverse = False
    # Sort on the y-coordinate for vertical orderings, on the x-coordinate otherwise
    i = 1 if method in ("top-to-bottom", "bottom-to-top") else 0
    if method in ("right-to-left", "bottom-to-top"):
        reverse = True
    bounding_boxes = [cv2.boundingRect(c) for c in cnts]
    cnts, bounding_boxes = zip(*sorted(zip(cnts, bounding_boxes),
                                       key=lambda b: b[1][i], reverse=reverse))
    return cnts, bounding_boxes


def extract_cells_from_grid(table_img: np.ndarray) -> pd.DataFrame:
    """Extract table structure from an image of a ruled (gridded) table using OpenCV."""
    gray = cv2.cvtColor(table_img, cv2.COLOR_BGR2GRAY)
    # Invert so lines and text are white, then binarize (Otsu picks the threshold; 128 is ignored)
    _, binary = cv2.threshold(~gray, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    # Detect horizontal lines with a wide, flat structuring element
    horizontal = binary.copy()
    cols = horizontal.shape[1]
    horizontal_size = cols // 15
    horizontal_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontal_size, 1))
    horizontal = cv2.erode(horizontal, horizontal_structure)
    horizontal = cv2.dilate(horizontal, horizontal_structure)

    # Detect vertical lines with a tall, narrow structuring element
    vertical = binary.copy()
    img_rows = vertical.shape[0]
    vertical_size = img_rows // 15
    vertical_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, vertical_size))
    vertical = cv2.erode(vertical, vertical_structure)
    vertical = cv2.dilate(vertical, vertical_structure)

    # Combine line masks and find the rectangles they enclose
    mask = cv2.add(horizontal, vertical)
    contours, _ = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    cells = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if w > 30 and h > 20:  # Filter out small contours (line fragments, noise)
            cell_img = table_img[y:y + h, x:x + w]
            try:
                # --psm 7 treats each cell crop as a single line of text
                text = pytesseract.image_to_string(cell_img, config='--psm 7').strip()
                cells.append({'x': x, 'y': y, 'w': w, 'h': h, 'text': text})
            except Exception:
                cells.append({'x': x, 'y': y, 'w': w, 'h': h, 'text': ''})

    # Sort cells by position to recover the table's reading order
    cells.sort(key=lambda cell: (cell['y'], cell['x']))

    # Group cells into rows
    rows = []
    current_row = []
    current_y = 0
    for cell in cells:
        if abs(cell['y'] - current_y) > 20:  # New row threshold (pixels)
            if current_row:
                rows.append(current_row)
            current_row = [cell]
            current_y = cell['y']
        else:
            current_row.append(cell)
    if current_row:
        rows.append(current_row)

    # Convert to a DataFrame, padding short rows so all rows have the same width
    table_data = []
    for row in rows:
        row_data = [cell['text'] for cell in sorted(row, key=lambda c: c['x'])]
        table_data.append(row_data)

    if table_data:
        max_cols = max(len(row) for row in table_data)
        for row in table_data:
            while len(row) < max_cols:
                row.append('')
        return pd.DataFrame(table_data)
    return pd.DataFrame()


def extract_image_content(image_path: str) -> str:
    """Extract text content from an image using OCR, with a table-aware fallback."""
    try:
        img = load_local_image(image_path)

        # Basic OCR on the full image (pytesseract expects RGB, so convert from BGR)
        text = pytesseract.image_to_string(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

        # Heuristic table detection: pipes, tabs, or more than a few lines of text
        if '|' in text or '\t' in text or len(text.split('\n')) > 3:
            try:
                table_df = extract_cells_from_grid(img)
                if not table_df.empty:
                    table_text = "\n".join(" | ".join(row) for row in table_df.values)
                    return f"[Table detected]\n{table_text}\n\n[OCR Text]\n{text}"
            except Exception:
                pass

        return text.strip() if text.strip() else "[No text detected in image]"
    except Exception as e:
        return f"[Error processing image: {e}]"
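

# Minimal usage sketch, assuming Tesseract is installed and on the PATH.
# "sample_table.png" is a placeholder path used only for illustration.
if __name__ == "__main__":
    content = extract_image_content("sample_table.png")
    print(content)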