quantumbit's picture
Upload 41 files
5ff6b14 verified
raw
history blame
4.43 kB
import cv2
import pytesseract
import numpy as np
import pandas as pd
from PIL import Image, ImageFile
from typing import List, Dict, Any
# Let PIL decode images whose file data is cut short instead of raising
# an error mid-read (common with partially downloaded/scanned files).
ImageFile.LOAD_TRUNCATED_IMAGES = True
def load_local_image(path: str) -> np.ndarray:
    """Read an image from disk and return it as an OpenCV-style BGR array."""
    rgb_array = np.array(Image.open(path).convert("RGB"))
    return cv2.cvtColor(rgb_array, cv2.COLOR_RGB2BGR)
def sort_contours(cnts, method="top-to-bottom"):
    """Sort contours by bounding-box position.

    Parameters
    ----------
    cnts : sequence of contours as returned by ``cv2.findContours``.
    method : one of ``"left-to-right"``, ``"right-to-left"``,
        ``"top-to-bottom"``, ``"bottom-to-top"``.

    Returns
    -------
    tuple
        ``(sorted_cnts, bounding_boxes)`` where each bounding box is an
        ``(x, y, w, h)`` tuple. Empty input yields two empty tuples.
    """
    # Guard: zip(*sorted([])) raises "not enough values to unpack" on the
    # assignment below, so handle the empty case explicitly.
    if not cnts:
        return (), ()
    reverse = method in ("right-to-left", "bottom-to-top")
    # Sort-key index into the bounding box: 1 -> y (vertical sweeps), 0 -> x.
    i = 1 if method in ("top-to-bottom", "bottom-to-top") else 0
    boundingBoxes = [cv2.boundingRect(c) for c in cnts]
    (cnts, boundingBoxes) = zip(*sorted(zip(cnts, boundingBoxes),
                                        key=lambda b: b[1][i], reverse=reverse))
    return cnts, boundingBoxes
def extract_cells_from_grid(table_img: np.ndarray) -> pd.DataFrame:
    """Reconstruct a table from a BGR image of a ruled (gridded) table.

    Detects horizontal and vertical ruling lines with morphological
    open operations, treats line-bounded contours as cells, OCRs each
    cell, then groups cells into rows by y-position.

    Parameters
    ----------
    table_img : BGR image (as produced by ``load_local_image``).

    Returns
    -------
    pd.DataFrame
        One row per detected table row, cells ordered left-to-right and
        padded with '' to a rectangular shape. Empty if no cells found.
    """
    gray = cv2.cvtColor(table_img, cv2.COLOR_BGR2GRAY)
    # Invert before thresholding so dark ink becomes the foreground.
    _, binary = cv2.threshold(~gray, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    # Detect horizontal ruling lines: erode+dilate with a wide flat kernel.
    # max(1, ...) guards against a zero-size kernel on images narrower than
    # 15 px, which would make getStructuringElement fail.
    horizontal = binary.copy()
    horizontal_size = max(1, horizontal.shape[1] // 15)
    horizontal_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontal_size, 1))
    horizontal = cv2.erode(horizontal, horizontal_structure)
    horizontal = cv2.dilate(horizontal, horizontal_structure)

    # Detect vertical ruling lines with a tall thin kernel.
    vertical = binary.copy()
    vertical_size = max(1, vertical.shape[0] // 15)
    vertical_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, vertical_size))
    vertical = cv2.erode(vertical, vertical_structure)
    vertical = cv2.dilate(vertical, vertical_structure)

    # Combined line mask; its contours approximate the cell rectangles.
    mask = cv2.add(horizontal, vertical)
    contours, _ = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    cells = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if w > 30 and h > 20:  # Filter out line fragments and noise.
            cell_img = table_img[y:y+h, x:x+w]
            try:
                # --psm 7: treat each cell as a single line of text.
                text = pytesseract.image_to_string(cell_img, config='--psm 7').strip()
            except Exception:
                # OCR is best-effort per cell; keep the cell with empty text
                # rather than dropping the whole table. (Was a bare except,
                # which also swallowed KeyboardInterrupt/SystemExit.)
                text = ''
            cells.append({'x': x, 'y': y, 'w': w, 'h': h, 'text': text})

    # Sort cells top-to-bottom, then left-to-right.
    cells.sort(key=lambda cell: (cell['y'], cell['x']))

    # Group cells into rows; a jump of more than 20 px in y starts a new row.
    # (Renamed from `rows`, which shadowed the image-height variable above.)
    row_groups = []
    current_row = []
    current_y = 0
    for cell in cells:
        if abs(cell['y'] - current_y) > 20:
            if current_row:
                row_groups.append(current_row)
            current_row = [cell]
            current_y = cell['y']
        else:
            current_row.append(cell)
    if current_row:
        row_groups.append(current_row)

    # Flatten to text, ordering each row's cells by x, and pad short rows
    # with '' so the DataFrame is rectangular.
    table_data = []
    for row in row_groups:
        row_data = [cell['text'] for cell in sorted(row, key=lambda c: c['x'])]
        table_data.append(row_data)
    if table_data:
        max_cols = max(len(row) for row in table_data)
        for row in table_data:
            while len(row) < max_cols:
                row.append('')
        return pd.DataFrame(table_data)
    else:
        return pd.DataFrame()
def extract_image_content(image_path: str) -> str:
    """Extract text content from an image file using OCR.

    Runs plain OCR first; if the result looks tabular (pipe characters,
    tabs, or more than three lines), additionally attempts grid-based
    table reconstruction and prepends the table to the raw OCR text.

    Parameters
    ----------
    image_path : path to a local image file.

    Returns
    -------
    str
        The OCR text, a combined "[Table detected]" report, the
        placeholder "[No text detected in image]", or an error message
        of the form "[Error processing image: ...]" on failure.
    """
    try:
        img = load_local_image(image_path)
        text = pytesseract.image_to_string(img)
        # Heuristic table check. BUG FIX: the original tested for the
        # two-character literals '\\t' and '\\n', which never appear in
        # OCR output, so the multi-line branch of the heuristic was dead;
        # compare against real tab/newline characters instead.
        if '|' in text or '\t' in text or len(text.split('\n')) > 3:
            try:
                table_df = extract_cells_from_grid(img)
                if not table_df.empty:
                    # Join with real newlines/pipes (was emitting literal
                    # backslash-n sequences into the returned text).
                    table_text = "\n".join(" | ".join(row) for row in table_df.values)
                    return f"[Table detected]\n{table_text}\n\n[OCR Text]\n{text}"
            except Exception:
                # Table reconstruction is best-effort; fall back to raw OCR.
                pass
        return text.strip() if text.strip() else "[No text detected in image]"
    except Exception as e:
        return f"[Error processing image: {str(e)}]"