quantumbit's picture
Upload 41 files
5ff6b14 verified
raw
history blame
4.43 kB
import cv2
import pytesseract
import numpy as np
import pandas as pd
from PIL import Image, ImageFile
from typing import List, Dict, Any
# Let PIL decode images whose file data is cut short instead of raising
# an error mid-read (common with partially downloaded/scanned files).
ImageFile.LOAD_TRUNCATED_IMAGES = True
def load_local_image(path: str) -> np.ndarray:
    """Read an image from disk and return it as an OpenCV-style BGR array."""
    rgb_array = np.array(Image.open(path).convert("RGB"))
    return cv2.cvtColor(rgb_array, cv2.COLOR_RGB2BGR)
def sort_contours(cnts, method="top-to-bottom"):
    """Sort contours by bounding-box position.

    Parameters
    ----------
    cnts : sequence of contours as returned by ``cv2.findContours``.
    method : one of ``"left-to-right"``, ``"right-to-left"``,
        ``"top-to-bottom"``, ``"bottom-to-top"``.

    Returns
    -------
    tuple
        ``(sorted_cnts, bounding_boxes)`` where each bounding box is an
        ``(x, y, w, h)`` tuple. Empty input yields two empty tuples.
    """
    # Guard: zip(*sorted([])) raises "not enough values to unpack" on the
    # assignment below, so handle the empty case explicitly.
    if not cnts:
        return (), ()
    reverse = method in ("right-to-left", "bottom-to-top")
    # Sort-key index into the bounding box: 1 -> y (vertical sweeps), 0 -> x.
    i = 1 if method in ("top-to-bottom", "bottom-to-top") else 0
    boundingBoxes = [cv2.boundingRect(c) for c in cnts]
    (cnts, boundingBoxes) = zip(*sorted(zip(cnts, boundingBoxes),
                                        key=lambda b: b[1][i], reverse=reverse))
    return cnts, boundingBoxes
def extract_cells_from_grid(table_img: np.ndarray) -> pd.DataFrame:
    """Reconstruct a table from a BGR image of a ruled (gridded) table.

    Detects horizontal and vertical ruling lines with morphological
    open operations, treats line-bounded contours as cells, OCRs each
    cell, then groups cells into rows by y-position.

    Parameters
    ----------
    table_img : BGR image (as produced by ``load_local_image``).

    Returns
    -------
    pd.DataFrame
        One row per detected table row, cells ordered left-to-right and
        padded with '' to a rectangular shape. Empty if no cells found.
    """
    gray = cv2.cvtColor(table_img, cv2.COLOR_BGR2GRAY)
    # Invert before thresholding so dark ink becomes the foreground.
    _, binary = cv2.threshold(~gray, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    # Detect horizontal ruling lines: erode+dilate with a wide flat kernel.
    # max(1, ...) guards against a zero-size kernel on images narrower than
    # 15 px, which would make getStructuringElement fail.
    horizontal = binary.copy()
    horizontal_size = max(1, horizontal.shape[1] // 15)
    horizontal_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontal_size, 1))
    horizontal = cv2.erode(horizontal, horizontal_structure)
    horizontal = cv2.dilate(horizontal, horizontal_structure)

    # Detect vertical ruling lines with a tall thin kernel.
    vertical = binary.copy()
    vertical_size = max(1, vertical.shape[0] // 15)
    vertical_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, vertical_size))
    vertical = cv2.erode(vertical, vertical_structure)
    vertical = cv2.dilate(vertical, vertical_structure)

    # Combined line mask; its contours approximate the cell rectangles.
    mask = cv2.add(horizontal, vertical)
    contours, _ = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    cells = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if w > 30 and h > 20:  # Filter out line fragments and noise.
            cell_img = table_img[y:y+h, x:x+w]
            try:
                # --psm 7: treat each cell as a single line of text.
                text = pytesseract.image_to_string(cell_img, config='--psm 7').strip()
            except Exception:
                # OCR is best-effort per cell; keep the cell with empty text
                # rather than dropping the whole table. (Was a bare except,
                # which also swallowed KeyboardInterrupt/SystemExit.)
                text = ''
            cells.append({'x': x, 'y': y, 'w': w, 'h': h, 'text': text})

    # Sort cells top-to-bottom, then left-to-right.
    cells.sort(key=lambda cell: (cell['y'], cell['x']))

    # Group cells into rows; a jump of more than 20 px in y starts a new row.
    # (Renamed from `rows`, which shadowed the image-height variable above.)
    row_groups = []
    current_row = []
    current_y = 0
    for cell in cells:
        if abs(cell['y'] - current_y) > 20:
            if current_row:
                row_groups.append(current_row)
            current_row = [cell]
            current_y = cell['y']
        else:
            current_row.append(cell)
    if current_row:
        row_groups.append(current_row)

    # Flatten to text, ordering each row's cells by x, and pad short rows
    # with '' so the DataFrame is rectangular.
    table_data = []
    for row in row_groups:
        row_data = [cell['text'] for cell in sorted(row, key=lambda c: c['x'])]
        table_data.append(row_data)
    if table_data:
        max_cols = max(len(row) for row in table_data)
        for row in table_data:
            while len(row) < max_cols:
                row.append('')
        return pd.DataFrame(table_data)
    else:
        return pd.DataFrame()
def extract_image_content(image_path: str) -> str:
    """Extract text content from an image file using OCR.

    Runs plain OCR first; if the result looks tabular (pipe characters,
    tabs, or more than three lines), additionally attempts grid-based
    table reconstruction and prepends the table to the raw OCR text.

    Parameters
    ----------
    image_path : path to a local image file.

    Returns
    -------
    str
        The OCR text, a combined "[Table detected]" report, the
        placeholder "[No text detected in image]", or an error message
        of the form "[Error processing image: ...]" on failure.
    """
    try:
        img = load_local_image(image_path)
        text = pytesseract.image_to_string(img)
        # Heuristic table check. BUG FIX: the original tested for the
        # two-character literals '\\t' and '\\n', which never appear in
        # OCR output, so the multi-line branch of the heuristic was dead;
        # compare against real tab/newline characters instead.
        if '|' in text or '\t' in text or len(text.split('\n')) > 3:
            try:
                table_df = extract_cells_from_grid(img)
                if not table_df.empty:
                    # Join with real newlines/pipes (was emitting literal
                    # backslash-n sequences into the returned text).
                    table_text = "\n".join(" | ".join(row) for row in table_df.values)
                    return f"[Table detected]\n{table_text}\n\n[OCR Text]\n{text}"
            except Exception:
                # Table reconstruction is best-effort; fall back to raw OCR.
                pass
        return text.strip() if text.strip() else "[No text detected in image]"
    except Exception as e:
        return f"[Error processing image: {str(e)}]"