Spaces:

Zana897465
/

Sparrow

Configuration error

App Files Files Community

Sparrow / sparrow_parse /processors /table_structure_processor.py

Zana897465

Upload 24 files

05e6f93 verified 9 months ago

raw

history blame contribute delete

9.79 kB

	from rich.progress import Progress, SpinnerColumn, TextColumn
	from rich import print
	from transformers import AutoModelForObjectDetection
	import torch
	from PIL import Image
	from torchvision import transforms
	import os


	class TableDetector(object):
	_model = None # Static variable to hold the table detection model
	_device = None # Static variable to hold the device information

	def __init__(self):
	pass

	class MaxResize(object):
	def __init__(self, max_size=800):
	self.max_size = max_size

	def __call__(self, image):
	width, height = image.size
	current_max_size = max(width, height)
	scale = self.max_size / current_max_size
	resized_image = image.resize((int(round(scale * width)), int(round(scale * height))))

	return resized_image

	@classmethod
	def _initialize_model(cls, invoke_pipeline_step, local):
	"""
	Static method to initialize the table detection model if not already initialized.
	"""
	if cls._model is None:
	# Use invoke_pipeline_step to load the model
	cls._model, cls._device = invoke_pipeline_step(
	lambda: cls.load_table_detection_model(),
	"Loading table detection model...",
	local
	)
	print("Table detection model initialized.")


	def detect_tables(self, file_path, local=True, debug_dir=None, debug=False):
	# Ensure the model is initialized using invoke_pipeline_step
	self._initialize_model(self.invoke_pipeline_step, local)

	# Use the static model and device
	model, device = self._model, self._device

	outputs, image = self.invoke_pipeline_step(
	lambda: self.prepare_image(file_path, model, device),
	"Preparing image for table detection...",
	local
	)

	objects = self.invoke_pipeline_step(
	lambda: self.identify_tables(model, outputs, image),
	"Identifying tables in the image...",
	local
	)

	cropped_tables = self.invoke_pipeline_step(
	lambda: self.crop_tables(file_path, image, objects, debug, debug_dir),
	"Cropping tables from the image...",
	local
	)

	return cropped_tables


	@staticmethod
	def load_table_detection_model():
	model = AutoModelForObjectDetection.from_pretrained("microsoft/table-transformer-detection", revision="no_timm")

	device = "cuda" if torch.cuda.is_available() else "cpu"
	model.to(device)

	return model, device


	def prepare_image(self, file_path, model, device):
	image = Image.open(file_path).convert("RGB")

	detection_transform = transforms.Compose([
	self.MaxResize(800),
	transforms.ToTensor(),
	transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
	])

	pixel_values = detection_transform(image).unsqueeze(0)
	pixel_values = pixel_values.to(device)

	with torch.no_grad():
	outputs = model(pixel_values)

	return outputs, image

	def identify_tables(self, model, outputs, image):
	id2label = model.config.id2label
	id2label[len(model.config.id2label)] = "no object"

	objects = self.outputs_to_objects(outputs, image.size, id2label)
	return objects


	def crop_tables(self, file_path, image, objects, debug, debug_dir):
	tokens = []
	detection_class_thresholds = {
	"table": 0.5,
	"table rotated": 0.5,
	"no object": 10
	}
	crop_padding = 30

	tables_crops = self.objects_to_crops(image, tokens, objects, detection_class_thresholds, padding=crop_padding)

	cropped_tables = []

	if len(tables_crops) == 0:
	if debug:
	print("No tables detected in: ", file_path)

	return None
	elif len(tables_crops) > 1:
	for i, table_crop in enumerate(tables_crops):
	if debug:
	print("Table detected in:", file_path, "-", i + 1)

	cropped_table = table_crop['image'].convert("RGB")
	cropped_tables.append(cropped_table)

	if debug_dir:
	file_name_table = self.append_filename(file_path, debug_dir, f"table_cropped_{i + 1}")
	cropped_table.save(file_name_table)
	else:
	if debug:
	print("Table detected in: ", file_path)

	cropped_table = tables_crops[0]['image'].convert("RGB")
	cropped_tables.append(cropped_table)

	if debug_dir:
	file_name_table = self.append_filename(file_path, debug_dir, "table_cropped")
	cropped_table.save(file_name_table)

	return cropped_tables

	# for output bounding box post-processing
	@staticmethod
	def box_cxcywh_to_xyxy(x):
	x_c, y_c, w, h = x.unbind(-1)
	b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)]
	return torch.stack(b, dim=1)

	def rescale_bboxes(self, out_bbox, size):
	img_w, img_h = size
	b = self.box_cxcywh_to_xyxy(out_bbox)
	b = b * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)
	return b

	def outputs_to_objects(self, outputs, img_size, id2label):
	m = outputs.logits.softmax(-1).max(-1)
	pred_labels = list(m.indices.detach().cpu().numpy())[0]
	pred_scores = list(m.values.detach().cpu().numpy())[0]
	pred_bboxes = outputs['pred_boxes'].detach().cpu()[0]
	pred_bboxes = [elem.tolist() for elem in self.rescale_bboxes(pred_bboxes, img_size)]

	objects = []
	for label, score, bbox in zip(pred_labels, pred_scores, pred_bboxes):
	class_label = id2label[int(label)]
	if not class_label == 'no object':
	objects.append({'label': class_label, 'score': float(score),
	'bbox': [float(elem) for elem in bbox]})

	return objects

	def objects_to_crops(self, img, tokens, objects, class_thresholds, padding=10):
	"""
	Process the bounding boxes produced by the table detection model into
	cropped table images and cropped tokens.
	"""

	table_crops = []
	for obj in objects:
	if obj['score'] < class_thresholds[obj['label']]:
	continue

	cropped_table = {}

	bbox = obj['bbox']
	bbox = [bbox[0] - padding, bbox[1] - padding, bbox[2] + padding, bbox[3] + padding]

	cropped_img = img.crop(bbox)

	table_tokens = [token for token in tokens if self.iob(token['bbox'], bbox) >= 0.5]
	for token in table_tokens:
	token['bbox'] = [token['bbox'][0] - bbox[0],
	token['bbox'][1] - bbox[1],
	token['bbox'][2] - bbox[0],
	token['bbox'][3] - bbox[1]]

	# If table is predicted to be rotated, rotate cropped image and tokens/words:
	if obj['label'] == 'table rotated':
	cropped_img = cropped_img.rotate(270, expand=True)
	for token in table_tokens:
	bbox = token['bbox']
	bbox = [cropped_img.size[0] - bbox[3] - 1,
	bbox[0],
	cropped_img.size[0] - bbox[1] - 1,
	bbox[2]]
	token['bbox'] = bbox

	cropped_table['image'] = cropped_img
	cropped_table['tokens'] = table_tokens

	table_crops.append(cropped_table)

	return table_crops


	@staticmethod
	def append_filename(file_path, debug_dir, word):
	directory, filename = os.path.split(file_path)
	name, ext = os.path.splitext(filename)
	new_filename = f"{name}_{word}{ext}"
	return os.path.join(debug_dir, new_filename)

	@staticmethod
	def iob(boxA, boxB):
	# Determine the coordinates of the intersection rectangle
	xA = max(boxA[0], boxB[0])
	yA = max(boxA[1], boxB[1])
	xB = min(boxA[2], boxB[2])
	yB = min(boxA[3], boxB[3])

	# Compute the area of intersection rectangle
	interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)

	# Compute the area of both the prediction and ground-truth rectangles
	boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
	boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)

	# Compute the intersection over box (IoB)
	iob = interArea / float(boxAArea)

	return iob


	@staticmethod
	def invoke_pipeline_step(task_call, task_description, local):
	if local:
	with Progress(
	SpinnerColumn(),
	TextColumn("[progress.description]{task.description}"),
	transient=False,
	) as progress:
	progress.add_task(description=task_description, total=None)
	ret = task_call()
	else:
	print(task_description)
	ret = task_call()

	return ret


	if __name__ == "__main__":
	table_detector = TableDetector()

	# file_path = "/Users/andrejb/Work/katana-git/sparrow/sparrow-ml/llm/data/bonds_table.png"
	# cropped_tables = table_detector.detect_tables(file_path, local=True, debug_dir="/Users/andrejb/Work/katana-git/sparrow/sparrow-ml/llm/data/", debug=True)

	# for i, cropped_table in enumerate(cropped_tables):
	# file_name_table = table_detector.append_filename(file_path, "cropped_" + str(i))
	# cropped_table.save(file_name_table)