Sudhanshu Pandey

Update the files

a7b8c18 about 1 year ago

6.07 kB

	from dataclasses import dataclass
	from typing import Dict, List, Optional, Tuple
	import pandas as pd
	import numpy as np

	@dataclass
	class TableCell:
	    """
	    A single table cell: its text plus where it sits on the page.

	    Attributes:
	        value: Text content of the cell.
	        bbox: Bounding box of the cell as [x1, y1, x2, y2].
	        column_name: Name of the column the cell belongs to.
	    """

	    value: str
	    bbox: List[int]
	    column_name: str

	@dataclass
	class TableRow:
	    """
	    One table row: the cells collected for it and the row's extent.

	    Attributes:
	        cells: Mapping of column name to the TableCell stored for it.
	        min_x: Smallest x coordinate covered by the row.
	        max_x: Largest x coordinate covered by the row.
	        min_y: Smallest y coordinate covered by the row.
	        max_y: Largest y coordinate covered by the row.
	    """

	    cells: Dict[str, TableCell]
	    min_x: float
	    max_x: float
	    min_y: float
	    max_y: float

	class TableStructure:
	    """
	    Maintains the structure of a table as an ordered list of rows.

	    Rows are seeded from the cells of the first column. Cells of every
	    further column are merged into the row they vertically overlap, or
	    placed in a new row when they lie above or below the rows that are
	    still under consideration.
	    """

	    def __init__(self, debug: bool = False, overlap_threshold: float = 10.0) -> None:
	        """
	        Initialize the table structure.

	        Args:
	            debug: Debug flag, stored as ``self.debug``. No method of this
	                class reads it.
	            overlap_threshold: Overlap percentage (as returned by
	                ``_calculate_overlap``) that a cell must exceed to be
	                merged into an existing row.
	        """
	        self.rows: List[TableRow] = []
	        self.debug = debug
	        self.overlap_threshold = overlap_threshold

	    def build_structure(self, dataframes: Dict[str, pd.DataFrame]) -> pd.DataFrame:
	        """
	        Build table structure from column-wise dataframes.

	        Any rows left over from a previous call are discarded first, so an
	        instance can be reused for several tables.

	        Args:
	            dataframes: Dictionary of column name to DataFrame. Each
	                DataFrame is read row by row through its 'text' and
	                'boundingBox' columns; the first entry seeds the rows.

	        Returns:
	            DataFrame with one record per table row: one field per column
	            name (None where the row has no cell for it) plus
	            'row_min_x', 'row_max_x', 'row_min_y' and 'row_max_y'.
	            An empty DataFrame when ``dataframes`` is empty.
	        """
	        # Start from a clean slate; otherwise rows of an earlier table
	        # would be mixed into this one.
	        self.rows = []
	        if not dataframes:
	            return pd.DataFrame()

	        column_names = list(dataframes)
	        first_col, *remaining_cols = column_names

	        # The first column defines the initial set of rows.
	        self._initialize_rows(first_col, dataframes[first_col])

	        # Every other column is aligned against the rows found so far.
	        for col_name in remaining_cols:
	            self._process_column(col_name, dataframes[col_name])

	        return self._to_dataframe(column_names)

	    @staticmethod
	    def _make_row(column_name: str, text: str, bbox: List[int]) -> "TableRow":
	        """Create a row holding one cell, with the cell's box as row extent."""
	        return TableRow(
	            cells={column_name: TableCell(text, bbox, column_name)},
	            min_x=bbox[0],
	            max_x=bbox[2],
	            min_y=bbox[1],
	            max_y=bbox[3],
	        )

	    def _initialize_rows(self, column_name: str, df: pd.DataFrame) -> None:
	        """Append one row per record of the first column's DataFrame."""
	        for _, row in df.iterrows():
	            self.rows.append(
	                self._make_row(column_name, row['text'], row['boundingBox'])
	            )

	    def _process_column(self, column_name: str, df: pd.DataFrame) -> None:
	        """
	        Align the cells of an additional column with the existing rows.

	        Cells are handled in DataFrame order. For each cell the rows are
	        scanned from just after the row used by the previous cell:

	        * the first row overlapped by more than ``overlap_threshold``
	          percent receives the cell;
	        * otherwise, the first row lying entirely below the cell gets a
	          new row inserted in front of it;
	        * if the scan ends without either, the cell is appended as a new
	          last row when it starts at or below the last row's bottom edge
	          (or when there are no rows at all).

	        A cell that satisfies none of these conditions is left out.
	        """
	        search_idx = 0

	        for _, row in df.iterrows():
	            text = row['text']
	            bbox = row['boundingBox']

	            # Indexing instead of slicing avoids copying the row list for
	            # every cell; the list is only mutated right before `break`.
	            for idx in range(search_idx, len(self.rows)):
	                table_row = self.rows[idx]
	                # The row box reuses the cell's own x range, so only the
	                # vertical extents influence the overlap value.
	                overlap = self._calculate_overlap(
	                    bbox,
	                    [bbox[0], table_row.min_y, bbox[2], table_row.max_y]
	                )

	                if overlap > self.overlap_threshold:
	                    self._update_row(idx, column_name, text, bbox)
	                    search_idx = idx + 1
	                    break
	                if bbox[3] <= table_row.min_y:
	                    # After the insert, idx + 1 is the row that used to be
	                    # at idx, so the next cell can still match it.
	                    self._insert_row(idx, column_name, text, bbox)
	                    search_idx = idx + 1
	                    break
	            else:
	                # No row matched. An empty row list (first column had no
	                # cells) must not be indexed with [-1].
	                if not self.rows or bbox[1] >= self.rows[-1].max_y:
	                    self._append_row(column_name, text, bbox)
	                # NOTE(review): any other unmatched cell is dropped
	                # silently here, as before - confirm this is intended.

	    def _calculate_overlap(self, rect1: List[int], rect2: List[int]) -> float:
	        """
	        Calculate percentage overlap between two rectangles.

	        Args:
	            rect1: First rectangle as [x1, y1, x2, y2]
	            rect2: Second rectangle as [x1, y1, x2, y2]

	        Returns:
	            Intersection area as a percentage of the smaller rectangle's
	            area; 0.0 when the rectangles do not intersect or the smaller
	            area is not positive.
	        """
	        x_left = max(rect1[0], rect2[0])
	        y_top = max(rect1[1], rect2[1])
	        x_right = min(rect1[2], rect2[2])
	        y_bottom = min(rect1[3], rect2[3])

	        if x_right < x_left or y_bottom < y_top:
	            return 0.0

	        intersection = (x_right - x_left) * (y_bottom - y_top)
	        min_area = min(
	            (rect1[2] - rect1[0]) * (rect1[3] - rect1[1]),
	            (rect2[2] - rect2[0]) * (rect2[3] - rect2[1])
	        )

	        # Guard against division by zero for degenerate rectangles.
	        if min_area <= 0:
	            return 0.0
	        return intersection / min_area * 100

	    def _update_row(self, idx: int, column_name: str, text: str, bbox: List[int]) -> None:
	        """
	        Store a cell in the existing row at ``idx``.

	        The row's horizontal extent is widened to include the cell; its
	        vertical extent is left as it is.
	        """
	        target = self.rows[idx]
	        target.cells[column_name] = TableCell(text, bbox, column_name)
	        target.min_x = min(target.min_x, bbox[0])
	        target.max_x = max(target.max_x, bbox[2])

	    def _insert_row(self, idx: int, column_name: str, text: str, bbox: List[int]) -> None:
	        """Insert a new single-cell row at the specified index."""
	        self.rows.insert(idx, self._make_row(column_name, text, bbox))

	    def _append_row(self, column_name: str, text: str, bbox: List[int]) -> None:
	        """Append a new single-cell row at the end."""
	        self.rows.append(self._make_row(column_name, text, bbox))

	    def _to_dataframe(self, columns: List[str]) -> pd.DataFrame:
	        """
	        Convert table structure to DataFrame.

	        Args:
	            columns: Column names to emit for every row

	        Returns:
	            DataFrame with one record per row: the cell text for each
	            column (None when the row has no cell for it) followed by
	            the row's 'row_min_x', 'row_max_x', 'row_min_y', 'row_max_y'.
	        """
	        # Materialise once: the names are iterated again for every row.
	        columns = list(columns)

	        data = []
	        for row in self.rows:
	            row_data = {}
	            for col in columns:
	                cell = row.cells.get(col)
	                row_data[col] = cell.value if cell is not None else None
	            row_data.update({
	                'row_min_x': row.min_x,
	                'row_max_x': row.max_x,
	                'row_min_y': row.min_y,
	                'row_max_y': row.max_y
	            })
	            data.append(row_data)

	        return pd.DataFrame(data)