Instructions to use Sudhanshu1304/table-extraction with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PaddleOCR
How to use Sudhanshu1304/table-extraction with PaddleOCR:
# Please refer to the documentation for information on how to use the model:
# https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/module_usage/module_overview.html
- Notebooks
- Google Colab
- Kaggle
| from dataclasses import dataclass | |
| from typing import Dict, List, Optional, Tuple | |
| import pandas as pd | |
| import numpy as np | |
@dataclass
class TableCell:
    """
    Represents a cell in a table with its value and position.

    Declared as a dataclass so that it can be constructed positionally as
    ``TableCell(value, bbox, column_name)``; without the decorator the class
    would accept no constructor arguments.

    Attributes:
        value: The text content of the cell
        bbox: Bounding box coordinates [x1, y1, x2, y2]
        column_name: Name of the column this cell belongs to
    """

    value: str
    bbox: List[int]
    column_name: str
@dataclass
class TableRow:
    """
    Represents a row in a table with its cells and boundaries.

    Declared as a (mutable) dataclass so that it can be built with keyword
    arguments and have its horizontal bounds widened in place as cells from
    further columns are merged into the row.

    Attributes:
        cells: Dictionary of column name to TableCell
        min_x: Minimum x coordinate of the row
        max_x: Maximum x coordinate of the row
        min_y: Minimum y coordinate of the row
        max_y: Maximum y coordinate of the row
    """

    # Quoted forward reference: keeps this class importable on its own.
    cells: Dict[str, "TableCell"]
    min_x: float
    max_x: float
    min_y: float
    max_y: float
class TableStructure:
    """
    Maintains the structure of a table as an ordered Python list of rows.

    Rows are seeded from the first column. Cells from every further column
    are merged into the row they vertically overlap, or placed in a new row
    when they lie entirely above an existing row or below the last one.

    NOTE(review): the forward-only scan in ``_process_column`` presumably
    expects each column's DataFrame to be ordered top to bottom - confirm
    against the caller.
    """

    def __init__(self, debug: bool = False, overlap_threshold: float = 10.0) -> None:
        """
        Initialize the table structure.

        Args:
            debug: Enable debug logging (stored only; not read by this class)
            overlap_threshold: Overlap percentage a cell must exceed to be
                merged into an existing row
        """
        self.rows: List[TableRow] = []
        self.debug = debug
        self.overlap_threshold = overlap_threshold

    def build_structure(self, dataframes: Dict[str, pd.DataFrame]) -> pd.DataFrame:
        """
        Build table structure from column-wise dataframes.

        Args:
            dataframes: Dictionary of column name to DataFrame containing
                'text' and 'boundingBox' columns

        Returns:
            DataFrame with one record per table row: one field per input
            column (None where the row has no cell for it) plus the row
            bounds 'row_min_x', 'row_max_x', 'row_min_y', 'row_max_y'.
            An empty DataFrame is returned when ``dataframes`` is empty.
        """
        # Start from a clean slate so that reusing an instance does not mix
        # rows left over from a previous call into the new result.
        self.rows = []
        if not dataframes:
            return pd.DataFrame()

        column_names = list(dataframes)
        first_col, *other_cols = column_names

        # The first column defines the initial set of rows.
        self._initialize_rows(first_col, dataframes[first_col])

        # Every further column is aligned against the rows built so far.
        for col_name in other_cols:
            self._process_column(col_name, dataframes[col_name])

        return self._to_dataframe(column_names)

    @staticmethod
    def _make_row(column_name: str, text: str, bbox: List[int]) -> "TableRow":
        """Create a single-cell row whose bounds equal the cell's bounding box."""
        return TableRow(
            cells={column_name: TableCell(text, bbox, column_name)},
            min_x=bbox[0],
            max_x=bbox[2],
            min_y=bbox[1],
            max_y=bbox[3],
        )

    def _initialize_rows(self, column_name: str, df: pd.DataFrame) -> None:
        """Initialize rows with the first column's data, one row per record."""
        for _, record in df.iterrows():
            self.rows.append(
                self._make_row(column_name, record['text'], record['boundingBox'])
            )

    def _process_column(self, column_name: str, df: pd.DataFrame) -> None:
        """
        Process an additional column and align its cells with existing rows.

        For each cell, rows are scanned forward from just after the previous
        match. The cell is merged into the first row it overlaps by more than
        ``overlap_threshold`` percent, inserted as a new row in front of the
        first row it lies entirely above, or appended when it starts at or
        below the bottom of the last row. A cell that satisfies none of these
        conditions is not added to the table.
        """
        search_idx = 0
        for _, record in df.iterrows():
            text = record['text']
            bbox = record['boundingBox']
            matched = False

            # Index-based scan: avoids copying the tail of the list per cell.
            # The list is only mutated immediately before a ``break``.
            for idx in range(search_idx, len(self.rows)):
                table_row = self.rows[idx]
                # Compare against a rectangle with the cell's own x-extent so
                # that only the vertical overlap influences the score.
                overlap = self._calculate_overlap(
                    bbox,
                    [bbox[0], table_row.min_y, bbox[2], table_row.max_y],
                )
                if overlap > self.overlap_threshold:
                    self._update_row(idx, column_name, text, bbox)
                    search_idx = idx + 1
                    matched = True
                    break
                if bbox[3] <= table_row.min_y:
                    # The cell ends above this row: it forms a row of its own.
                    self._insert_row(idx, column_name, text, bbox)
                    search_idx = idx + 1
                    matched = True
                    break

            if matched:
                continue

            # With no rows at all (the first column was empty) there is no
            # last row to compare against, so the cell simply starts the table.
            if not self.rows or bbox[1] >= self.rows[-1].max_y:
                self._append_row(column_name, text, bbox)

    def _calculate_overlap(self, rect1: List[int], rect2: List[int]) -> float:
        """
        Calculate percentage overlap between two rectangles.

        The intersection area is expressed as a percentage of the smaller
        rectangle's area. Returns 0.0 when the rectangles do not intersect
        or when the smaller rectangle has no area.
        """
        x_left = max(rect1[0], rect2[0])
        y_top = max(rect1[1], rect2[1])
        x_right = min(rect1[2], rect2[2])
        y_bottom = min(rect1[3], rect2[3])
        if x_right < x_left or y_bottom < y_top:
            return 0.0

        intersection = (x_right - x_left) * (y_bottom - y_top)
        min_area = min(
            (rect1[2] - rect1[0]) * (rect1[3] - rect1[1]),
            (rect2[2] - rect2[0]) * (rect2[3] - rect2[1]),
        )
        if min_area <= 0:
            return 0.0
        return intersection / min_area * 100

    def _update_row(self, idx: int, column_name: str, text: str, bbox: List[int]) -> None:
        """Update existing row with new cell data and widen its x-bounds."""
        target = self.rows[idx]
        target.cells[column_name] = TableCell(text, bbox, column_name)
        # Only the horizontal extent grows; the vertical bounds are left as
        # they were when the row was created.
        target.min_x = min(target.min_x, bbox[0])
        target.max_x = max(target.max_x, bbox[2])

    def _insert_row(self, idx: int, column_name: str, text: str, bbox: List[int]) -> None:
        """Insert new single-cell row at specified index."""
        self.rows.insert(idx, self._make_row(column_name, text, bbox))

    def _append_row(self, column_name: str, text: str, bbox: List[int]) -> None:
        """Append new single-cell row at the end."""
        self.rows.append(self._make_row(column_name, text, bbox))

    def _to_dataframe(self, columns: List[str]) -> pd.DataFrame:
        """
        Convert table structure to DataFrame.

        Args:
            columns: Column names to emit for every row, in output order

        Returns:
            DataFrame with one record per row: the cell value for each
            requested column (None when the row has no such cell) followed
            by the row's bounding coordinates.
        """
        data = []
        for row in self.rows:
            row_data = {}
            for col in columns:
                cell = row.cells.get(col)
                row_data[col] = cell.value if cell is not None else None
            row_data.update({
                'row_min_x': row.min_x,
                'row_max_x': row.max_x,
                'row_min_y': row.min_y,
                'row_max_y': row.max_y,
            })
            data.append(row_data)
        return pd.DataFrame(data)