# Source: SPARKNET/tests/unit/test_table_chunker.py
# Commit d520909 ("Initial commit: SPARKNET framework"), author: MHamdan
"""
Unit Tests for Table-Aware Chunker (FG-002)
Tests the enhanced table extraction and structure preservation functionality.
"""
import pytest
import sys
from pathlib import Path
from typing import List
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from src.document.schemas.core import (
BoundingBox,
OCRRegion,
LayoutRegion,
LayoutType,
ChunkType,
)
from src.document.chunking.chunker import (
SemanticChunker,
ChunkerConfig,
)
# ==============================================================================
# Fixtures
# ==============================================================================
@pytest.fixture
def chunker():
    """Provide a SemanticChunker set up for table-aware chunking.

    Table structure preservation and header detection are both enabled,
    with a row threshold of 10.0 and a column threshold of 20.0.
    """
    table_options = {
        "preserve_table_structure": True,
        "table_row_threshold": 10.0,
        "table_col_threshold": 20.0,
        "detect_table_headers": True,
    }
    return SemanticChunker(ChunkerConfig(**table_options))
@pytest.fixture
def simple_table_regions() -> List[OCRRegion]:
    """Return nine OCR regions laid out as a 3x3 table on page 0.

    Layout being modelled:

        | Name  | Age | City     |
        | Alice | 25  | New York |
        | Bob   | 30  | London   |

    Regions are listed row by row, left to right within each row.
    """
    # One entry per cell: (text, confidence, x_min, y_min, x_max, y_max).
    cell_specs = [
        # Header row (y=100)
        ("Name", 0.95, 50, 100, 100, 120),
        ("Age", 0.95, 150, 100, 200, 120),
        ("City", 0.95, 250, 100, 300, 120),
        # Data row 1 (y=130)
        ("Alice", 0.92, 50, 130, 100, 150),
        ("25", 0.98, 150, 130, 200, 150),
        ("New York", 0.90, 250, 130, 320, 150),
        # Data row 2 (y=160)
        ("Bob", 0.94, 50, 160, 100, 180),
        ("30", 0.97, 150, 160, 200, 180),
        ("London", 0.93, 250, 160, 310, 180),
    ]
    return [
        OCRRegion(
            text=text,
            confidence=confidence,
            bbox=BoundingBox(x_min=left, y_min=top, x_max=right, y_max=bottom),
            page=0,
        )
        for text, confidence, left, top, right, bottom in cell_specs
    ]
@pytest.fixture
def numeric_table_regions() -> List[OCRRegion]:
    """Return twelve OCR regions for a table with a text header and numeric data.

    Layout being modelled:

        | Year | Revenue | Growth |
        | 2021 | $1.5M   | 15%    |
        | 2022 | $2.0M   | 33%    |
        | 2023 | $2.8M   | 40%    |

    Regions are listed row by row, left to right within each row.
    """
    # Horizontal extent (x_min, x_max) shared by every cell in a column.
    column_extents = [(50, 100), (150, 220), (270, 330)]
    # Every cell in this fixture is 20 units tall.
    cell_height = 20
    # (y_min of the row, [(text, confidence) for each column]).
    row_specs = [
        # Header row
        (100, [("Year", 0.95), ("Revenue", 0.95), ("Growth", 0.95)]),
        # Data rows
        (130, [("2021", 0.98), ("$1.5M", 0.92), ("15%", 0.94)]),
        (160, [("2022", 0.98), ("$2.0M", 0.93), ("33%", 0.95)]),
        (190, [("2023", 0.98), ("$2.8M", 0.91), ("40%", 0.96)]),
    ]

    regions = []
    for top, row_cells in row_specs:
        for (left, right), (text, confidence) in zip(column_extents, row_cells):
            cell_box = BoundingBox(
                x_min=left, y_min=top, x_max=right, y_max=top + cell_height
            )
            regions.append(
                OCRRegion(text=text, confidence=confidence, bbox=cell_box, page=0)
            )
    return regions
@pytest.fixture
def table_layout_region() -> LayoutRegion:
    """Return a TABLE layout region on page 0.

    Its bounding box, (40, 90) to (350, 220), encloses every cell of both
    table fixtures defined in this module.
    """
    enclosing_box = BoundingBox(x_min=40, y_min=90, x_max=350, y_max=220)
    return LayoutRegion(
        id="table_001",
        type=LayoutType.TABLE,
        confidence=0.95,
        bbox=enclosing_box,
        page=0,
    )
# ==============================================================================
# Table Structure Reconstruction Tests
# ==============================================================================
class TestTableStructureReconstruction:
    """Checks on ``_reconstruct_table_structure`` for the OCR-region fixtures."""

    def test_reconstruct_simple_table(self, chunker, simple_table_regions):
        """The 3x3 fixture yields 3 rows, 3 columns and a detected header."""
        structure = chunker._reconstruct_table_structure(simple_table_regions)

        assert structure["row_count"] == 3
        assert structure["col_count"] == 3
        assert structure["has_header"] == True
        assert structure["headers"] == ["Name", "Age", "City"]

    def test_detect_rows_correctly(self, chunker, simple_table_regions):
        """Cells come back grouped into three rows in top-to-bottom order."""
        structure = chunker._reconstruct_table_structure(simple_table_regions)
        grid = structure["cells"]

        assert len(grid) == 3  # 3 rows
        header_row, first_data_row, second_data_row = grid[0], grid[1], grid[2]
        # First row is header
        assert header_row == ["Name", "Age", "City"]
        # Data rows
        assert first_data_row == ["Alice", "25", "New York"]
        assert second_data_row == ["Bob", "30", "London"]

    def test_detect_columns_correctly(self, chunker, simple_table_regions):
        """Every reconstructed row carries exactly three columns."""
        structure = chunker._reconstruct_table_structure(simple_table_regions)

        for cells_in_row in structure["cells"]:
            assert len(cells_in_row) == 3

    def test_header_detection_numeric_data(self, chunker, numeric_table_regions):
        """A text first row above numeric data rows is reported as the header."""
        structure = chunker._reconstruct_table_structure(numeric_table_regions)

        assert structure["has_header"] == True
        assert structure["headers"] == ["Year", "Revenue", "Growth"]

    def test_empty_table(self, chunker):
        """No OCR regions gives zero counts, no cells and no header."""
        structure = chunker._reconstruct_table_structure([])

        assert structure["row_count"] == 0
        assert structure["col_count"] == 0
        assert structure["cells"] == []
        assert structure["has_header"] == False
# ==============================================================================
# Markdown Generation Tests
# ==============================================================================
class TestMarkdownGeneration:
    """Checks on ``_table_to_markdown`` output."""

    def test_generate_markdown_with_headers(self, chunker, simple_table_regions):
        """Detected headers become the markdown header row above the data rows."""
        structure = chunker._reconstruct_table_structure(simple_table_regions)
        rendered = chunker._table_to_markdown(
            structure["rows"],
            structure["headers"],
            structure["has_header"],
        )

        expected_lines = (
            "| Name | Age | City |",
            "| --- | --- | --- |",
            "| Alice | 25 | New York |",
            "| Bob | 30 | London |",
        )
        for line in expected_lines:
            assert line in rendered

    def test_generate_markdown_without_headers(self, chunker):
        """Without headers, generic Col1/Col2/... names head the table."""
        body_rows = [
            ["A", "B", "C"],
            ["1", "2", "3"],
        ]
        rendered = chunker._table_to_markdown(body_rows, [], False)

        for line in ("| Col1 | Col2 | Col3 |", "| A | B | C |", "| 1 | 2 | 3 |"):
            assert line in rendered

    def test_escape_pipe_characters(self, chunker):
        """Pipe characters inside a cell are backslash-escaped in the output."""
        body_rows = [
            ["Header1", "Header2"],
            ["Value|With|Pipes", "Normal"],
        ]
        header_names = ["Header1", "Header2"]
        rendered = chunker._table_to_markdown(body_rows, header_names, True)

        assert "Value\\|With\\|Pipes" in rendered

    def test_empty_table_returns_placeholder(self, chunker):
        """An empty table renders as the literal placeholder text."""
        rendered = chunker._table_to_markdown([], [], False)

        assert rendered == "[Empty Table]"
# ==============================================================================
# Table Chunk Creation Tests
# ==============================================================================
class TestTableChunkCreation:
    """Checks on ``_create_table_chunk`` end to end."""

    @staticmethod
    def _build_chunk(chunker, ocr_regions, layout_region, source_path=None):
        """Call ``_create_table_chunk`` with the document id used by these tests."""
        return chunker._create_table_chunk(
            ocr_regions,
            layout_region,
            document_id="test_doc",
            source_path=source_path,
        )

    def test_create_table_chunk_with_structure(
        self, chunker, simple_table_regions, table_layout_region
    ):
        """The chunk carries markdown text plus structured table metadata."""
        table_chunk = self._build_chunk(
            chunker,
            simple_table_regions,
            table_layout_region,
            source_path="/path/to/doc.pdf",
        )

        # Basic chunk properties
        assert table_chunk.chunk_type == ChunkType.TABLE
        assert table_chunk.document_id == "test_doc"
        assert table_chunk.page == 0

        # Text should be markdown
        assert "| Name | Age | City |" in table_chunk.text
        assert "| --- |" in table_chunk.text

        # Extra should contain structured data
        assert "table_structure" in table_chunk.extra
        structure = table_chunk.extra["table_structure"]
        assert structure["row_count"] == 3
        assert structure["col_count"] == 3
        assert structure["has_header"] == True
        assert structure["headers"] == ["Name", "Age", "City"]
        assert structure["cells"] is not None

    def test_create_table_chunk_with_cell_positions(
        self, chunker, simple_table_regions, table_layout_region
    ):
        """Each cell keeps its text, bbox and confidence for highlighting."""
        table_chunk = self._build_chunk(
            chunker, simple_table_regions, table_layout_region
        )
        positions = table_chunk.extra["table_structure"]["cell_positions"]

        # Should have positions for all cells
        assert len(positions) == 3  # 3 rows
        for positions_in_row in positions:
            assert len(positions_in_row) == 3  # 3 cols per row
            for cell_info in positions_in_row:
                for required_key in ("text", "bbox", "confidence"):
                    assert required_key in cell_info

    def test_create_table_chunk_searchable_text(
        self, chunker, simple_table_regions, table_layout_region
    ):
        """The searchable text labels the headers and mentions the cell values."""
        table_chunk = self._build_chunk(
            chunker, simple_table_regions, table_layout_region
        )
        search_text = table_chunk.extra["searchable_text"]

        # Headers should be labeled
        assert "Headers:" in search_text
        # Data should have header context. The second alternative is a lenient
        # fallback, so in effect only the raw cell value is required here.
        assert "Name: Alice" in search_text or "Alice" in search_text
        assert "Age: 25" in search_text or "25" in search_text

    def test_create_empty_table_chunk(self, chunker, table_layout_region):
        """With no OCR regions the chunk is the placeholder with zero confidence."""
        table_chunk = self._build_chunk(chunker, [], table_layout_region)

        assert table_chunk.text == "[Empty Table]"
        assert table_chunk.confidence == 0.0
# ==============================================================================
# Configuration Tests
# ==============================================================================
class TestChunkerConfiguration:
    """Checks that ChunkerConfig options change table handling."""

    def test_disable_table_structure_preservation(
        self, simple_table_regions, table_layout_region
    ):
        """With preservation off, the text has pipes but no markdown separator."""
        plain_chunker = SemanticChunker(ChunkerConfig(preserve_table_structure=False))
        table_chunk = plain_chunker._create_table_chunk(
            simple_table_regions,
            table_layout_region,
            document_id="test_doc",
            source_path=None,
        )

        # Should use simple pipe-separated format
        assert "|" in table_chunk.text
        assert "| --- |" not in table_chunk.text  # No markdown separator

    def test_disable_header_detection(
        self, simple_table_regions, table_layout_region
    ):
        """With header detection off, no header is reported in the structure."""
        headerless_config = ChunkerConfig(
            preserve_table_structure=True,
            detect_table_headers=False,
        )
        headerless_chunker = SemanticChunker(headerless_config)
        table_chunk = headerless_chunker._create_table_chunk(
            simple_table_regions,
            table_layout_region,
            document_id="test_doc",
            source_path=None,
        )

        # Should use generic headers
        structure = table_chunk.extra["table_structure"]
        assert structure["has_header"] == False
        assert structure["headers"] == []

    def test_custom_row_threshold(self):
        """Two regions 8 units apart in y split into two rows at threshold 5.0."""
        # With small threshold, rows might be split incorrectly
        strict_chunker = SemanticChunker(ChunkerConfig(table_row_threshold=5.0))

        # Same column, y-positions slightly apart (y_min 100 vs 108).
        upper_box = BoundingBox(x_min=50, y_min=100, x_max=100, y_max=120)
        lower_box = BoundingBox(x_min=50, y_min=108, x_max=100, y_max=128)
        stacked_regions = [
            OCRRegion(text="A", confidence=0.9, bbox=upper_box, page=0),
            OCRRegion(text="B", confidence=0.9, bbox=lower_box, page=0),
        ]

        structure = strict_chunker._reconstruct_table_structure(stacked_regions)

        # With threshold of 5, these should be separate rows (8 > 5)
        assert structure["row_count"] == 2
# ==============================================================================
# Numeric Detection Tests
# ==============================================================================
class TestNumericDetection:
    """Checks on ``_is_numeric``, the numeric-value check used for headers."""

    def test_detect_pure_number(self, chunker):
        """Plain digit strings count as numeric."""
        for sample in ("123", "0", "999999"):
            assert chunker._is_numeric(sample) == True

    def test_detect_currency(self, chunker):
        """Amounts prefixed with a currency symbol count as numeric."""
        for sample in ("$1,234.56", "€100", "£50.00"):
            assert chunker._is_numeric(sample) == True

    def test_detect_percentage(self, chunker):
        """Values with a trailing percent sign count as numeric."""
        for sample in ("15%", "100.5%"):
            assert chunker._is_numeric(sample) == True

    def test_detect_negative_numbers(self, chunker):
        """Signed and parenthesised negatives count as numeric."""
        for sample in ("-123", "(-50)"):
            assert chunker._is_numeric(sample) == True

    def test_non_numeric_text(self, chunker):
        """Ordinary words are not numeric."""
        for sample in ("Name", "Alice", "Revenue Growth"):
            assert chunker._is_numeric(sample) == False

    def test_mixed_content(self, chunker):
        """Strings mixing letters with numbers are not numeric."""
        for sample in ("Q1 2023", "Rev: $100"):
            assert chunker._is_numeric(sample) == False
# ==============================================================================
# Integration with Full Chunking Pipeline
# ==============================================================================
class TestFullChunkingPipeline:
    """Checks on table handling through the public ``create_chunks`` entry point."""

    def test_chunk_document_with_table(
        self, chunker, simple_table_regions, table_layout_region
    ):
        """A document holding only a table yields one TABLE chunk in markdown."""
        produced = chunker.create_chunks(
            ocr_regions=simple_table_regions,
            layout_regions=[table_layout_region],
            document_id="test_doc",
            source_path="/path/to/doc.pdf",
        )

        assert len(produced) == 1
        only_chunk = produced[0]
        assert only_chunk.chunk_type == ChunkType.TABLE
        assert "| Name | Age | City |" in only_chunk.text

    def test_chunk_document_mixed_content(self, chunker):
        """A paragraph followed by a table yields one chunk of each type."""

        def make_region(text, confidence, left, top, right, bottom):
            """Build a page-0 OCR region from its text, confidence and box edges."""
            return OCRRegion(
                text=text,
                confidence=confidence,
                bbox=BoundingBox(x_min=left, y_min=top, x_max=right, y_max=bottom),
                page=0,
            )

        # Create mixed content: text + table
        paragraph_regions = [
            make_region("Introduction", 0.95, 50, 50, 200, 70),
            make_region("This document contains data.", 0.92, 50, 80, 300, 100),
        ]
        grid_regions = [
            make_region("Col1", 0.95, 50, 150, 100, 170),
            make_region("Col2", 0.95, 150, 150, 200, 170),
            make_region("A", 0.95, 50, 180, 100, 200),
            make_region("B", 0.95, 150, 180, 200, 200),
        ]

        paragraph_layout = LayoutRegion(
            id="text_001",
            type=LayoutType.PARAGRAPH,
            confidence=0.9,
            bbox=BoundingBox(x_min=40, y_min=40, x_max=350, y_max=110),
            page=0,
        )
        table_layout = LayoutRegion(
            id="table_001",
            type=LayoutType.TABLE,
            confidence=0.95,
            bbox=BoundingBox(x_min=40, y_min=140, x_max=250, y_max=210),
            page=0,
        )

        produced = chunker.create_chunks(
            ocr_regions=paragraph_regions + grid_regions,
            layout_regions=[paragraph_layout, table_layout],
            document_id="test_doc",
            source_path=None,
        )

        # Should have 2 chunks: text and table
        assert len(produced) == 2
        seen_types = [chunk.chunk_type for chunk in produced]
        assert ChunkType.PARAGRAPH in seen_types
        assert ChunkType.TABLE in seen_types
# ==============================================================================
# Main Entry Point
# ==============================================================================
if __name__ == "__main__":
    # pytest.main() returns the run's exit code rather than exiting itself.
    # Hand it to the interpreter so that running this file as a script
    # reports failure to the shell / CI instead of always exiting with 0.
    raise SystemExit(pytest.main([__file__, "-v", "--tb=short"]))