Spaces:

MHamdan
/

SPARKNET

Sleeping

App Files Files Community

SPARKNET / tests /unit /test_table_chunker.py

MHamdan

Initial commit: SPARKNET framework

d520909 26 days ago

raw

history blame contribute delete

19.5 kB

	"""
	Unit Tests for Table-Aware Chunker (FG-002)

	Tests the enhanced table extraction and structure preservation functionality.
	"""

	import pytest
	import sys
	from pathlib import Path
	from typing import List

	# Add project root to path
	sys.path.insert(0, str(Path(__file__).parent.parent.parent))

	from src.document.schemas.core import (
	BoundingBox,
	OCRRegion,
	LayoutRegion,
	LayoutType,
	ChunkType,
	)
	from src.document.chunking.chunker import (
	SemanticChunker,
	ChunkerConfig,
	)


	# ==============================================================================
	# Fixtures
	# ==============================================================================

	@pytest.fixture
	def chunker():
	    """Provide a SemanticChunker set up for table-aware chunking.

	    The configuration is spelled out explicitly: table structure
	    preservation and header detection are switched on, the row threshold
	    is 10.0 and the column threshold is 20.0.
	    """
	    table_settings = {
	        "preserve_table_structure": True,
	        "table_row_threshold": 10.0,
	        "table_col_threshold": 20.0,
	        "detect_table_headers": True,
	    }
	    return SemanticChunker(ChunkerConfig(**table_settings))


	@pytest.fixture
	def simple_table_regions() -> List[OCRRegion]:
	    """Build nine OCR regions laid out as a 3x3 table on page 0.

	    Logical content, listed row by row:

	        | Name  | Age | City     |
	        | Alice | 25  | New York |
	        | Bob   | 30  | London   |
	    """
	    # Each entry: (y_min, y_max, [(text, confidence, x_min, x_max), ...]).
	    table_rows = [
	        # Header row (y=100)
	        (100, 120, [
	            ("Name", 0.95, 50, 100),
	            ("Age", 0.95, 150, 200),
	            ("City", 0.95, 250, 300),
	        ]),
	        # Data row 1 (y=130)
	        (130, 150, [
	            ("Alice", 0.92, 50, 100),
	            ("25", 0.98, 150, 200),
	            ("New York", 0.90, 250, 320),
	        ]),
	        # Data row 2 (y=160)
	        (160, 180, [
	            ("Bob", 0.94, 50, 100),
	            ("30", 0.97, 150, 200),
	            ("London", 0.93, 250, 310),
	        ]),
	    ]

	    regions = []
	    for top, bottom, row_cells in table_rows:
	        for cell_text, cell_confidence, left, right in row_cells:
	            cell_box = BoundingBox(
	                x_min=left, y_min=top, x_max=right, y_max=bottom
	            )
	            regions.append(
	                OCRRegion(
	                    text=cell_text,
	                    confidence=cell_confidence,
	                    bbox=cell_box,
	                    page=0,
	                )
	            )
	    return regions


	@pytest.fixture
	def numeric_table_regions() -> List[OCRRegion]:
	    """Build twelve OCR regions for a table with a text header and numeric-looking data.

	    Logical content, listed row by row:

	        | Year | Revenue | Growth |
	        | 2021 | $1.5M   | 15%    |
	        | 2022 | $2.0M   | 33%    |
	        | 2023 | $2.8M   | 40%    |
	    """
	    # Every row shares the same three horizontal spans (x_min, x_max).
	    column_spans = [(50, 100), (150, 220), (270, 330)]
	    # Every box is 20 units tall: y_max = y_min + 20.
	    row_height = 20

	    # Each entry: (y_min, [(text, confidence) for each column]).
	    table_rows = [
	        # Header row
	        (100, [("Year", 0.95), ("Revenue", 0.95), ("Growth", 0.95)]),
	        # Data rows
	        (130, [("2021", 0.98), ("$1.5M", 0.92), ("15%", 0.94)]),
	        (160, [("2022", 0.98), ("$2.0M", 0.93), ("33%", 0.95)]),
	        (190, [("2023", 0.98), ("$2.8M", 0.91), ("40%", 0.96)]),
	    ]

	    regions = []
	    for top, row_cells in table_rows:
	        for (left, right), (cell_text, cell_confidence) in zip(column_spans, row_cells):
	            cell_box = BoundingBox(
	                x_min=left, y_min=top, x_max=right, y_max=top + row_height
	            )
	            regions.append(
	                OCRRegion(
	                    text=cell_text,
	                    confidence=cell_confidence,
	                    bbox=cell_box,
	                    page=0,
	                )
	            )
	    return regions


	@pytest.fixture
	def table_layout_region() -> LayoutRegion:
	    """Build a TABLE-typed layout region on page 0.

	    Its bounding box, (40, 90) to (350, 220), encloses every OCR box
	    produced by the simple and numeric table fixtures in this module.
	    """
	    enclosing_box = BoundingBox(x_min=40, y_min=90, x_max=350, y_max=220)
	    region = LayoutRegion(
	        id="table_001",
	        type=LayoutType.TABLE,
	        confidence=0.95,
	        bbox=enclosing_box,
	        page=0,
	    )
	    return region


	# ==============================================================================
	# Table Structure Reconstruction Tests
	# ==============================================================================

	class TestTableStructureReconstruction:
	    """Tests for ``SemanticChunker._reconstruct_table_structure``.

	    Each test passes a list of OCR regions to the method and inspects the
	    returned mapping. Keys read here: ``row_count``, ``col_count``,
	    ``cells``, ``has_header`` and ``headers``.
	    """

	    def test_reconstruct_simple_table(self, chunker, simple_table_regions):
	        """A 3x3 grid is reported as 3 rows, 3 columns and a detected header."""
	        result = chunker._reconstruct_table_structure(simple_table_regions)

	        assert result["row_count"] == 3
	        assert result["col_count"] == 3
	        # Truthiness check instead of ``== True`` (PEP 8).
	        assert result["has_header"]
	        assert result["headers"] == ["Name", "Age", "City"]

	    def test_detect_rows_correctly(self, chunker, simple_table_regions):
	        """Regions sharing a y-position end up in the same row, in reading order."""
	        result = chunker._reconstruct_table_structure(simple_table_regions)

	        cells = result["cells"]
	        assert len(cells) == 3  # 3 rows

	        # First row is the header
	        assert cells[0] == ["Name", "Age", "City"]

	        # Data rows
	        assert cells[1] == ["Alice", "25", "New York"]
	        assert cells[2] == ["Bob", "30", "London"]

	    def test_detect_columns_correctly(self, chunker, simple_table_regions):
	        """Every reconstructed row has exactly three columns."""
	        result = chunker._reconstruct_table_structure(simple_table_regions)

	        for row in result["cells"]:
	            assert len(row) == 3

	    def test_header_detection_numeric_data(self, chunker, numeric_table_regions):
	        """A text row above numeric-looking data rows is reported as the header."""
	        result = chunker._reconstruct_table_structure(numeric_table_regions)

	        assert result["has_header"]
	        assert result["headers"] == ["Year", "Revenue", "Growth"]

	    def test_empty_table(self, chunker):
	        """An empty region list yields zero counts, no cells and no header."""
	        result = chunker._reconstruct_table_structure([])

	        assert result["row_count"] == 0
	        assert result["col_count"] == 0
	        assert result["cells"] == []
	        assert not result["has_header"]


	# ==============================================================================
	# Markdown Generation Tests
	# ==============================================================================

	class TestMarkdownGeneration:
	    """Tests for ``SemanticChunker._table_to_markdown``.

	    The method is called as ``_table_to_markdown(rows, headers, has_header)``
	    and its string result is checked for the expected markdown table lines.
	    """

	    def test_generate_markdown_with_headers(self, chunker, simple_table_regions):
	        """Detected headers become the markdown header row, followed by a separator."""
	        table_data = chunker._reconstruct_table_structure(simple_table_regions)

	        # NOTE(review): other tests in this module read the grid from the
	        # "cells" key; confirm the structure dict also exposes "rows".
	        markdown = chunker._table_to_markdown(
	            table_data["rows"],
	            table_data["headers"],
	            table_data["has_header"],
	        )

	        assert "| Name | Age | City |" in markdown
	        assert "| --- | --- | --- |" in markdown
	        assert "| Alice | 25 | New York |" in markdown
	        assert "| Bob | 30 | London |" in markdown

	    def test_generate_markdown_without_headers(self, chunker):
	        """Without headers, generic Col1, Col2, ... names are emitted and all rows are data."""
	        rows = [
	            ["A", "B", "C"],
	            ["1", "2", "3"],
	        ]

	        markdown = chunker._table_to_markdown(rows, [], False)

	        assert "| Col1 | Col2 | Col3 |" in markdown
	        assert "| A | B | C |" in markdown
	        assert "| 1 | 2 | 3 |" in markdown

	    def test_escape_pipe_characters(self, chunker):
	        """Pipe characters inside a cell are backslash-escaped in the output."""
	        rows = [
	            ["Header1", "Header2"],
	            ["Value|With|Pipes", "Normal"],
	        ]

	        markdown = chunker._table_to_markdown(rows, ["Header1", "Header2"], True)

	        # Raw string: each literal "|" in the cell must appear as "\|".
	        assert r"Value\|With\|Pipes" in markdown

	    def test_empty_table_returns_placeholder(self, chunker):
	        """No rows at all produces the fixed placeholder text."""
	        markdown = chunker._table_to_markdown([], [], False)
	        assert markdown == "[Empty Table]"


	# ==============================================================================
	# Table Chunk Creation Tests
	# ==============================================================================

	class TestTableChunkCreation:
	    """Tests for ``SemanticChunker._create_table_chunk``.

	    The method is called with OCR regions, a table layout region, a
	    ``document_id`` and a ``source_path``; the returned chunk's ``text``,
	    ``confidence`` and ``extra`` payload are inspected.
	    """

	    def test_create_table_chunk_with_structure(
	        self, chunker, simple_table_regions, table_layout_region
	    ):
	        """The chunk carries markdown text plus a structured table description."""
	        chunk = chunker._create_table_chunk(
	            simple_table_regions,
	            table_layout_region,
	            document_id="test_doc",
	            source_path="/path/to/doc.pdf",
	        )

	        # Basic chunk properties
	        assert chunk.chunk_type == ChunkType.TABLE
	        assert chunk.document_id == "test_doc"
	        assert chunk.page == 0

	        # Text should be markdown
	        assert "| Name | Age | City |" in chunk.text
	        assert "| --- |" in chunk.text

	        # Extra should contain structured data
	        assert "table_structure" in chunk.extra
	        table_struct = chunk.extra["table_structure"]

	        assert table_struct["row_count"] == 3
	        assert table_struct["col_count"] == 3
	        # Truthiness check instead of ``== True`` (PEP 8).
	        assert table_struct["has_header"]
	        assert table_struct["headers"] == ["Name", "Age", "City"]
	        assert table_struct["cells"] is not None

	    def test_create_table_chunk_with_cell_positions(
	        self, chunker, simple_table_regions, table_layout_region
	    ):
	        """Per-cell position records are kept for every cell of the 3x3 grid."""
	        chunk = chunker._create_table_chunk(
	            simple_table_regions,
	            table_layout_region,
	            document_id="test_doc",
	            source_path=None,
	        )

	        cell_positions = chunk.extra["table_structure"]["cell_positions"]

	        # Should have positions for all cells
	        assert len(cell_positions) == 3  # 3 rows
	        for row_positions in cell_positions:
	            assert len(row_positions) == 3  # 3 cols per row
	            for cell in row_positions:
	                assert "text" in cell
	                assert "bbox" in cell
	                assert "confidence" in cell

	    def test_create_table_chunk_searchable_text(
	        self, chunker, simple_table_regions, table_layout_region
	    ):
	        """The searchable text labels the headers and mentions the cell values."""
	        chunk = chunker._create_table_chunk(
	            simple_table_regions,
	            table_layout_region,
	            document_id="test_doc",
	            source_path=None,
	        )

	        searchable = chunk.extra["searchable_text"]

	        # Headers should be labeled
	        assert "Headers:" in searchable

	        # NOTE(review): because of the ``or``, these only require the bare
	        # value to be present; the "Header: value" form is not enforced.
	        assert "Name: Alice" in searchable or "Alice" in searchable
	        assert "Age: 25" in searchable or "25" in searchable

	    def test_create_empty_table_chunk(self, chunker, table_layout_region):
	        """With no OCR regions the chunk is the placeholder with zero confidence."""
	        chunk = chunker._create_table_chunk(
	            [],
	            table_layout_region,
	            document_id="test_doc",
	            source_path=None,
	        )

	        assert chunk.text == "[Empty Table]"
	        assert chunk.confidence == 0.0


	# ==============================================================================
	# Configuration Tests
	# ==============================================================================

	class TestChunkerConfiguration:
	    """Tests that ``ChunkerConfig`` options change table chunking behaviour.

	    Each test builds its own ``SemanticChunker`` instead of using the shared
	    ``chunker`` fixture, so the option under test is set explicitly.
	    """

	    def test_disable_table_structure_preservation(self, simple_table_regions, table_layout_region):
	        """With preservation off, the text has pipes but no markdown separator row."""
	        config = ChunkerConfig(preserve_table_structure=False)
	        chunker = SemanticChunker(config)

	        chunk = chunker._create_table_chunk(
	            simple_table_regions,
	            table_layout_region,
	            document_id="test_doc",
	            source_path=None,
	        )

	        # Should use simple pipe-separated format
	        assert "|" in chunk.text
	        assert "| --- |" not in chunk.text  # No markdown separator

	    def test_disable_header_detection(self, simple_table_regions, table_layout_region):
	        """With header detection off, no header is reported in the structure."""
	        config = ChunkerConfig(
	            preserve_table_structure=True,
	            detect_table_headers=False,
	        )
	        chunker = SemanticChunker(config)

	        chunk = chunker._create_table_chunk(
	            simple_table_regions,
	            table_layout_region,
	            document_id="test_doc",
	            source_path=None,
	        )

	        # No detected header row, and no header names
	        table_struct = chunk.extra["table_structure"]
	        # Falsiness check instead of ``== False`` (PEP 8).
	        assert not table_struct["has_header"]
	        assert table_struct["headers"] == []

	    def test_custom_row_threshold(self):
	        """A row threshold of 5.0 keeps two boxes 8 units apart in separate rows."""
	        config = ChunkerConfig(table_row_threshold=5.0)
	        chunker = SemanticChunker(config)

	        # Same column, y_min 100 vs 108: vertical offset of 8.
	        regions = [
	            OCRRegion(text="A", confidence=0.9, bbox=BoundingBox(x_min=50, y_min=100, x_max=100, y_max=120), page=0),
	            OCRRegion(text="B", confidence=0.9, bbox=BoundingBox(x_min=50, y_min=108, x_max=100, y_max=128), page=0),
	        ]

	        result = chunker._reconstruct_table_structure(regions)

	        # With threshold of 5, these should be separate rows (8 > 5)
	        assert result["row_count"] == 2


	# ==============================================================================
	# Numeric Detection Tests
	# ==============================================================================

	class TestNumericDetection:
	    """Tests for ``SemanticChunker._is_numeric``.

	    Each test runs a handful of sample strings through the method and
	    asserts on the truthiness of the result; the failure message names the
	    offending sample.
	    """

	    def test_detect_pure_number(self, chunker):
	        """Plain digit strings count as numeric."""
	        for value in ("123", "0", "999999"):
	            assert chunker._is_numeric(value), f"{value!r} should be numeric"

	    def test_detect_currency(self, chunker):
	        """Values with a currency symbol count as numeric."""
	        for value in ("$1,234.56", "€100", "£50.00"):
	            assert chunker._is_numeric(value), f"{value!r} should be numeric"

	    def test_detect_percentage(self, chunker):
	        """Percentages count as numeric."""
	        for value in ("15%", "100.5%"):
	            assert chunker._is_numeric(value), f"{value!r} should be numeric"

	    def test_detect_negative_numbers(self, chunker):
	        """Negative numbers, bare or parenthesised, count as numeric."""
	        for value in ("-123", "(-50)"):
	            assert chunker._is_numeric(value), f"{value!r} should be numeric"

	    def test_non_numeric_text(self, chunker):
	        """Plain words are not numeric."""
	        for value in ("Name", "Alice", "Revenue Growth"):
	            assert not chunker._is_numeric(value), f"{value!r} should not be numeric"

	    def test_mixed_content(self, chunker):
	        """Strings mixing letters and numbers are not numeric."""
	        for value in ("Q1 2023", "Rev: $100"):
	            assert not chunker._is_numeric(value), f"{value!r} should not be numeric"


	# ==============================================================================
	# Integration with Full Chunking Pipeline
	# ==============================================================================

	class TestFullChunkingPipeline:
	    """Tests for table handling through the public ``create_chunks`` entry point.

	    ``create_chunks`` is called with OCR regions, layout regions, a
	    ``document_id`` and a ``source_path``; the returned chunk list is
	    inspected.
	    """

	    def test_chunk_document_with_table(
	        self, chunker, simple_table_regions, table_layout_region
	    ):
	        """A document holding only a table yields one TABLE chunk in markdown."""
	        layout_regions = [table_layout_region]

	        chunks = chunker.create_chunks(
	            ocr_regions=simple_table_regions,
	            layout_regions=layout_regions,
	            document_id="test_doc",
	            source_path="/path/to/doc.pdf",
	        )

	        assert len(chunks) == 1
	        assert chunks[0].chunk_type == ChunkType.TABLE
	        assert "| Name | Age | City |" in chunks[0].text

	    def test_chunk_document_mixed_content(self, chunker):
	        """A paragraph followed by a table yields one chunk of each type."""
	        # Two text lines inside the paragraph layout region (y 40-110).
	        text_regions = [
	            OCRRegion(text="Introduction", confidence=0.95, bbox=BoundingBox(x_min=50, y_min=50, x_max=200, y_max=70), page=0),
	            OCRRegion(text="This document contains data.", confidence=0.92, bbox=BoundingBox(x_min=50, y_min=80, x_max=300, y_max=100), page=0),
	        ]

	        # A 2x2 grid inside the table layout region (y 140-210).
	        table_regions = [
	            OCRRegion(text="Col1", confidence=0.95, bbox=BoundingBox(x_min=50, y_min=150, x_max=100, y_max=170), page=0),
	            OCRRegion(text="Col2", confidence=0.95, bbox=BoundingBox(x_min=150, y_min=150, x_max=200, y_max=170), page=0),
	            OCRRegion(text="A", confidence=0.95, bbox=BoundingBox(x_min=50, y_min=180, x_max=100, y_max=200), page=0),
	            OCRRegion(text="B", confidence=0.95, bbox=BoundingBox(x_min=150, y_min=180, x_max=200, y_max=200), page=0),
	        ]

	        all_regions = text_regions + table_regions

	        layout_regions = [
	            LayoutRegion(
	                id="text_001",
	                type=LayoutType.PARAGRAPH,
	                confidence=0.9,
	                bbox=BoundingBox(x_min=40, y_min=40, x_max=350, y_max=110),
	                page=0,
	            ),
	            LayoutRegion(
	                id="table_001",
	                type=LayoutType.TABLE,
	                confidence=0.95,
	                bbox=BoundingBox(x_min=40, y_min=140, x_max=250, y_max=210),
	                page=0,
	            ),
	        ]

	        chunks = chunker.create_chunks(
	            ocr_regions=all_regions,
	            layout_regions=layout_regions,
	            document_id="test_doc",
	            source_path=None,
	        )

	        # Should have 2 chunks: text and table
	        assert len(chunks) == 2

	        chunk_types = [c.chunk_type for c in chunks]
	        assert ChunkType.PARAGRAPH in chunk_types
	        assert ChunkType.TABLE in chunk_types


	# ==============================================================================
	# Main Entry Point
	# ==============================================================================

	if __name__ == "__main__":
	    # Run this file's tests when executed as a script. pytest.main returns
	    # an exit code; raise it so the process status reflects the test result
	    # instead of always exiting 0.
	    raise SystemExit(pytest.main([__file__, "-v", "--tb=short"]))