nikhil_staging / src /data /dataset_stats_test.py
nsthorat's picture
Push
55dc3dd
raw
history blame
No virus
3.29 kB
"""Tests for dataset.stats()."""
from typing import Any, cast
import pytest
from pytest_mock import MockerFixture
from ..schema import UUID_COLUMN, Item, schema
from . import dataset_duckdb
from .dataset import StatsResult
from .dataset_test_utils import TestDataMaker
# Shared fixture rows for the flat (non-nested) stats tests.
# Rows '1'-'3' populate every field; 'b' / 2 / True each appear twice.
SIMPLE_ITEMS: list[Item] = [
    {UUID_COLUMN: '1', 'str': 'a', 'int': 1, 'bool': False, 'float': 3.0},
    {UUID_COLUMN: '2', 'str': 'b', 'int': 2, 'bool': True, 'float': 2.0},
    {UUID_COLUMN: '3', 'str': 'b', 'int': 2, 'bool': True, 'float': 1.0},
    # Row '4' sets only 'float' (to NaN); 'str', 'int' and 'bool' are absent.
    {UUID_COLUMN: '4', 'float': float('nan')},
]
def test_simple_stats(make_test_data: TestDataMaker) -> None:
    """Check `stats()` for each top-level leaf of `SIMPLE_ITEMS`.

    The leaves are queried in the order 'str', 'float', 'bool', 'int'.
    """
    dataset = make_test_data(SIMPLE_ITEMS)

    # 'str' is set on three of the four rows, with values 'a', 'b', 'b'.
    actual_str = dataset.stats(leaf_path='str')
    expected_str = StatsResult(
        path=('str',),
        total_count=3,
        approx_count_distinct=2,
        avg_text_length=1,
    )
    assert actual_str == expected_str

    # 'float' is set on all four rows and the fourth value is NaN. The expected
    # result counts it (total and distinct are both 4) with min/max 1.0 and 3.0.
    actual_float = dataset.stats(leaf_path='float')
    expected_float = StatsResult(
        path=('float',),
        total_count=4,
        approx_count_distinct=4,
        min_val=1.0,
        max_val=3.0,
    )
    assert actual_float == expected_float

    # 'bool' is set on three rows; no min/max is expected for it.
    actual_bool = dataset.stats(leaf_path='bool')
    expected_bool = StatsResult(path=('bool',), total_count=3, approx_count_distinct=2)
    assert actual_bool == expected_bool

    # 'int' is set on three rows, with values 1, 2, 2.
    actual_int = dataset.stats(leaf_path='int')
    expected_int = StatsResult(
        path=('int',),
        total_count=3,
        approx_count_distinct=2,
        min_val=1,
        max_val=2,
    )
    assert actual_int == expected_int
def test_nested_stats(make_test_data: TestDataMaker) -> None:
    """Check `stats()` on a top-level leaf and on a leaf inside repeated fields."""
    # NOTE(review): the rows carry no UUID_COLUMN although the schema declares
    # one -- presumably make_test_data fills it in; confirm against the helper.
    rows: list[Item] = [
        {'name': 'Name1', 'addresses': [{'zips': [5, 8]}]},
        {'name': 'Name2', 'addresses': [{'zips': [3]}, {'zips': [11, 8]}]},
        # No addresses.
        {'name': 'Name2', 'addresses': []},
        # No zips in the first address.
        {'name': 'Name2', 'addresses': [{'zips': []}]},
    ]
    row_schema = schema({
        UUID_COLUMN: 'string',
        'name': 'string',
        'addresses': [{'zips': ['int32']}],
    })
    dataset = make_test_data(rows, schema=row_schema)

    # Four names, two distinct ('Name1', 'Name2'), each five characters long.
    name_stats = dataset.stats(leaf_path='name')
    expected_name_stats = StatsResult(
        path=('name',),
        total_count=4,
        approx_count_distinct=2,
        avg_text_length=5,
    )
    assert name_stats == expected_name_stats

    # The zips across all addresses are 5, 8, 3, 11, 8: five values, four distinct.
    zip_stats = dataset.stats(leaf_path='addresses.*.zips.*')
    expected_zip_stats = StatsResult(
        path=('addresses', '*', 'zips', '*'),
        total_count=5,
        approx_count_distinct=4,
        min_val=3,
        max_val=11,
    )
    assert zip_stats == expected_zip_stats
def test_stats_approximation(make_test_data: TestDataMaker, mocker: MockerFixture) -> None:
    """Check the distinct count when the dataset has more rows than the patched sample size."""
    sample_size = 5
    # Override the module-level constant in dataset_duckdb for this test only.
    mocker.patch(f'{dataset_duckdb.__name__}.SAMPLE_SIZE_DISTINCT_COUNT', sample_size)

    # Ten times more rows than the sample size, each with a unique string value.
    num_items = sample_size * 10
    items: list[Item] = [{'feature': str(index)} for index in range(num_items)]
    items_schema = schema({UUID_COLUMN: 'string', 'feature': 'string'})
    dataset = make_test_data(items, schema=items_schema)

    feature_stats = dataset.stats(leaf_path='feature')
    # All 50 values are distinct, so the expected distinct count equals the total.
    expected_stats = StatsResult(
        path=('feature',),
        total_count=50,
        approx_count_distinct=50,
        avg_text_length=1,
    )
    assert feature_stats == expected_stats
def test_error_handling(make_test_data: TestDataMaker) -> None:
    """Check that `stats()` raises `ValueError` for a missing or unknown leaf path."""
    dataset = make_test_data(SIMPLE_ITEMS)

    # cast() hides the deliberately invalid None argument from the type checker.
    missing_leaf_path = cast(Any, None)
    with pytest.raises(ValueError, match='leaf_path must be provided'):
        dataset.stats(missing_leaf_path)

    # The pattern escapes the parentheses of the tuple-formatted path in the message.
    unknown_leaf_pattern = 'Leaf "\\(\'unknown\',\\)" not found in dataset'
    with pytest.raises(ValueError, match=unknown_leaf_pattern):
        dataset.stats(leaf_path='unknown')