"""Tests for dataset.stats()."""
from typing import Any, cast

import pytest
from pytest_mock import MockerFixture

from ..schema import UUID_COLUMN, Item, schema
from . import dataset_duckdb
from .dataset import StatsResult
from .dataset_test_utils import TestDataMaker

# Flat fixture rows shared by the stats tests below. Rows '1'-'3' set every
# field; row '4' sets only 'float' (to NaN), so the 'str', 'int' and 'bool'
# columns hold one value fewer than the 'float' column.
SIMPLE_ITEMS: list[Item] = [
    {UUID_COLUMN: '1', 'str': 'a', 'int': 1, 'bool': False, 'float': 3.0},
    {UUID_COLUMN: '2', 'str': 'b', 'int': 2, 'bool': True, 'float': 2.0},
    {UUID_COLUMN: '3', 'str': 'b', 'int': 2, 'bool': True, 'float': 1.0},
    {UUID_COLUMN: '4', 'float': float('nan')},
]
def test_simple_stats(make_test_data: TestDataMaker) -> None:
    """Check `stats()` on each top-level field of `SIMPLE_ITEMS`.

    One assertion per field: 'str', 'float', 'bool' and 'int'.
    """
    dataset = make_test_data(SIMPLE_ITEMS)

    # Item '4' has no 'str' value, so three values are expected: 'a', 'b', 'b'.
    result = dataset.stats(leaf_path='str')
    assert result == StatsResult(
        path=('str',), total_count=3, approx_count_distinct=2, avg_text_length=1)

    # Every item has a 'float'. The expectation includes the NaN of item '4' in
    # both counts, while min/max are those of the three finite values.
    result = dataset.stats(leaf_path='float')
    assert result == StatsResult(
        path=('float',), total_count=4, approx_count_distinct=4, min_val=1.0, max_val=3.0)

    # Item '4' has no 'bool' value; the other three hold False, True, True.
    result = dataset.stats(leaf_path='bool')
    assert result == StatsResult(path=('bool',), total_count=3, approx_count_distinct=2)

    # Item '4' has no 'int' value; the other three hold 1, 2, 2.
    result = dataset.stats(leaf_path='int')
    assert result == StatsResult(
        path=('int',), total_count=3, approx_count_distinct=2, min_val=1, max_val=2)
def test_nested_stats(make_test_data: TestDataMaker) -> None:
    """Check `stats()` on a top-level leaf and on a leaf nested inside two lists.

    The nested leaf is addressed with the wildcard path 'addresses.*.zips.*'.
    """
    nested_items: list[Item] = [
        {'name': 'Name1', 'addresses': [{'zips': [5, 8]}]},
        {'name': 'Name2', 'addresses': [{'zips': [3]}, {'zips': [11, 8]}]},
        {'name': 'Name2', 'addresses': []},  # No addresses.
        {'name': 'Name2', 'addresses': [{'zips': []}]},  # One address with no zips.
    ]
    nested_schema = schema({
        UUID_COLUMN: 'string',
        'name': 'string',
        'addresses': [{
            'zips': ['int32']
        }]
    })
    dataset = make_test_data(nested_items, schema=nested_schema)

    # Four names, two distinct ('Name1', 'Name2'), each five characters long.
    result = dataset.stats(leaf_path='name')
    assert result == StatsResult(
        path=('name',), total_count=4, approx_count_distinct=2, avg_text_length=5)

    # The zips across all addresses are 5, 8, 3, 11, 8: five values, four
    # distinct. The empty 'addresses' and 'zips' lists contribute nothing.
    result = dataset.stats(leaf_path='addresses.*.zips.*')
    assert result == StatsResult(
        path=('addresses', '*', 'zips', '*'),
        total_count=5,
        approx_count_distinct=4,
        min_val=3,
        max_val=11)
def test_stats_approximation(make_test_data: TestDataMaker, mocker: MockerFixture) -> None:
    """Check `stats()` when the dataset has more rows than the distinct-count sample size.

    `SAMPLE_SIZE_DISTINCT_COUNT` in `dataset_duckdb` is patched to 5, and the
    dataset holds ten times as many rows, each with a unique string value.
    """
    sample_size = 5
    mocker.patch(f'{dataset_duckdb.__name__}.SAMPLE_SIZE_DISTINCT_COUNT', sample_size)

    # 50 rows whose 'feature' values are the unique strings '0' .. '49'.
    items: list[Item] = [{'feature': str(i)} for i in range(sample_size * 10)]
    items_schema = schema({UUID_COLUMN: 'string', 'feature': 'string'})
    dataset = make_test_data(items, schema=items_schema)

    # The distinct count is expected to equal the full row count even though
    # the sample size is only 5.
    # NOTE(review): avg_text_length is expected to be 1 although '10' .. '49'
    # have two characters -- presumably the average is taken over the sampled
    # rows only; confirm against dataset_duckdb.
    result = dataset.stats(leaf_path='feature')
    assert result == StatsResult(
        path=('feature',), total_count=50, approx_count_distinct=50, avg_text_length=1)
def test_error_handling(make_test_data: TestDataMaker) -> None:
    """Check that `stats()` raises `ValueError` for a missing or unknown leaf path."""
    dataset = make_test_data(SIMPLE_ITEMS)

    # `cast(Any, None)` lets the deliberately invalid `None` past the type checker.
    with pytest.raises(ValueError, match='leaf_path must be provided'):
        dataset.stats(cast(Any, None))

    # `match` is a regular expression, so the parentheses around the path in the
    # expected message are escaped.
    with pytest.raises(ValueError, match='Leaf "\\(\'unknown\',\\)" not found in dataset'):
        dataset.stats(leaf_path='unknown')