Spaces:
Runtime error
Runtime error
"""Tests for dataset.select_groups().""" | |
import re | |
import pytest | |
from pytest_mock import MockerFixture | |
from ..schema import UUID_COLUMN, Item, field, schema | |
from . import dataset as dataset_module | |
from .dataset import BinaryOp | |
from .dataset_test_utils import TestDataMaker | |
def test_flat_data(make_test_data: TestDataMaker) -> None: | |
items: list[Item] = [ | |
{ | |
'name': 'Name1', | |
'age': 34, | |
'active': False | |
}, | |
{ | |
'name': 'Name2', | |
'age': 45, | |
'active': True | |
}, | |
{ | |
'age': 17, | |
'active': True | |
}, # Missing "name". | |
{ | |
'name': 'Name3', | |
'active': True | |
}, # Missing "age". | |
{ | |
'name': 'Name4', | |
'age': 55 | |
} # Missing "active". | |
] | |
dataset = make_test_data(items) | |
result = dataset.select_groups(leaf_path='name') | |
assert result.counts == [('Name1', 1), ('Name2', 1), (None, 1), ('Name3', 1), ('Name4', 1)] | |
result = dataset.select_groups(leaf_path='age', bins=[20, 50, 60]) | |
assert result.counts == [('1', 2), ('0', 1), (None, 1), ('2', 1)] | |
result = dataset.select_groups(leaf_path='active') | |
assert result.counts == [ | |
(True, 3), | |
(False, 1), | |
(None, 1), # Missing "active". | |
] | |
def test_result_counts(make_test_data: TestDataMaker) -> None: | |
items: list[Item] = [ | |
{ | |
'active': False | |
}, | |
{ | |
'active': True | |
}, | |
{ | |
'active': True | |
}, | |
{ | |
'active': True | |
}, | |
{} # Missing "active". | |
] | |
dataset = make_test_data(items, schema=schema({UUID_COLUMN: 'string', 'active': 'boolean'})) | |
result = dataset.select_groups(leaf_path='active') | |
assert result.counts == [(True, 3), (False, 1), (None, 1)] | |
def test_list_of_structs(make_test_data: TestDataMaker) -> None: | |
items: list[Item] = [{ | |
'list_of_structs': [{ | |
'name': 'a' | |
}, { | |
'name': 'b' | |
}] | |
}, { | |
'list_of_structs': [{ | |
'name': 'c' | |
}, { | |
'name': 'a' | |
}, { | |
'name': 'd' | |
}] | |
}, { | |
'list_of_structs': [{ | |
'name': 'd' | |
}] | |
}] | |
dataset = make_test_data(items) | |
result = dataset.select_groups(leaf_path='list_of_structs.*.name') | |
assert result.counts == [('a', 2), ('d', 2), ('b', 1), ('c', 1)] | |
def test_nested_lists(make_test_data: TestDataMaker) -> None: | |
items: list[Item] = [{ | |
'nested_list': [[{ | |
'name': 'a' | |
}], [{ | |
'name': 'b' | |
}]] | |
}, { | |
'nested_list': [[{ | |
'name': 'c' | |
}, { | |
'name': 'a' | |
}], [{ | |
'name': 'd' | |
}]] | |
}, { | |
'nested_list': [[{ | |
'name': 'd' | |
}]] | |
}] | |
dataset = make_test_data(items) | |
result = dataset.select_groups(leaf_path='nested_list.*.*.name') | |
assert result.counts == [('a', 2), ('d', 2), ('b', 1), ('c', 1)] | |
def test_nested_struct(make_test_data: TestDataMaker) -> None: | |
items: list[Item] = [ | |
{ | |
'nested_struct': { | |
'struct': { | |
'name': 'c' | |
} | |
} | |
}, | |
{ | |
'nested_struct': { | |
'struct': { | |
'name': 'b' | |
} | |
} | |
}, | |
{ | |
'nested_struct': { | |
'struct': { | |
'name': 'a' | |
} | |
} | |
}, | |
] | |
dataset = make_test_data(items) | |
result = dataset.select_groups(leaf_path='nested_struct.struct.name') | |
assert result.counts == [('c', 1), ('b', 1), ('a', 1)] | |
def test_named_bins(make_test_data: TestDataMaker) -> None: | |
items: list[Item] = [{ | |
'age': 34, | |
}, { | |
'age': 45, | |
}, { | |
'age': 17, | |
}, { | |
'age': 80 | |
}, { | |
'age': 55 | |
}] | |
dataset = make_test_data(items) | |
result = dataset.select_groups( | |
leaf_path='age', | |
bins=[ | |
('young', None, 20), | |
('adult', 20, 50), | |
('middle-aged', 50, 65), | |
('senior', 65, None), | |
]) | |
assert result.counts == [('adult', 2), ('young', 1), ('senior', 1), ('middle-aged', 1)] | |
def test_schema_with_bins(make_test_data: TestDataMaker) -> None: | |
items: list[Item] = [{ | |
'age': 34, | |
}, { | |
'age': 45, | |
}, { | |
'age': 17, | |
}, { | |
'age': 80 | |
}, { | |
'age': 55 | |
}] | |
data_schema = schema({ | |
UUID_COLUMN: 'string', | |
'age': field( | |
'int32', | |
bins=[ | |
('young', None, 20), | |
('adult', 20, 50), | |
('middle-aged', 50, 65), | |
('senior', 65, None), | |
]) | |
}) | |
dataset = make_test_data(items, data_schema) | |
result = dataset.select_groups(leaf_path='age') | |
assert result.counts == [('adult', 2), ('young', 1), ('senior', 1), ('middle-aged', 1)] | |
def test_filters(make_test_data: TestDataMaker) -> None: | |
items: list[Item] = [ | |
{ | |
'name': 'Name1', | |
'age': 34, | |
'active': False | |
}, | |
{ | |
'name': 'Name2', | |
'age': 45, | |
'active': True | |
}, | |
{ | |
'age': 17, | |
'active': True | |
}, # Missing "name". | |
{ | |
'name': 'Name3', | |
'active': True | |
}, # Missing "age". | |
{ | |
'name': 'Name4', | |
'age': 55 | |
} # Missing "active". | |
] | |
dataset = make_test_data(items) | |
# active = True. | |
result = dataset.select_groups(leaf_path='name', filters=[('active', BinaryOp.EQUALS, True)]) | |
assert result.counts == [('Name2', 1), (None, 1), ('Name3', 1)] | |
# age < 35. | |
result = dataset.select_groups(leaf_path='name', filters=[('age', BinaryOp.LESS, 35)]) | |
assert result.counts == [('Name1', 1), (None, 1)] | |
# age < 35 and active = True. | |
result = dataset.select_groups( | |
leaf_path='name', filters=[('age', BinaryOp.LESS, 35), ('active', BinaryOp.EQUALS, True)]) | |
assert result.counts == [(None, 1)] | |
def test_invalid_leaf(make_test_data: TestDataMaker) -> None: | |
items: list[Item] = [ | |
{ | |
'nested_struct': { | |
'struct': { | |
'name': 'c' | |
} | |
} | |
}, | |
{ | |
'nested_struct': { | |
'struct': { | |
'name': 'b' | |
} | |
} | |
}, | |
{ | |
'nested_struct': { | |
'struct': { | |
'name': 'a' | |
} | |
} | |
}, | |
] | |
dataset = make_test_data(items) | |
with pytest.raises( | |
ValueError, match=re.escape("Leaf \"('nested_struct',)\" not found in dataset")): | |
dataset.select_groups(leaf_path='nested_struct') | |
with pytest.raises( | |
ValueError, match=re.escape("Leaf \"('nested_struct', 'struct')\" not found in dataset")): | |
dataset.select_groups(leaf_path='nested_struct.struct') | |
with pytest.raises( | |
ValueError, | |
match=re.escape("Leaf \"('nested_struct', 'struct', 'wrong_name')\" not found in dataset")): | |
dataset.select_groups(leaf_path='nested_struct.struct.wrong_name') | |
def test_too_many_distinct(make_test_data: TestDataMaker, mocker: MockerFixture) -> None: | |
too_many_distinct = 5 | |
mocker.patch(f'{dataset_module.__name__}.TOO_MANY_DISTINCT', too_many_distinct) | |
items: list[Item] = [{'feature': str(i)} for i in range(too_many_distinct + 10)] | |
dataset = make_test_data(items) | |
res = dataset.select_groups('feature') | |
assert res.too_many_distinct is True | |
assert res.counts == [] | |
def test_auto_bins_for_float(make_test_data: TestDataMaker) -> None: | |
items: list[Item] = [{'feature': float(i)} for i in range(5)] | |
dataset = make_test_data(items) | |
res = dataset.select_groups('feature') | |
assert res.counts == [('0', 1), ('3', 1), ('7', 1), ('11', 1), ('14', 1)] | |
assert res.too_many_distinct is False | |
assert res.bins | |