Spaces:
Runtime error
Runtime error
"""Tests for the Hugging Face dataset source."""
import os | |
import pathlib | |
# mypy: disable-error-code="attr-defined" | |
from datasets import Dataset, Features, Sequence, Value | |
from ...schema import schema | |
from .huggingface_source import HF_SPLIT_COLUMN, HuggingFaceDataset | |
from .source import SourceSchema | |
def test_hf(tmp_path: pathlib.Path) -> None:
    """Check schema and items for a flat two-row dataset loaded from disk.

    A dataset with an integer column `x` and a string column `y` is saved
    under `tmp_path` and read back through `HuggingFaceDataset` with
    `load_from_disk=True`. The test expects the source schema to contain the
    two columns plus `HF_SPLIT_COLUMN`, and every processed item to carry the
    split value 'default'.

    Args:
        tmp_path: Per-test temporary directory used to hold the saved dataset.
    """
    dataset = Dataset.from_list([{'x': 1, 'y': 'ten'}, {'x': 2, 'y': 'twenty'}])

    dataset_name = os.path.join(tmp_path, 'hf-test-dataset')
    dataset.save_to_disk(dataset_name)

    source = HuggingFaceDataset(dataset_name=dataset_name, load_from_disk=True)
    # setup() is called once, before the schema or any items are read.
    source.setup()

    source_schema = source.source_schema()
    assert source_schema == SourceSchema(
        fields=schema({
            HF_SPLIT_COLUMN: 'string',
            'x': 'int64',
            'y': 'string'
        }).fields,
        num_items=2)

    items = list(source.process())
    assert items == [{
        HF_SPLIT_COLUMN: 'default',
        'x': 1,
        'y': 'ten'
    }, {
        HF_SPLIT_COLUMN: 'default',
        'x': 2,
        'y': 'twenty'
    }]
def test_hf_sequence(tmp_path: pathlib.Path) -> None:
    """Check schema and items for a dataset that uses `Sequence` features.

    The dataset has a scalar column, a `Sequence` of int64 (`seq`), and a
    `Sequence` over a dict feature (`seq_dict`). The test expects `seq` to be
    reported as a list of int64 and `seq_dict` as a dict whose members are
    themselves lists (`x: ['int64']`, `y: ['string']`), and the processed
    items to match the input rows with `HF_SPLIT_COLUMN` set to 'default'.

    Args:
        tmp_path: Per-test temporary directory used to hold the saved dataset.
    """
    dataset = Dataset.from_list(
        [{
            'scalar': 1,
            'seq': [1, 0],
            'seq_dict': {
                'x': [1, 2, 3],
                'y': ['four', 'five', 'six']
            }
        }, {
            'scalar': 2,
            'seq': [2, 0],
            'seq_dict': {
                'x': [10, 20, 30],
                'y': ['forty', 'fifty', 'sixty']
            }
        }],
        features=Features({
            'scalar': Value(dtype='int64'),
            'seq': Sequence(feature=Value(dtype='int64')),
            'seq_dict': Sequence(feature={
                'x': Value(dtype='int64'),
                'y': Value(dtype='string')
            })
        }))

    dataset_name = os.path.join(tmp_path, 'hf-test-dataset')
    dataset.save_to_disk(dataset_name)

    source = HuggingFaceDataset(dataset_name=dataset_name, load_from_disk=True)
    # setup() is called once, before the schema or any items are read.
    source.setup()

    source_schema = source.source_schema()
    assert source_schema == SourceSchema(
        fields=schema({
            HF_SPLIT_COLUMN: 'string',
            'scalar': 'int64',
            'seq': ['int64'],
            'seq_dict': {
                'x': ['int64'],
                'y': ['string'],
            },
        }).fields,
        num_items=2)

    items = list(source.process())
    assert items == [{
        HF_SPLIT_COLUMN: 'default',
        'scalar': 1,
        'seq': [1, 0],
        'seq_dict': {
            'x': [1, 2, 3],
            'y': ['four', 'five', 'six']
        }
    }, {
        HF_SPLIT_COLUMN: 'default',
        'scalar': 2,
        'seq': [2, 0],
        'seq_dict': {
            'x': [10, 20, 30],
            'y': ['forty', 'fifty', 'sixty']
        }
    }]
def test_hf_list(tmp_path: pathlib.Path) -> None:
    """Check schema and items for a dataset with a list-of-dicts feature.

    The `list` column is declared as a Python list containing one dict
    feature (`x`: int64, `y`: string). The test expects the source schema to
    report it as a list of `{x, y}` records, and the processed items to match
    the input rows with `HF_SPLIT_COLUMN` set to 'default'.

    Args:
        tmp_path: Per-test temporary directory used to hold the saved dataset.
    """
    dataset = Dataset.from_list(
        [{
            'scalar': 1,
            'list': [{
                'x': 1,
                'y': 'two'
            }]
        }, {
            'scalar': 2,
            'list': [{
                'x': 3,
                'y': 'four'
            }]
        }],
        features=Features({
            'scalar': Value(dtype='int64'),
            'list': [{
                'x': Value(dtype='int64'),
                'y': Value(dtype='string')
            }]
        }))

    dataset_name = os.path.join(tmp_path, 'hf-test-dataset')
    dataset.save_to_disk(dataset_name)

    source = HuggingFaceDataset(dataset_name=dataset_name, load_from_disk=True)
    # setup() is called once, before the schema or any items are read.
    source.setup()

    source_schema = source.source_schema()
    assert source_schema == SourceSchema(
        fields=schema({
            HF_SPLIT_COLUMN: 'string',
            'scalar': 'int64',
            'list': [{
                'x': 'int64',
                'y': 'string',
            }],
        }).fields,
        num_items=2)

    items = list(source.process())
    assert items == [{
        HF_SPLIT_COLUMN: 'default',
        'scalar': 1,
        'list': [{
            'x': 1,
            'y': 'two'
        }]
    }, {
        HF_SPLIT_COLUMN: 'default',
        'scalar': 2,
        'list': [{
            'x': 3,
            'y': 'four'
        }]
    }]