nikhil_staging / src /data_loader_test.py
nsthorat's picture
Push
e4f9cbe
raw
history blame
No virus
2.06 kB
"""Tests for data_loader.py."""
import os
import pathlib
import uuid
from typing import Iterable
from pytest_mock import MockerFixture
from typing_extensions import override
from .data.dataset_duckdb import read_source_manifest
from .data.dataset_utils import parquet_filename
from .data.sources.source import Source, SourceSchema
from .data_loader import process_source
from .schema import PARQUET_FILENAME_PREFIX, UUID_COLUMN, Item, SourceManifest, schema
from .test_utils import fake_uuid, read_items
from .utils import DATASETS_DIR_NAME
class TestSource(Source):
    """In-memory source that yields two fixed items, used by the data loader test."""
    name = 'test_source'

    @override
    def setup(self) -> None:
        """Do nothing: this source has no state to initialize."""

    @override
    def source_schema(self) -> SourceSchema:
        """Return a schema with an int64 field `x`, a string field `y`, and two items."""
        item_fields = schema({'x': 'int64', 'y': 'string'}).fields
        return SourceSchema(fields=item_fields, num_items=2)

    @override
    def process(self) -> Iterable[Item]:
        """Return the two hard-coded items, in a fixed order."""
        fixed_items: Iterable[Item] = [
            {'x': 1, 'y': 'ten'},
            {'x': 2, 'y': 'twenty'},
        ]
        return fixed_items
def test_data_loader(tmp_path: pathlib.Path, mocker: MockerFixture) -> None:
    """Run `process_source` on a `TestSource` and check the manifest and items read back."""
    namespace = 'test_namespace'
    dataset_name = 'test_dataset'

    # Pin uuid4 to two known values so the generated UUID column can be compared exactly.
    uuid4_patch = mocker.patch.object(uuid, 'uuid4', autospec=True)
    uuid4_patch.side_effect = [fake_uuid(b'1'), fake_uuid(b'2')]

    test_source = TestSource()
    # Spy on the class attribute so the call made through the instance is counted.
    setup_spy = mocker.spy(TestSource, 'setup')

    dataset_dir, item_count = process_source(tmp_path, namespace, dataset_name, test_source)

    assert setup_spy.call_count == 1
    expected_dir = os.path.join(tmp_path, DATASETS_DIR_NAME, namespace, dataset_name)
    assert dataset_dir == expected_dir
    assert item_count == 2

    manifest = read_source_manifest(dataset_dir)
    expected_files = [parquet_filename(PARQUET_FILENAME_PREFIX, 0, 1)]
    # UUID_COLUMN is generated by the data loader.
    expected_schema = schema({UUID_COLUMN: 'string', 'x': 'int64', 'y': 'string'})
    assert manifest == SourceManifest(files=expected_files, data_schema=expected_schema)

    rows = read_items(dataset_dir, manifest.files, manifest.data_schema)
    expected_rows = [
        {UUID_COLUMN: fake_uuid(b'1').hex, 'x': 1, 'y': 'ten'},
        {UUID_COLUMN: fake_uuid(b'2').hex, 'x': 2, 'y': 'twenty'},
    ]
    assert rows == expected_rows