# VibecoderMcSwaggins's picture
# refactor(data): use standard datasets.load_dataset() with neuroimaging-go-brrrr
# 262b3cb unverified
"""Tests for the data loader."""
from __future__ import annotations
import os
from typing import TYPE_CHECKING
from unittest.mock import MagicMock, patch
import pytest
from stroke_deepisles_demo.data.adapter import LocalDataset
from stroke_deepisles_demo.data.loader import HuggingFaceDatasetWrapper, load_isles_dataset
if TYPE_CHECKING:
from pathlib import Path
# Large HuggingFace downloads do not fit in the limited disk space available
# in CI, so tests carrying this marker are skipped whenever the CI environment
# variable is set to "true".
SKIP_IN_CI = pytest.mark.skipif(
    os.getenv("CI") == "true",
    reason="Skips large HuggingFace downloads in CI (disk space)",
)
def test_load_from_local_returns_local_dataset(synthetic_isles_dir: Path) -> None:
    """Loading a local directory in local mode yields a non-empty ``LocalDataset``."""
    loaded = load_isles_dataset(source=synthetic_isles_dir, local_mode=True)

    assert isinstance(loaded, LocalDataset)
    assert len(loaded) > 0
def test_load_from_local_finds_all_cases(synthetic_isles_dir: Path) -> None:
    """Check that every case in the synthetic directory is discovered.

    The dataset loaded from the fixture directory must report exactly two
    cases, and ``list_case_ids()`` must return them in the order given below.
    """
    expected_case_ids = ["sub-stroke0001", "sub-stroke0002"]

    loaded = load_isles_dataset(source=synthetic_isles_dir)

    assert len(loaded) == 2
    assert loaded.list_case_ids() == expected_case_ids
def test_load_hf_calls_load_dataset() -> None:
    """Loading in HuggingFace mode calls ``datasets.load_dataset`` exactly once.

    ``datasets.load_dataset`` is patched, so the test runs against a stand-in
    dataset that reports length zero and an empty ``subject_id`` column.
    """

    def column_lookup(key: object) -> object:
        # Mock column access for index building: no subjects are present,
        # and any other key simply yields a fresh mock.
        return [] if key == "subject_id" else MagicMock()

    fake_dataset = MagicMock()
    fake_dataset.__len__.return_value = 0
    fake_dataset.__getitem__.side_effect = column_lookup

    with patch("datasets.load_dataset", return_value=fake_dataset) as load_mock:
        wrapper = load_isles_dataset(source="my/dataset", local_mode=False)

        assert isinstance(wrapper, HuggingFaceDatasetWrapper)
        load_mock.assert_called_once()
        # The dataset id must be forwarded as the first positional argument.
        positional_args = load_mock.call_args[0]
        assert positional_args[0] == "my/dataset"
@pytest.mark.integration
@SKIP_IN_CI
def test_load_from_huggingface_returns_hf_dataset() -> None:
    """Loading with default arguments yields a ``HuggingFaceDatasetWrapper``.

    Skipped in CI: the download is large (~GB) and disk space there is limited.
    To run it locally: pytest -m integration tests/data/test_loader.py
    """
    with load_isles_dataset() as hf_dataset:  # Default is HuggingFace mode
        # Nothing is mocked here, so the dataset length cannot be guaranteed;
        # only the wrapper type is checked. A network or auth problem may
        # still make this test fail.
        assert isinstance(hf_dataset, HuggingFaceDatasetWrapper)