turing-space / turing /tests /unit /test_features.py
papri-ka's picture
Deploy FastAPI ML service to Hugging Face Spaces
5fc6e5d
import pandas as pd
import pytest
from turing.features import (
FeatureEngineer,
FeaturePipelineConfig,
TextProcessor,
)
# --- Fixtures ---
@pytest.fixture(scope="module")
def full_config():
    """Provide a config with stopword removal and lemmatization switched on.

    The combo feature and augmentation stay disabled, and the config is
    tagged with the custom tag ``"test"``.
    """
    settings = {
        "use_stopwords": True,
        "use_lemmatization": True,
        "use_combo_feature": False,
        "max_features": 5000,
        "min_comment_length": 10,
        "max_comment_length": 500,
        "enable_augmentation": False,
        "custom_tags": "test",
    }
    return FeaturePipelineConfig(**settings)
@pytest.fixture(scope="module")
def basic_config():
    """Provide a config where every optional processing step is turned off.

    No custom tag is supplied, so the config falls back to whatever default
    ``FeaturePipelineConfig`` defines for it.
    """
    settings = {
        "use_stopwords": False,
        "use_lemmatization": False,
        "use_combo_feature": False,
        "max_features": 100,
        "min_comment_length": 5,
        "max_comment_length": 200,
        "enable_augmentation": False,
    }
    return FeaturePipelineConfig(**settings)
@pytest.fixture(scope="module")
def full_processor(full_config):
    """Build an English ``TextProcessor`` from the ``full_config`` fixture.

    That config enables both stopword removal and lemmatization.
    """
    processor = TextProcessor(config=full_config, language="english")
    return processor
@pytest.fixture(scope="module")
def basic_processor(basic_config):
    """Build an English ``TextProcessor`` from the ``basic_config`` fixture.

    With stopwords and lemmatization disabled, only the basic cleaning
    (lowercasing, punctuation removal) is expected to apply.
    """
    processor = TextProcessor(config=basic_config, language="english")
    return processor
# --- Tests ---
class TestFeaturePipelineConfig:
    """Checks the readable ID and stored attributes of the pipeline config."""

    def test_config_id_generation(self, full_config, basic_config):
        """Each config exposes the expected human-readable ``hash_id``."""
        expectations = (
            (full_config, "clean-k5000-test"),
            (basic_config, "clean-k100"),
        )
        for config, expected_id in expectations:
            assert config.hash_id == expected_id

    def test_config_attributes(self, full_config):
        """Values passed at construction are readable back as attributes."""
        boolean_flags = (
            full_config.use_stopwords,
            full_config.use_lemmatization,
        )
        # Identity check against True, not mere truthiness.
        assert all(flag is True for flag in boolean_flags)
        assert full_config.max_features == 5000
class TestTextProcessor:
    """Exercises ``TextProcessor.clean_text`` under the basic and full configs."""

    def test_clean_text_basic(self, basic_processor):
        """Text is lowercased and stripped of punctuation."""
        cleaned = basic_processor.clean_text("This is a TEST... with punctuation!!")
        assert cleaned == "this is a test with punctuation"

    def test_clean_text_stopwords(self, full_processor, basic_processor):
        """Stopwords are dropped only when the config enables it."""
        sentence = "this is a test with a stopword"

        # Stopword removal enabled: only the content words survive.
        assert full_processor.clean_text(sentence) == "test stopword"

        # Stopword removal disabled: the sentence comes back unchanged.
        assert basic_processor.clean_text(sentence) == sentence

    def test_clean_text_lemmatization(self, full_processor, basic_processor):
        """Words are lemmatized only when the config enables it."""
        sentence = "running tests while dogs are barking"

        # Lemmatization enabled; 'are' and 'while' are also removed as stopwords.
        assert full_processor.clean_text(sentence) == "running test dog barking"

        # Lemmatization disabled: the sentence comes back unchanged.
        assert basic_processor.clean_text(sentence) == sentence

    def test_clean_text_handles_none(self, basic_processor):
        """Missing values (``None`` and ``pd.NA``) clean to an empty string."""
        for missing in (None, pd.NA):
            assert basic_processor.clean_text(missing) == ""
class TestFeatureEngineer:
    """Exercises the metadata features produced by ``FeatureEngineer``."""

    def test_extract_numeric_features(self, basic_config):
        """``extract_features_for_check`` adds the metadata columns with correct counts."""
        engineer = FeatureEngineer(config=basic_config)
        comments = ["This is short.", "This one is a bit longer.", ""]
        frame = pd.DataFrame({"comment_sentence": comments})

        result = engineer.extract_features_for_check(frame)

        # Every metadata column must be present in the output frame.
        for column in ("f_length", "f_word_count", "f_starts_verb", "text_hash"):
            assert column in result.columns

        # Character and word counts per comment; the empty comment yields zeros.
        assert result["f_length"].tolist() == [14, 25, 0]
        assert result["f_word_count"].tolist() == [3, 6, 0]