Spaces:

dima806
/

developer_salary_prediction

Running

App Files Community

developer_salary_prediction / tests / test_train.py

dima806

Upload 39 files

eeeaee6 verified 2 months ago

raw

history blame contribute delete

12.6 kB

	"""Tests for src/train.py - Training pipeline helper functions."""

	import numpy as np
	import pandas as pd

	from src.train import (
	apply_cardinality_reduction,
	compute_currency_rates,
	drop_other_rows,
	extract_valid_categories,
	filter_salaries,
	)


	def _make_salary_df(countries=None, salaries=None, n=100) -> pd.DataFrame:
	"""Create a minimal DataFrame resembling the survey data."""
	if salaries is not None:
	n = len(salaries)
	if countries is not None:
	n = len(countries)
	if countries is None:
	countries = ["United States of America"] * n
	if salaries is None:
	rng = np.random.default_rng(42)
	salaries = rng.integers(30000, 200000, size=n).astype(float)
	rng = np.random.default_rng(42)
	return pd.DataFrame(
	{
	"Country": countries,
	"YearsCode": rng.integers(0, 30, size=n).astype(float),
	"WorkExp": rng.integers(0, 20, size=n).astype(float),
	"EdLevel": ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"] * n,
	"DevType": ["Developer, full-stack"] * n,
	"Industry": ["Software Development"] * n,
	"Age": ["25-34 years old"] * n,
	"ICorPM": ["Individual contributor"] * n,
	"OrgSize": ["20 to 99 employees"] * n,
	"Employment": ["Employed"] * n,
	"Currency": ["USD United States Dollar"] * n,
	"CompTotal": salaries,
	"ConvertedCompYearly": salaries,
	}
	)


	class TestFilterSalaries:
	    """Checks for filter_salaries() driven by small synthetic frames."""

	    @staticmethod
	    def _config(min_salary, lower, upper):
	        """Build the nested config mapping handed to filter_salaries()."""
	        return {
	            "data": {
	                "min_salary": min_salary,
	                "lower_percentile": lower,
	                "upper_percentile": upper,
	            }
	        }

	    def test_removes_below_min_salary(self):
	        """Every surviving salary is above the configured minimum of 1000."""
	        low, mid, high = [500.0] * 5, [2000.0] * 20, [50000.0] * 20
	        frame = _make_salary_df(salaries=low + mid + high)
	        filtered = filter_salaries(frame, self._config(1000, 0, 100))
	        surviving = filtered["ConvertedCompYearly"]
	        assert (surviving > 1000).all()
	        assert len(filtered) < len(frame)

	    def test_removes_outliers_by_percentile(self):
	        """A 2-98 percentile window shrinks a frame holding one extreme salary."""
	        salaries = [10000.0] * 100
	        salaries[50] = 500000.0  # the single extreme value, mid-sequence
	        frame = _make_salary_df(salaries=salaries)
	        filtered = filter_salaries(frame, self._config(1000, 2, 98))
	        assert len(filtered) < len(frame)

	    def test_drops_missing_target(self):
	        """No NaN target value is left after filtering."""
	        frame = _make_salary_df(salaries=[50000.0, np.nan, 60000.0])
	        filtered = filter_salaries(frame, self._config(1000, 0, 100))
	        assert filtered["ConvertedCompYearly"].notna().all()

	    def test_returns_dataframe(self):
	        """The return value is a pandas DataFrame."""
	        frame = _make_salary_df()
	        filtered = filter_salaries(frame, self._config(1000, 2, 98))
	        assert isinstance(filtered, pd.DataFrame)


	class TestApplyCardinalityReduction:
	    """Tests for apply_cardinality_reduction()."""

	    def test_normalizes_unicode_apostrophes(self):
	        """Unicode right single quotation marks are replaced."""
	        df = _make_salary_df(n=100)
	        df["EdLevel"] = "Master\u2019s degree"
	        result = apply_cardinality_reduction(df)
	        # Unicode apostrophe should be normalized to ASCII
	        assert "\u2019" not in result["EdLevel"].iloc[0]

	    def test_does_not_modify_original(self):
	        """The input DataFrame is not modified."""
	        df = _make_salary_df(n=5)
	        # Snapshot the whole frame: a single-cell check would miss in-place
	        # edits to any other row or column.
	        snapshot = df.copy(deep=True)
	        apply_cardinality_reduction(df)
	        assert df["Country"].iloc[0] == snapshot["Country"].iloc[0]
	        pd.testing.assert_frame_equal(df, snapshot)

	    def test_rare_categories_become_other(self):
	        """Categories below min_frequency are grouped into 'Other'."""
	        countries = ["United States of America"] * 100 + ["Narnia"] * 2
	        df = _make_salary_df(countries=countries, n=102)
	        result = apply_cardinality_reduction(df)
	        assert "Narnia" not in result["Country"].values
	        assert "Other" in result["Country"].values


	class TestDropOtherRows:
	    """Checks for drop_other_rows() using small hand-built frames."""

	    @staticmethod
	    def _config(other_category, drop_other_from):
	        """Build the nested cardinality config handed to drop_other_rows()."""
	        cardinality = {
	            "other_category": other_category,
	            "drop_other_from": drop_other_from,
	        }
	        return {"features": {"cardinality": cardinality}}

	    @staticmethod
	    def _frame(countries, dev_types):
	        """Build a frame whose only varying columns are Country and DevType."""
	        rows = len(countries)
	        return pd.DataFrame(
	            {
	                "Country": countries,
	                "DevType": dev_types,
	                "EdLevel": ["BS"] * rows,
	                "Industry": ["SW"] * rows,
	                "Age": ["25-34"] * rows,
	                "ICorPM": ["IC"] * rows,
	                "OrgSize": ["Small"] * rows,
	                "Employment": ["FT"] * rows,
	            }
	        )

	    def test_drops_other_from_specified_columns(self):
	        """Only the row free of 'Other' in the listed columns remains."""
	        frame = self._frame(
	            ["USA", "Other", "Germany"],
	            ["Dev", "Dev", "Other"],
	        )
	        settings = self._config("Other", ["Country", "DevType"])
	        remaining = drop_other_rows(frame, settings)
	        assert len(remaining) == 1
	        assert remaining.iloc[0]["Country"] == "USA"

	    def test_no_drop_when_list_empty(self):
	        """An empty drop_other_from list leaves both rows in place."""
	        frame = self._frame(["USA", "Other"], ["Dev", "Other"])
	        settings = self._config("Other", [])
	        remaining = drop_other_rows(frame, settings)
	        assert len(remaining) == 2

	    def test_uses_configured_other_name(self):
	        """Matching uses the configured other_category label ('Misc' here)."""
	        frame = self._frame(["USA", "Misc"], ["Dev", "Dev"])
	        settings = self._config("Misc", ["Country"])
	        remaining = drop_other_rows(frame, settings)
	        assert len(remaining) == 1


	class TestExtractValidCategories:
	    """Tests for extract_valid_categories()."""

	    def test_returns_sorted_unique_values(self):
	        """Returns sorted unique values for each categorical feature."""
	        df = pd.DataFrame(
	            {
	                "Country": ["Germany", "USA", "Germany"],
	                "EdLevel": ["BS", "MS", "BS"],
	                "DevType": ["Front", "Back", "Front"],
	                "Industry": ["SW", "Fin", "SW"],
	                "Age": ["25-34", "35-44", "25-34"],
	                "ICorPM": ["IC", "PM", "IC"],
	                "OrgSize": ["Small", "Large", "Small"],
	                "Employment": ["FT", "PT", "FT"],
	            }
	        )
	        result = extract_valid_categories(df)
	        # Assert every fixture column so that a regression affecting a single
	        # feature (DevType, Industry, Age included) cannot slip through.
	        assert result["Country"] == ["Germany", "USA"]
	        assert result["EdLevel"] == ["BS", "MS"]
	        assert result["DevType"] == ["Back", "Front"]
	        assert result["Industry"] == ["Fin", "SW"]
	        assert result["Age"] == ["25-34", "35-44"]
	        assert result["ICorPM"] == ["IC", "PM"]
	        assert result["OrgSize"] == ["Large", "Small"]
	        assert result["Employment"] == ["FT", "PT"]

	    def test_all_categorical_features_present(self):
	        """All 8 categorical features are present as keys."""
	        df = pd.DataFrame(
	            {
	                "Country": ["USA"],
	                "EdLevel": ["BS"],
	                "DevType": ["Dev"],
	                "Industry": ["SW"],
	                "Age": ["25-34"],
	                "ICorPM": ["IC"],
	                "OrgSize": ["Small"],
	                "Employment": ["FT"],
	            }
	        )
	        result = extract_valid_categories(df)
	        assert set(result.keys()) == {
	            "Country",
	            "EdLevel",
	            "DevType",
	            "Industry",
	            "Age",
	            "ICorPM",
	            "OrgSize",
	            "Employment",
	        }

	    def test_excludes_nan_values(self):
	        """NaN values are not included in valid categories."""
	        df = pd.DataFrame(
	            {
	                "Country": ["USA", np.nan],
	                "EdLevel": ["BS", "MS"],
	                "DevType": ["Dev", "Dev"],
	                "Industry": ["SW", "SW"],
	                "Age": ["25-34", "25-34"],
	                "ICorPM": ["IC", "IC"],
	                "OrgSize": ["Small", "Small"],
	                "Employment": ["FT", "FT"],
	            }
	        )
	        result = extract_valid_categories(df)
	        assert result["Country"] == ["USA"]


	class TestComputeCurrencyRates:
	    """Checks for compute_currency_rates() on small hand-built frames."""

	    def test_computes_rates_for_valid_countries(self):
	        """Both requested countries get an entry with the expected code."""
	        usd, eur = "USD United States Dollar", "EUR European Euro"
	        pay = [100000.0, 120000.0, 80000.0, 90000.0]
	        frame = pd.DataFrame(
	            {
	                "Country": ["USA"] * 2 + ["Germany"] * 2,
	                "Currency": [usd] * 2 + [eur] * 2,
	                "CompTotal": list(pay),
	                "ConvertedCompYearly": list(pay),
	            }
	        )
	        rates = compute_currency_rates(frame, ["USA", "Germany"])
	        assert "USA" in rates
	        assert "Germany" in rates
	        expected_codes = (("USA", "USD"), ("Germany", "EUR"))
	        for country, code in expected_codes:
	            assert rates[country]["code"] == code
	        assert isinstance(rates["USA"]["rate"], float)

	    def test_skips_countries_not_in_data(self):
	        """A requested country absent from the frame gets no entry."""
	        frame = pd.DataFrame(
	            {
	                "Country": ["USA"],
	                "Currency": ["USD United States Dollar"],
	                "CompTotal": [100000.0],
	                "ConvertedCompYearly": [100000.0],
	            }
	        )
	        rates = compute_currency_rates(frame, ["USA", "Narnia"])
	        assert "USA" in rates
	        assert "Narnia" not in rates

	    def test_returns_dict_with_expected_keys(self):
	        """A country entry exposes exactly the code, name and rate keys."""
	        usd = "USD United States Dollar"
	        frame = pd.DataFrame(
	            {
	                "Country": ["USA"] * 2,
	                "Currency": [usd] * 2,
	                "CompTotal": [100000.0] * 2,
	                "ConvertedCompYearly": [100000.0] * 2,
	            }
	        )
	        rates = compute_currency_rates(frame, ["USA"])
	        expected_keys = {"code", "name", "rate"}
	        assert set(rates["USA"].keys()) == expected_keys

	    def test_filters_extreme_rates(self):
	        """The reported rate is 1.0 even with one wildly off row present."""
	        usd = "USD United States Dollar"
	        frame = pd.DataFrame(
	            {
	                "Country": ["USA"] * 2,
	                "Currency": [usd] * 2,
	                "CompTotal": [100000.0, 0.0001],
	                "ConvertedCompYearly": [100000.0] * 2,
	            }
	        )
	        rates = compute_currency_rates(frame, ["USA"])
	        # The rate=1.0 row should be kept, the extreme one filtered
	        assert rates["USA"]["rate"] == 1.0

	    def test_empty_dataframe(self):
	        """A frame with the right columns but no rows yields an empty dict."""
	        column_names = ["Country", "Currency", "CompTotal", "ConvertedCompYearly"]
	        frame = pd.DataFrame(columns=column_names)
	        rates = compute_currency_rates(frame, ["USA"])
	        assert rates == {}