Spaces:

dima806
/

developer_salary_prediction

Running

App Files Files Community

developer_salary_prediction / src /preprocess.py

dima806

Upload 39 files

eeeaee6 verified 19 days ago

raw

history blame contribute delete

4.4 kB

	"""Pre-process survey data and generate config artifacts.

	Validates the raw CSV, applies the same data-cleaning steps used by
	src/train.py, then writes:

	- config/valid_categories.yaml — valid input values for runtime guardrails
	- config/currency_rates.yaml — per-country median currency conversion rates

	Run before ``make train`` to validate data and pre-generate configs, or
	standalone to inspect what categories the current dataset supports.
	"""

	import sys
	from pathlib import Path

	import pandas as pd
	import yaml

	from src.train import (
	CATEGORICAL_FEATURES,
	apply_cardinality_reduction,
	compute_currency_rates,
	drop_other_rows,
	extract_valid_categories,
	filter_salaries,
	)

	REQUIRED_COLUMNS = [
	"Country",
	"YearsCode",
	"WorkExp",
	"EdLevel",
	"DevType",
	"Industry",
	"Age",
	"ICorPM",
	"OrgSize",
	"Employment",
	"Currency",
	"CompTotal",
	"ConvertedCompYearly",
	]


	def validate_columns(data_path: Path) -> None:
	"""Exit 1 if any required column is absent from the CSV header."""
	header = pd.read_csv(data_path, nrows=0)
	missing = [c for c in REQUIRED_COLUMNS if c not in header.columns]
	if missing:
	print(f"Error: missing required columns: {missing}")
	sys.exit(1)
	print(f"All {len(REQUIRED_COLUMNS)} required columns present.")


	def print_category_summary(df: pd.DataFrame) -> None:
	"""Print the number of unique categories per categorical feature."""
	for col in CATEGORICAL_FEATURES:
	n = df[col].dropna().nunique()
	print(f" {col}: {n} categories")


	def main() -> None:
	"""Validate data, apply preprocessing, and write config artifacts."""
	config_path = Path("config/model_parameters.yaml")
	with open(config_path) as f:
	config = yaml.safe_load(f)

	data_path = Path("data/survey_results_public.csv")

	# Step 1 — Validate data file ------------------------------------------------
	print("=" * 60)
	print("STEP 1 — Validate data file")
	print("=" * 60)

	if not data_path.exists():
	print(f"Error: {data_path} not found.")
	print("Download from: https://insights.stackoverflow.com/survey")
	sys.exit(1)

	print(f"Checking columns in {data_path} ...")
	validate_columns(data_path)

	# Step 2 — Load and filter salaries ------------------------------------------
	print("\n" + "=" * 60)
	print("STEP 2 — Load and filter salaries")
	print("=" * 60)

	df = pd.read_csv(data_path, usecols=REQUIRED_COLUMNS)
	print(f"Loaded {len(df):,} rows")

	df = filter_salaries(df, config)
	print(f"After salary filtering: {len(df):,} rows")

	# Step 3 — Cardinality reduction ---------------------------------------------
	print("\n" + "=" * 60)
	print("STEP 3 — Cardinality reduction")
	print("=" * 60)

	df = apply_cardinality_reduction(df)
	before = len(df)
	df = drop_other_rows(df, config)
	drop_cols = config["features"]["cardinality"].get("drop_other_from", [])
	if drop_cols:
	print(f"Dropped {before - len(df):,} rows with 'Other' in {drop_cols}")
	print(f"Final dataset: {len(df):,} rows")

	# Step 4 — Category summary --------------------------------------------------
	print("\n" + "=" * 60)
	print("STEP 4 — Category summary")
	print("=" * 60)

	print_category_summary(df)

	# Step 5 — Write config artifacts --------------------------------------------
	print("\n" + "=" * 60)
	print("STEP 5 — Write config artifacts")
	print("=" * 60)

	valid_categories = extract_valid_categories(df)
	vc_path = Path("config/valid_categories.yaml")
	with open(vc_path, "w") as f:
	yaml.dump(valid_categories, f, default_flow_style=False, sort_keys=False)
	n_total = sum(len(v) for v in valid_categories.values())
	print(f"Saved {vc_path} ({n_total} total valid values)")

	currency_rates = compute_currency_rates(df, valid_categories["Country"])
	cr_path = Path("config/currency_rates.yaml")
	with open(cr_path, "w") as f:
	yaml.dump(
	currency_rates,
	f,
	default_flow_style=False,
	sort_keys=True,
	allow_unicode=True,
	)
	print(f"Saved {cr_path} ({len(currency_rates)} countries)")

	print("\nPre-processing complete. Ready for `make train`.")


	if __name__ == "__main__":
	main()