Spaces:

Dhananjaykhengare
/

ml_model_builder

Sleeping

App Files Files Community

ml_model_builder / utils /preprocessing.py

Dhananjaykhengare

Upload 10 files

b9a43be verified 3 months ago

raw

history blame contribute delete

1.95 kB


	import pandas as pd
	from sklearn.preprocessing import StandardScaler, LabelEncoder

	def _fill_with_mode(df, columns):
	    """Fill missing values in each of *columns* of *df* with that column's mode.

	    Modifies *df* in place. Columns without missing values are skipped, and
	    so are columns whose values are all missing: ``Series.mode()`` is empty
	    for them, so there is no value to fill with.
	    """
	    for col in columns:
	        if not df[col].isna().any():
	            continue
	        modes = df[col].mode()
	        if modes.empty:
	            # Every value is missing; leave the column untouched instead
	            # of failing on ``modes[0]``.
	            continue
	        df[col] = df[col].fillna(modes.iloc[0])


	def preprocess_data(df, target_col, missing_strategy="drop", transformation_map=None):
	    """Return a preprocessed copy of *df*; the input frame is not modified.

	    Processing happens in three steps:

	    1. Missing values are handled according to *missing_strategy*.
	    2. Per-column transformations from *transformation_map* are applied.
	    3. The target column is label encoded when it is object/category typed.

	    Args:
	        df: Input ``pandas.DataFrame``.
	        target_col: Name of the target column. Ignored when falsy or when
	            it is not a column of *df*.
	        missing_strategy: One of
	            ``"drop"`` - drop every row that contains a missing value;
	            ``"mean"`` / ``"median"`` - fill numeric columns with their
	            mean / median and non-numeric columns with their mode;
	            ``"mode"`` - fill every column with its mode.
	            Any other value leaves missing values untouched. Columns that
	            consist only of missing values have no mode and are left as is.
	        transformation_map: Optional mapping ``{column: transform}`` where
	            *transform* is ``"Label Encode"`` (``LabelEncoder``) or
	            ``"Normalize"`` (``StandardScaler``). Any other value leaves
	            the column unchanged.

	    Returns:
	        The preprocessed ``pandas.DataFrame``.

	    Raises:
	        KeyError: If *transformation_map* names a column missing from *df*.
	    """
	    df = df.copy()

	    # 1. Handle missing values
	    if missing_strategy == "drop":
	        df = df.dropna()
	    elif missing_strategy in ("mean", "median"):
	        numeric_cols = df.select_dtypes(include=["number"]).columns
	        non_numeric_cols = df.columns.difference(numeric_cols)
	        if missing_strategy == "mean":
	            fill_values = df[numeric_cols].mean()
	        else:
	            fill_values = df[numeric_cols].median()
	        df[numeric_cols] = df[numeric_cols].fillna(fill_values)
	        # Mean/median are undefined for non-numeric data; use the mode.
	        _fill_with_mode(df, non_numeric_cols)
	    elif missing_strategy == "mode":
	        _fill_with_mode(df, df.columns)

	    # 2. Apply feature transformations
	    if transformation_map:
	        for col, transform in transformation_map.items():
	            if transform == "Label Encode":
	                if df[col].dtype == "object" or str(df[col].dtype).startswith("category"):
	                    df[col] = LabelEncoder().fit_transform(df[col])
	                else:
	                    # Other dtypes are encoded via their string form.
	                    df[col] = LabelEncoder().fit_transform(df[col].astype(str))
	            elif transform == "Normalize":
	                scaler = StandardScaler()
	                df[[col]] = scaler.fit_transform(df[[col]])
	            # "No Transformation" = leave column as is

	    # 3. Label encode target column if it's a string
	    if target_col and target_col in df.columns:
	        if df[target_col].dtype == "object" or str(df[target_col].dtype).startswith("category"):
	            df[target_col] = LabelEncoder().fit_transform(df[target_col])

	    return df