Spaces:

KaiquanMah
/

is_click

Sleeping

App Files Files Community

KaiquanMah commited on Feb 15

Commit

2ea05a3

verified ·

1 Parent(s): e31c88e

fixed by yair

Browse files

Files changed (5) hide show

config.py +16 -0
data_loader.py +216 -0
model_manager.py +25 -0
model_predictor.py +34 -0
model_trainer.py +46 -0

config.py ADDED Viewed

	@@ -0,0 +1,16 @@

+import os
+# Directories
+MODEL_DIR = "models"
+# Model File Paths
+CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, "catboost_model.cbm")
+XGB_MODEL_PATH = os.path.join(MODEL_DIR, "xgb_model.json")
+RF_MODEL_PATH = os.path.join(MODEL_DIR, "rf_model.pkl")
+# Model Parameters
+CATBOOST_PARAMS = {"iterations": 800, "depth": 6, "learning_rate": 0.05, "random_seed": 42, "task_type": "CPU", "verbose": 100}
+XGB_PARAMS = {"n_estimators": 800, "learning_rate": 0.05, "max_depth": 6, "tree_method": "hist", "random_state": 42}
+RF_PARAMS = {"n_estimators": 200, "max_depth": 15, "random_state": 42, "n_jobs": -1}

data_loader.py ADDED Viewed

	@@ -0,0 +1,216 @@

+import pandas as pd
+import numpy as np
+import time
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+from imblearn.over_sampling import SMOTE
+# ===========================
+#  CONFIGURATION
+# ===========================
+TRAIN_PATH = "data/train_dataset_full - train_dataset_full.csv"
+# TRAIN_PATH = "data/train_dataset_full - train_dataset_partial_for_testing.csv"
+TEST_PATH = "data/X_test_1st.csv"  # Replace with actual test dataset path
+CATEGORICAL_COLUMNS = ["gender", "product",]
+IDS_COLUMNS = [ "user_id", "session_id", "campaign_id", "webpage_id"]
+TARGET_COLUMN = "is_click"
+FEATURE_COLUMNS = [
+    "age_level", "gender", "product",
+    "product_category_1", "product_category_2", "user_group_id",
+    "user_depth", "city_development_index", "var_1"
+]
+AGGREGATED_COLUMNS = [
+    "click_sum_age_sex_prod", "click_count_age_sex_prod",
+    "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
+    "click_sum_city_age_prod", "click_count_city_age_prod",
+    "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
+]
+TEMPORAL_COLUMNS = ["year", "month", "day", "hour", "minute", "weekday"]
+# ===========================
+#  LOAD DATASETS
+# ===========================
+def load_data(train_path=TRAIN_PATH, test_path=TEST_PATH):
+    """Load train & test datasets, handling missing values."""
+    train_df = pd.read_csv(train_path)
+    y_train = train_df[TARGET_COLUMN]
+    train_df = train_df[~y_train.isnull()]
+    test_df = pd.read_csv(test_path)
+    train_df["DateTime"] = pd.to_datetime(train_df["DateTime"])
+    test_df["DateTime"] = pd.to_datetime(test_df["DateTime"])
+    train_df["DateTime"].fillna(train_df["DateTime"].mode()[0], inplace=True)
+    test_df["DateTime"].fillna(test_df["DateTime"].mode()[0], inplace=True)
+    if "DateTime" in train_df.columns:
+        train_df["DateTime"] = pd.to_datetime(train_df["DateTime"])
+        train_df["year"] = train_df["DateTime"].dt.year
+        train_df["month"] = train_df["DateTime"].dt.month
+        train_df["day"] = train_df["DateTime"].dt.day
+        train_df["hour"] = train_df["DateTime"].dt.hour
+        train_df["minute"] = train_df["DateTime"].dt.minute
+        train_df["weekday"] = train_df["DateTime"].dt.weekday
+        train_df.drop("DateTime", axis=1, inplace=True)
+    if "DateTime" in test_df.columns:
+        test_df["DateTime"] = pd.to_datetime(test_df["DateTime"])
+        test_df["year"] = test_df["DateTime"].dt.year
+        test_df["month"] = test_df["DateTime"].dt.month
+        test_df["day"] = test_df["DateTime"].dt.day
+        test_df["hour"] = test_df["DateTime"].dt.hour
+        test_df["minute"] = test_df["DateTime"].dt.minute
+        test_df["weekday"] = test_df["DateTime"].dt.weekday
+        test_df.drop("DateTime", axis=1, inplace=True)
+    # Fill missing values
+    train_df.fillna(-1, inplace=True)
+    test_df.fillna(-1, inplace=True)
+    return train_df, test_df
+# ===========================
+#  FEATURE ENGINEERING: AGGREGATIONS
+# ===========================
+def add_aggregated_features(df, test_df):
+    """Creates aggregated features based on age, gender, and product interactions."""
+    # Aggregate by age & gender vs product
+    age_sex_product_agg = df.groupby(["age_level", "gender", "product"]).agg({
+        "is_click": ["sum", "count"],
+        "campaign_id": "nunique",
+        "webpage_id": "nunique"
+    }).reset_index()
+    # Rename columns after aggregation
+    age_sex_product_agg.columns = ["age_level", "gender", "product",
+                                   "click_sum_age_sex_prod", "click_count_age_sex_prod",
+                                   "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod"]
+    # Merge into train & test datasets
+    df = df.merge(age_sex_product_agg, on=["age_level", "gender", "product"], how="left")
+    test_df = test_df.merge(age_sex_product_agg, on=["age_level", "gender", "product"], how="left")
+    # Aggregate by city, age, product
+    city_age_product_agg = df.groupby(["city_development_index", "age_level", "product"]).agg({
+        "is_click": ["sum", "count"],
+        "campaign_id": "nunique",
+        "webpage_id": "nunique"
+    }).reset_index()
+    # Rename columns
+    city_age_product_agg.columns = ["city_development_index", "age_level", "product",
+                                    "click_sum_city_age_prod", "click_count_city_age_prod",
+                                    "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"]
+    # Merge into train & test datasets
+    df = df.merge(city_age_product_agg, on=["city_development_index", "age_level", "product"], how="left")
+    test_df = test_df.merge(city_age_product_agg, on=["city_development_index", "age_level", "product"], how="left")
+    # Fill missing values after merging
+    df.fillna(0, inplace=True)
+    test_df.fillna(0, inplace=True)
+    return df, test_df
+# ===========================
+#  ENCODE & NORMALIZE FEATURES
+# ===========================
+def preprocess_data(df, test_df, categorical_columns):
+    """Encodes categorical features, normalizes numerical features, and prepares the dataset."""
+    label_encoders = {}
+    for col in categorical_columns:
+        le = LabelEncoder()
+        df[col] = le.fit_transform(df[col].astype(str))
+        test_df[col] = test_df[col].astype(str).map(lambda s: le.transform([s])[0] if s in le.classes_ else -1)
+        label_encoders[col] = le  # Store encoders for later use
+    numerical_columns = [col for col in FEATURE_COLUMNS + AGGREGATED_COLUMNS if col not in categorical_columns]
+    # scaler = StandardScaler()
+    # df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
+    # test_df[numerical_columns] = scaler.transform(test_df[numerical_columns])
+    return df, test_df, label_encoders,# scaler
+# ===========================
+#  SPLIT DATA & HANDLE IMBALANCE
+# ===========================
+def split_and_balance_data(df, target_column):
+    """Splits data into training and validation sets, applies SMOTE to balance classes."""
+    X = df[IDS_COLUMNS + FEATURE_COLUMNS + AGGREGATED_COLUMNS + TEMPORAL_COLUMNS]
+    y = df[target_column]
+    # Handle class imbalance using SMOTE
+    smote = SMOTE(sampling_strategy="auto", random_state=42)
+    X_resampled, y_resampled = smote.fit_resample(X, y)
+    # Split into training & validation sets
+    X_train, X_val, y_train, y_val = train_test_split(
+        X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
+    )
+    return X_train, X_val, y_train, y_val
+# ===========================
+#  VISUALIZE FEATURES
+# ===========================
+def visualize_features():
+    """Generates visualizations for aggregated features."""
+    df, _ = load_data()
+    df, _ = add_aggregated_features(df, df)
+    sns.set_style("whitegrid")
+    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
+    sns.barplot(x="age_level", y="click_sum_age_sex_prod", hue="gender",
+                data=df, ax=axes[0], palette="coolwarm")
+    axes[0].set_title("Total Clicks by Age & Gender vs Product")
+    sns.barplot(x="city_development_index", y="click_sum_city_age_prod", hue="age_level",
+                data=df, ax=axes[1], palette="viridis")
+    axes[1].set_title("Total Clicks by City Development Index & Age")
+    plt.tight_layout()
+    plt.show()
+# ===========================
+#  RUN FULL DATA PROCESSING PIPELINE
+# ===========================
+def load_and_process_data():
+    """Runs the full data processing pipeline and returns preprocessed training & test data."""
+    df, test_df = load_data()
+    df, test_df = add_aggregated_features(df, test_df)
+    df, test_df, label_encoders = preprocess_data(df, test_df, CATEGORICAL_COLUMNS)
+    X_train, X_val, y_train, y_val = split_and_balance_data(df, TARGET_COLUMN)
+    return X_train, X_val, y_train, y_val, test_df
+if __name__ == "__main__":
+    print("🔹 Loading and processing data...")
+    X_train, X_val, y_train, y_val, test_df = load_and_process_data()
+    print("✅ Data successfully loaded and processed!")

model_manager.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import joblib
+from catboost import CatBoostClassifier
+from xgboost import XGBClassifier
+from config import CATBOOST_MODEL_PATH, XGB_MODEL_PATH, RF_MODEL_PATH
+def save_models(models):
+    """ Save trained models """
+    models["CatBoost"].save_model(CATBOOST_MODEL_PATH)
+    if models["XGBoost"] is not None:
+        # Save XGBoost model in binary format to reduce memory usage
+        models["XGBoost"].get_booster().save_model(XGB_MODEL_PATH)
+    joblib.dump(models["RandomForest"], RF_MODEL_PATH)
+    print("✅ Models saved successfully!")
+def load_models():
+    """ Load trained models """
+    catboost = CatBoostClassifier()
+    catboost.load_model(CATBOOST_MODEL_PATH)
+    xgb = XGBClassifier()  # Load XGBoost model in binary format
+    xgb.load_model(XGB_MODEL_PATH)
+    rf = joblib.load(RF_MODEL_PATH)
+    return {"CatBoost": catboost, "XGBoost": xgb, "RandomForest": rf}

model_predictor.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import numpy as np
+import pandas as pd
+from catboost import Pool
+from data_loader import CATEGORICAL_COLUMNS, IDS_COLUMNS, TARGET_COLUMN, FEATURE_COLUMNS, AGGREGATED_COLUMNS, TEMPORAL_COLUMNS
+def predict(models, X_test):
+    """ Make predictions using trained models """
+    # Ensure categorical features are properly handled
+    cat_features =  CATEGORICAL_COLUMNS
+    test_predictions = {}
+    #
+    # test_predictions = {name: np.array(model.predict(X_test)).squeeze() for name, model in models.items()}
+    for name, model in models.items():
+        if "CatBoost" in name:  # Handle CatBoost models
+            pool = Pool(data=X_test, cat_features=cat_features)
+            test_predictions[name] = model.predict(pool)
+        else:  # Other models
+            # reordering columns to match the order of columns in the model
+            new_X_test = X_test[IDS_COLUMNS + FEATURE_COLUMNS + AGGREGATED_COLUMNS + TEMPORAL_COLUMNS]
+            test_predictions[name] = np.array(model.predict(new_X_test)).squeeze()
+    test_predictions_df = pd.DataFrame(test_predictions)
+    # Ensure binary values (0 or 1)
+    for col in test_predictions_df.columns:
+        test_predictions_df[col] = (test_predictions_df[col] > 0.5).astype(int)
+    # Apply "at least one model predicts 1" rule
+    test_predictions_df["is_click_predicted"] = test_predictions_df.max(axis=1)
+    return test_predictions_df

model_trainer.py ADDED Viewed

	@@ -0,0 +1,46 @@

+import time
+from catboost import CatBoostClassifier
+from xgboost import XGBClassifier
+from sklearn.ensemble import RandomForestClassifier
+from config import CATBOOST_PARAMS, XGB_PARAMS, RF_PARAMS
+def train_models(X_train, y_train, categorical_columns):
+    """ Train and return machine learning models """
+    models = {}
+    # Train CatBoost
+    start_time = time.time()
+    catboost = CatBoostClassifier(**CATBOOST_PARAMS)
+    catboost.fit(X_train, y_train, cat_features=[X_train.columns.get_loc(col) for col in categorical_columns])
+    models["CatBoost"] = catboost
+    print(f"✅ CatBoost trained in {time.time() - start_time:.2f} sec")
+    # Train XGBoost
+    if set(y_train.unique()) <= {0, 1}:  # Ensure only valid labels exist
+        start_time = time.time()
+        xgb = XGBClassifier(**XGB_PARAMS)
+        xgb.fit(X_train, y_train)
+        models["XGBoost"] = xgb
+        print(f"✅ XGBoost trained in {time.time() - start_time:.2f} sec")
+    else:
+        x_train_xgboost = X_train[~y_train.isna()]
+        y_train_xgboost = y_train.dropna()
+        if set(y_train_xgboost.unique()) <= {0, 1}:
+            start_time = time.time()
+            xgb = XGBClassifier(**XGB_PARAMS)
+            xgb.fit(x_train_xgboost, y_train_xgboost)
+            models["XGBoost"] = xgb
+            print(f"✅ XGBoost trained in {time.time() - start_time:.2f} sec")
+        else:
+            models["XGBoost"] = None
+            print("⚠ XGBoost training skipped due to invalid labels!")
+    # Train RandomForest
+    start_time = time.time()
+    rf = RandomForestClassifier(**RF_PARAMS)
+    rf.fit(X_train, y_train)
+    models["RandomForest"] = rf
+    print(f"✅ RandomForest trained in {time.time() - start_time:.2f} sec")
+    return models