KaiquanMah committed on
Commit
2ea05a3
·
verified ·
1 Parent(s): e31c88e

fixed by yair

Browse files
Files changed (5) hide show
  1. config.py +16 -0
  2. data_loader.py +216 -0
  3. model_manager.py +25 -0
  4. model_predictor.py +34 -0
  5. model_trainer.py +46 -0
config.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
# Directory that holds every serialized model artifact.
MODEL_DIR = "models"

# One artifact file per estimator, all located inside MODEL_DIR.
CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, "catboost_model.cbm")
XGB_MODEL_PATH = os.path.join(MODEL_DIR, "xgb_model.json")
RF_MODEL_PATH = os.path.join(MODEL_DIR, "rf_model.pkl")

# Keyword arguments unpacked into each estimator's constructor.
CATBOOST_PARAMS = dict(
    iterations=800,
    depth=6,
    learning_rate=0.05,
    random_seed=42,
    task_type="CPU",
    verbose=100,
)
XGB_PARAMS = dict(
    n_estimators=800,
    learning_rate=0.05,
    max_depth=6,
    tree_method="hist",
    random_state=42,
)
RF_PARAMS = dict(
    n_estimators=200,
    max_depth=15,
    random_state=42,
    n_jobs=-1,
)
data_loader.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import time
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ from sklearn.model_selection import train_test_split
7
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
8
+ from imblearn.over_sampling import SMOTE
9
+
10
+ # ===========================
11
+ # CONFIGURATION
12
+ # ===========================
13
+
14
# Input CSV locations, relative to the working directory.
TRAIN_PATH = "data/train_dataset_full - train_dataset_full.csv"
# Smaller alternative for quick experiments:
# TRAIN_PATH = "data/train_dataset_full - train_dataset_partial_for_testing.csv"
TEST_PATH = "data/X_test_1st.csv"  # Replace with actual test dataset path

# Columns that get label-encoded.
CATEGORICAL_COLUMNS = ["gender", "product"]

# Identifier columns; they are part of the model input frame.
IDS_COLUMNS = ["user_id", "session_id", "campaign_id", "webpage_id"]

# Column the models learn to predict.
TARGET_COLUMN = "is_click"

# Raw per-row features read straight from the CSV.
FEATURE_COLUMNS = [
    "age_level",
    "gender",
    "product",
    "product_category_1",
    "product_category_2",
    "user_group_id",
    "user_depth",
    "city_development_index",
    "var_1",
]

# Group statistics produced by add_aggregated_features(): four statistics
# for the age/gender/product grouping, then the same four for the
# city/age/product grouping.
AGGREGATED_COLUMNS = [
    f"{statistic}_{grouping}"
    for grouping in ("age_sex_prod", "city_age_prod")
    for statistic in ("click_sum", "click_count", "unique_campaigns", "unique_webpages")
]

# Calendar parts derived from the DateTime column by load_data().
TEMPORAL_COLUMNS = ["year", "month", "day", "hour", "minute", "weekday"]
35
+ # ===========================
36
+ # LOAD DATASETS
37
+ # ===========================
38
+
39
def _expand_datetime(frame, column="DateTime"):
    """Replace a timestamp column with year/month/day/hour/minute/weekday columns.

    Args:
        frame: DataFrame that may contain ``column``.
        column: Name of the timestamp column to expand.

    Returns:
        A new DataFrame without ``column`` and with the six calendar columns
        appended, or ``frame`` itself when ``column`` is absent.
    """
    if column not in frame.columns:
        return frame

    stamps = pd.to_datetime(frame[column])
    # Impute missing timestamps with the most frequent one. mode() is empty
    # when every value is missing, in which case nothing can be imputed here
    # and the caller's generic fill handles the resulting gaps.
    most_common = stamps.mode()
    if not most_common.empty:
        stamps = stamps.fillna(most_common.iloc[0])

    # drop() returns a new frame, so the caller's object is left untouched.
    frame = frame.drop(columns=column)
    for part in ("year", "month", "day", "hour", "minute", "weekday"):
        frame[part] = getattr(stamps.dt, part)
    return frame


def load_data(train_path=TRAIN_PATH, test_path=TEST_PATH):
    """Load the train and test CSVs and fill their missing values.

    Training rows whose target is missing are dropped. When a ``DateTime``
    column is present it is expanded into calendar columns and removed.
    Every remaining missing value is replaced with ``-1``.

    Args:
        train_path: Path of the training CSV; it must contain TARGET_COLUMN.
        test_path: Path of the test CSV.

    Returns:
        tuple: ``(train_df, test_df)``.
    """
    train_df = pd.read_csv(train_path)
    # .copy() makes the filtered frame independent, so the column
    # assignments below never act on a view of the unfiltered data.
    train_df = train_df[train_df[TARGET_COLUMN].notna()].copy()
    test_df = pd.read_csv(test_path)

    train_df = _expand_datetime(train_df)
    test_df = _expand_datetime(test_df)

    # Assign the result instead of using inplace=True on a selection, which
    # is deprecated in recent pandas and ineffective under copy-on-write.
    train_df = train_df.fillna(-1)
    test_df = test_df.fillna(-1)

    return train_df, test_df
78
+
79
+
80
+ # ===========================
81
+ # FEATURE ENGINEERING: AGGREGATIONS
82
+ # ===========================
83
+
84
def _click_stats_by(df, keys, suffix):
    """Return one row per ``keys`` group with click and reach statistics.

    Named aggregation ties every output column to its source column and
    statistic, so the result does not depend on the order in which a
    flattened MultiIndex happens to list its columns.
    """
    return df.groupby(keys, as_index=False).agg(**{
        f"click_sum_{suffix}": ("is_click", "sum"),
        f"click_count_{suffix}": ("is_click", "count"),
        f"unique_campaigns_{suffix}": ("campaign_id", "nunique"),
        f"unique_webpages_{suffix}": ("webpage_id", "nunique"),
    })


def add_aggregated_features(df, test_df):
    """Attach group-level click statistics to the train and test frames.

    Statistics are computed on ``df`` only, for two groupings
    (age/gender/product and city/age/product), and left-joined onto both
    frames. Missing values remaining after the joins, including those for
    test rows whose group never occurs in ``df``, are replaced with 0.

    Args:
        df: Training frame containing ``is_click``, ``campaign_id``,
            ``webpage_id`` and the grouping columns.
        test_df: Frame containing the grouping columns.

    Returns:
        tuple: ``(df, test_df)`` as new frames with eight extra columns each;
        the input frames are not modified.
    """
    # NOTE(review): the click statistics are derived from the training target
    # and joined back onto the same training rows — confirm this is intended.
    groupings = (
        (["age_level", "gender", "product"], "age_sex_prod"),
        (["city_development_index", "age_level", "product"], "city_age_prod"),
    )

    train_rows = df  # statistics always come from the original training rows
    for keys, suffix in groupings:
        stats = _click_stats_by(train_rows, keys, suffix)
        df = df.merge(stats, on=keys, how="left")
        test_df = test_df.merge(stats, on=keys, how="left")

    return df.fillna(0), test_df.fillna(0)
124
+
125
+
126
+ # ===========================
127
+ # ENCODE & NORMALIZE FEATURES
128
+ # ===========================
129
+
130
def preprocess_data(df, test_df, categorical_columns):
    """Label-encode the categorical columns of the train and test frames.

    Each encoder is fitted on the string form of the training column. Test
    values that the encoder did not see during fitting are encoded as ``-1``.
    No numerical scaling is applied. Both frames are modified in place and
    also returned.

    Args:
        df: Training frame.
        test_df: Test frame.
        categorical_columns: Names of the columns to encode.

    Returns:
        tuple: ``(df, test_df, label_encoders)`` where ``label_encoders``
        maps each column name to its fitted ``LabelEncoder``.
    """
    label_encoders = {}
    for col in categorical_columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))

        # A label's code is its position in classes_. Building the lookup
        # once replaces a transform() call per test row with a dict lookup.
        code_of = {label: code for code, label in enumerate(le.classes_)}
        test_df[col] = (
            test_df[col].astype(str).map(code_of).fillna(-1).astype("int64")
        )

        label_encoders[col] = le  # Store encoders for later use

    return df, test_df, label_encoders
148
+
149
+
150
+ # ===========================
151
+ # SPLIT DATA & HANDLE IMBALANCE
152
+ # ===========================
153
+
154
def split_and_balance_data(df, target_column):
    """Split into train/validation sets and balance the training set with SMOTE.

    The split happens first, stratified on the target, and SMOTE is then
    fitted on the training portion only. Resampling before the split would
    place synthetic points interpolated from validation rows into the
    training set and would also change the validation class balance.

    Args:
        df: Frame holding the ID, feature, aggregated and temporal columns
            plus ``target_column``.
        target_column: Name of the label column.

    Returns:
        tuple: ``(X_train, X_val, y_train, y_val)``; only the training pair
        is resampled.
    """
    X = df[IDS_COLUMNS + FEATURE_COLUMNS + AGGREGATED_COLUMNS + TEMPORAL_COLUMNS]
    y = df[target_column]

    # Hold out untouched validation rows before any synthetic sampling.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Handle class imbalance on the training portion only.
    smote = SMOTE(sampling_strategy="auto", random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    return X_train, X_val, y_train, y_val
170
+
171
+
172
+ # ===========================
173
+ # VISUALIZE FEATURES
174
+ # ===========================
175
+
176
def visualize_features():
    """Draw two side-by-side bar charts of the aggregated click sums.

    The training data is loaded from the default paths, the aggregated
    columns are added, and the figure is shown with ``plt.show()``.
    """
    frame, _ = load_data()
    # The training frame is passed as both arguments; only the first result is used.
    frame, _ = add_aggregated_features(frame, frame)

    sns.set_style("whitegrid")
    _figure, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(14, 6))

    # (axis, x column, y column, hue column, palette, title) for each panel.
    panels = (
        (left_ax, "age_level", "click_sum_age_sex_prod", "gender",
         "coolwarm", "Total Clicks by Age & Gender vs Product"),
        (right_ax, "city_development_index", "click_sum_city_age_prod", "age_level",
         "viridis", "Total Clicks by City Development Index & Age"),
    )
    for axis, x_col, y_col, hue_col, palette, title in panels:
        sns.barplot(x=x_col, y=y_col, hue=hue_col,
                    data=frame, ax=axis, palette=palette)
        axis.set_title(title)

    plt.tight_layout()
    plt.show()
196
+
197
+
198
+ # ===========================
199
+ # RUN FULL DATA PROCESSING PIPELINE
200
+ # ===========================
201
+
202
def load_and_process_data():
    """Run loading, aggregation, encoding and splitting in sequence.

    Returns:
        tuple: ``(X_train, X_val, y_train, y_val, test_df)``.
    """
    train_frame, test_frame = load_data()
    train_frame, test_frame = add_aggregated_features(train_frame, test_frame)
    # The fitted encoders are returned by preprocess_data but not used here.
    train_frame, test_frame, _encoders = preprocess_data(
        train_frame, test_frame, CATEGORICAL_COLUMNS
    )
    features_train, features_val, labels_train, labels_val = split_and_balance_data(
        train_frame, TARGET_COLUMN
    )
    return features_train, features_val, labels_train, labels_val, test_frame
211
+
212
+
213
if __name__ == "__main__":
    # Script entry point: run the whole pipeline once and report progress.
    print("🔹 Loading and processing data...")
    pipeline_output = load_and_process_data()
    X_train, X_val, y_train, y_val, test_df = pipeline_output
    print("✅ Data successfully loaded and processed!")
model_manager.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import joblib
2
+ from catboost import CatBoostClassifier
3
+ from xgboost import XGBClassifier
4
+ from config import CATBOOST_MODEL_PATH, XGB_MODEL_PATH, RF_MODEL_PATH
5
+
6
def save_models(models):
    """Write the trained models to the paths defined in ``config``.

    Args:
        models: Mapping with the keys ``"CatBoost"`` and ``"RandomForest"``
            and, optionally, ``"XGBoost"``. An ``"XGBoost"`` entry that is
            missing or ``None`` is skipped.
    """
    import os  # local import keeps this function independent of the file header

    # Create the target directories first so a fresh checkout can save models.
    for path in (CATBOOST_MODEL_PATH, XGB_MODEL_PATH, RF_MODEL_PATH):
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)

    models["CatBoost"].save_model(CATBOOST_MODEL_PATH)

    xgb_model = models.get("XGBoost")
    if xgb_model is not None:
        # Persist the underlying booster at XGB_MODEL_PATH.
        xgb_model.get_booster().save_model(XGB_MODEL_PATH)

    joblib.dump(models["RandomForest"], RF_MODEL_PATH)
    print("✅ Models saved successfully!")
14
+
15
def load_models():
    """Load the saved models from the paths defined in ``config``.

    Returns:
        dict: ``{"CatBoost": ..., "XGBoost": ..., "RandomForest": ...}``.
        The ``"XGBoost"`` entry is ``None`` when no file exists at
        XGB_MODEL_PATH, which is the case after ``save_models`` skipped a
        ``None`` XGBoost model.
    """
    import os  # local import keeps this function independent of the file header

    catboost = CatBoostClassifier()
    catboost.load_model(CATBOOST_MODEL_PATH)

    if os.path.exists(XGB_MODEL_PATH):
        xgb = XGBClassifier()
        xgb.load_model(XGB_MODEL_PATH)
    else:
        xgb = None
        print("⚠ XGBoost model file not found; continuing without XGBoost.")

    # NOTE: joblib.load unpickles the file, so only load artifacts you trust.
    rf = joblib.load(RF_MODEL_PATH)

    return {"CatBoost": catboost, "XGBoost": xgb, "RandomForest": rf}
model_predictor.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ from catboost import Pool
4
+
5
+ from data_loader import CATEGORICAL_COLUMNS, IDS_COLUMNS, TARGET_COLUMN, FEATURE_COLUMNS, AGGREGATED_COLUMNS, TEMPORAL_COLUMNS
6
+
7
def predict(models, X_test):
    """Predict with every available model and combine the results.

    Every model receives the same columns in the same order:
    ``IDS_COLUMNS + FEATURE_COLUMNS + AGGREGATED_COLUMNS + TEMPORAL_COLUMNS``.
    Models whose name contains ``"CatBoost"`` get them wrapped in a ``Pool``
    with CATEGORICAL_COLUMNS declared as categorical.

    Args:
        models: Mapping of model name to fitted model. Entries that are
            ``None`` are skipped.
        X_test: DataFrame containing at least the columns listed above.

    Returns:
        pandas.DataFrame: one 0/1 column per model plus
        ``is_click_predicted``, which is 1 when at least one model predicts 1.
    """
    feature_order = IDS_COLUMNS + FEATURE_COLUMNS + AGGREGATED_COLUMNS + TEMPORAL_COLUMNS
    features = X_test[feature_order]

    test_predictions = {}
    for name, model in models.items():
        if model is None:
            # A model can be None when its training was skipped.
            continue

        if "CatBoost" in name:  # Handle CatBoost models
            pool = Pool(data=features, cat_features=CATEGORICAL_COLUMNS)
            raw = model.predict(pool)
        else:  # Other models
            raw = model.predict(features)

        # reshape(-1) always yields a 1-D array. squeeze() would collapse a
        # single-row prediction to a scalar, which pd.DataFrame cannot take.
        test_predictions[name] = np.asarray(raw).reshape(-1).astype(float)

    test_predictions_df = pd.DataFrame(test_predictions)

    # Ensure binary values (0 or 1)
    for col in test_predictions_df.columns:
        test_predictions_df[col] = (test_predictions_df[col] > 0.5).astype(int)

    # Apply "at least one model predicts 1" rule
    test_predictions_df["is_click_predicted"] = test_predictions_df.max(axis=1)

    return test_predictions_df
34
+
model_trainer.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from catboost import CatBoostClassifier
3
+ from xgboost import XGBClassifier
4
+ from sklearn.ensemble import RandomForestClassifier
5
+ from config import CATBOOST_PARAMS, XGB_PARAMS, RF_PARAMS
6
+
7
+
8
def train_models(X_train, y_train, categorical_columns):
    """Fit CatBoost, XGBoost and RandomForest classifiers on the same data.

    Args:
        X_train: Training feature DataFrame.
        y_train: Training labels as a pandas Series.
        categorical_columns: Column names passed to CatBoost (as positions
            within ``X_train``) as categorical features.

    Returns:
        dict: model name -> fitted estimator. ``"XGBoost"`` maps to ``None``
        when the labels, after dropping missing ones, are not a subset of
        ``{0, 1}``.
    """
    binary_labels = {0, 1}
    models = {}

    # --- CatBoost ---
    began = time.time()
    cat_model = CatBoostClassifier(**CATBOOST_PARAMS)
    cat_positions = [X_train.columns.get_loc(name) for name in categorical_columns]
    cat_model.fit(X_train, y_train, cat_features=cat_positions)
    models["CatBoost"] = cat_model
    print(f"✅ CatBoost trained in {time.time() - began:.2f} sec")

    # --- XGBoost ---
    # Use the data as given when the labels are already valid; otherwise
    # retry on the rows whose label is present.
    xgb_features, xgb_labels = X_train, y_train
    if not set(y_train.unique()) <= binary_labels:
        xgb_features = X_train[~y_train.isna()]
        xgb_labels = y_train.dropna()

    if set(xgb_labels.unique()) <= binary_labels:
        began = time.time()
        xgb_model = XGBClassifier(**XGB_PARAMS)
        xgb_model.fit(xgb_features, xgb_labels)
        models["XGBoost"] = xgb_model
        print(f"✅ XGBoost trained in {time.time() - began:.2f} sec")
    else:
        models["XGBoost"] = None
        print("⚠ XGBoost training skipped due to invalid labels!")

    # --- RandomForest ---
    began = time.time()
    forest = RandomForestClassifier(**RF_PARAMS)
    forest.fit(X_train, y_train)
    models["RandomForest"] = forest
    print(f"✅ RandomForest trained in {time.time() - began:.2f} sec")

    return models