|
|
|
import os, shutil |
|
|
|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
import joblib |
|
import os |
|
from huggingface_hub import hf_hub_download |
|
from sklearn.preprocessing import LabelEncoder, StandardScaler |
|
from catboost import Pool |
|
|
|
|
|
# Hugging Face repository that is passed as `repo_id` when fetching artifacts.
MODEL_REPO = "chagu13/is_click_predictor"

# Local directory for downloaded artifacts; created as an import-time side effect.
MODEL_DIR = "models"
os.makedirs(MODEL_DIR, exist_ok=True)

# File names of the artifacts as requested from the Hugging Face repository.
CATBOOST_MODEL_FILENAME = "models/catboost_model.pkl"
XGB_MODEL_FILENAME = "models/xgb_model.pkl"
RF_MODEL_FILENAME = "models/rf_model.pkl"

# Local destinations: the artifact's base name placed directly inside MODEL_DIR.
CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, os.path.basename(CATBOOST_MODEL_FILENAME))
XGB_MODEL_PATH = os.path.join(MODEL_DIR, os.path.basename(XGB_MODEL_FILENAME))
RF_MODEL_PATH = os.path.join(MODEL_DIR, os.path.basename(RF_MODEL_FILENAME))

# Column groups; FEATURE_COLUMNS lists the categorical columns first.
CATEGORICAL_COLUMNS = ["gender", "product", "campaign_id", "webpage_id"]
NUMERICAL_COLUMNS = [
    "age_level",
    "city_development_index",
    "user_group_id",
    "user_depth",
    "var_1",
    "click_sum_age_sex_prod",
    "click_count_age_sex_prod",
    "unique_campaigns_age_sex_prod",
    "unique_webpages_age_sex_prod",
    "click_sum_city_age_prod",
    "click_count_city_age_prod",
    "unique_campaigns_city_age_prod",
    "unique_webpages_city_age_prod",
]
FEATURE_COLUMNS = [*CATEGORICAL_COLUMNS, *NUMERICAL_COLUMNS]
|
|
|
from sklearn.preprocessing import LabelEncoder, StandardScaler |
|
from catboost import Pool |
|
|
|
|
|
def preprocess_input(input_df, expected_feature_order):
    """Prepare an uploaded DataFrame for model inference.

    The caller's DataFrame is left untouched; all work happens on a copy.

    Steps, in order:
      1. Drop the ``DateTime`` column (if present) and duplicated column labels.
      2. Replace missing values with 0.
      3. Recompute the ``unique_campaigns_*`` / ``unique_webpages_*`` aggregates
         from the uploaded rows only (any same-named columns already in the
         input are discarded first so the merge cannot produce ``_x``/``_y``
         suffixed duplicates).
      4. Add the ``click_sum_*`` / ``click_count_*`` columns and ``is_click``
         as zeros when they are absent.
      5. Label-encode the categorical columns and standard-scale the remaining
         known feature columns that are present.
      6. Add any still-missing expected feature as 0, drop everything else and
         order the columns as ``expected_feature_order``.

    Args:
        input_df: DataFrame read from the uploaded CSV.
        expected_feature_order: Iterable of column names in the order the
            model expects them.

    Returns:
        A new DataFrame whose columns are exactly ``expected_feature_order``.

    Raises:
        KeyError: If a grouping column (``age_level``, ``gender``, ``product``,
            ``city_development_index``), an aggregated column (``campaign_id``,
            ``webpage_id``) or a categorical column is absent from the input.
    """
    aggregations = (
        (
            ["age_level", "gender", "product"],
            ["unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod"],
        ),
        (
            ["city_development_index", "age_level", "product"],
            ["unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"],
        ),
    )
    zero_filled_columns = [
        "click_sum_age_sex_prod", "click_count_age_sex_prod",
        "click_sum_city_age_prod", "click_count_city_age_prod",
    ]
    features = [
        "age_level", "gender", "product", "campaign_id", "webpage_id",
        "product_category_1", "product_category_2", "user_group_id",
        "user_depth", "city_development_index", "var_1",
        "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
        "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod",
        "click_sum_age_sex_prod", "click_count_age_sex_prod",
        "click_sum_city_age_prod", "click_count_city_age_prod",
        "is_click",
    ]
    categorical_columns = ["gender", "product", "campaign_id", "webpage_id"]

    # Non-inplace operations + explicit copy: never mutate the caller's frame
    # and never write into a slice of it.
    df = input_df.drop(columns=["DateTime"], errors="ignore")
    df = df.loc[:, ~df.columns.duplicated()].copy()
    df = df.fillna(0)

    # Aggregates are computed from the uploaded batch only.
    for group_keys, aggregate_names in aggregations:
        # Discard stale copies so the merge below yields the expected names.
        df = df.drop(columns=aggregate_names, errors="ignore")
        aggregate = (
            df.groupby(group_keys)
            .agg({"campaign_id": "nunique", "webpage_id": "nunique"})
            .reset_index()
        )
        aggregate.columns = group_keys + aggregate_names
        df = df.merge(aggregate, on=group_keys, how="left")
    df = df.fillna(0)

    for col in zero_filled_columns:
        if col not in df.columns:
            print(f"Warning: Missing column {col}. Filling with 0.")
            df[col] = 0

    if "is_click" not in df.columns:
        print("Adding `is_click` column with all values set to 0.")
        df["is_click"] = 0

    # NOTE(review): the encoders and the scaler are fitted on the uploaded
    # batch, not restored from training, so codes/scales only match training
    # if the training pipeline did the same on identical values — confirm.
    for col in categorical_columns:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    # Scale only the numeric features that actually exist; absent ones are
    # added as 0 further down if the model expects them.
    numerical_columns = [
        col for col in features
        if col not in categorical_columns and col in df.columns
    ]
    # StandardScaler rejects zero-row input, so skip scaling for an empty frame.
    if numerical_columns and len(df) > 0:
        df[numerical_columns] = StandardScaler().fit_transform(df[numerical_columns])

    expected = list(expected_feature_order)
    extra_features = set(df.columns) - set(expected)

    for col in expected:
        if col not in df.columns:
            print(f"Warning: Missing feature {col}. Filling with 0.")
            df[col] = 0

    if extra_features:
        print(f"Warning: Dropping unexpected features: {extra_features}")

    # Selecting by the expected list both drops the extras and fixes the order.
    return df[expected]
|
|
|
|
|
def download_model(filename, local_path):
    """Download one artifact from the Hugging Face repo and place it at ``local_path``.

    Args:
        filename: Name of the file inside ``MODEL_REPO``.
        local_path: Destination path on the local filesystem.

    Returns:
        ``local_path``, unchanged.
    """
    downloaded_path = hf_hub_download(repo_id=MODEL_REPO, filename=filename, local_dir=MODEL_DIR)

    # Compare normalised absolute paths so that the same file spelled two ways
    # (relative vs. absolute, redundant separators) is not treated as different.
    destination = os.path.abspath(local_path)
    if os.path.abspath(downloaded_path) != destination:
        # Make sure the target directory exists before moving the file into it.
        os.makedirs(os.path.dirname(destination), exist_ok=True)
        shutil.move(downloaded_path, local_path)

    return local_path
|
|
|
|
|
def load_models():
    """Download (when not already on disk) and load the three model artifacts.

    Returns:
        A ``(catboost_model, xgb_model, rf_model)`` tuple on success, or
        ``(None, None, None)`` if any download or load step raises; the error
        is printed rather than propagated, so callers must check for ``None``.
    """
    artifacts = (
        ("CatBoost", CATBOOST_MODEL_FILENAME, CATBOOST_MODEL_PATH),
        ("XGBoost", XGB_MODEL_FILENAME, XGB_MODEL_PATH),
        ("RandomForest", RF_MODEL_FILENAME, RF_MODEL_PATH),
    )

    try:
        print("Checking and downloading models...")
        for label, repo_filename, local_path in artifacts:
            if not os.path.exists(local_path):
                print(f"Downloading {label} model...")
                download_model(repo_filename, local_path)

        print("Loading models...")
        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code. Only safe while MODEL_REPO is a trusted source.
        catboost_model, xgb_model, rf_model = (
            joblib.load(local_path) for _, _, local_path in artifacts
        )

        print("Models loaded successfully!")
        return catboost_model, xgb_model, rf_model

    except Exception as e:
        # Deliberate best-effort boundary: report and signal failure via None.
        print(f"Error loading models: {e}")
        return None, None, None
|
|
|
|
|
def _csv_bytes(frame):
    """Serialise a DataFrame to UTF-8 encoded CSV bytes without the index."""
    return frame.to_csv(index=False).encode("utf-8")


# ---------------------------------------------------------------------------
# Streamlit page: upload a CSV, run CatBoost and XGBoost, offer the results.
# ---------------------------------------------------------------------------
st.title("Is_Click Predictor - ML Model Inference")
st.info("Upload a CSV file, and the trained models will predict click probability.")

# `rf` is loaded alongside the others but is not used for predictions below.
catboost, xgb, rf = load_models()

# load_models() returns None for every model when loading failed; stop with a
# visible message instead of crashing on attribute access.
if catboost is None or xgb is None:
    st.error("Models could not be loaded. Check the application logs for details.")
    st.stop()

expected_feature_order = catboost.feature_names_
print("Expected Feature Order:", expected_feature_order)

uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])

if uploaded_file is not None:
    try:
        raw_df = pd.read_csv(uploaded_file)
    except pd.errors.EmptyDataError:
        raw_df = pd.DataFrame()

    # The percentile-based threshold below is undefined for zero rows.
    if raw_df.empty:
        st.warning("The uploaded file contains no rows.")
        st.stop()

    st.success("File uploaded successfully!")

    # preprocess_input returns the columns already in expected_feature_order.
    input_df = preprocess_input(raw_df, expected_feature_order)

    st.subheader("Predictions in Progress...")

    cat_features = ["gender", "product", "campaign_id", "webpage_id"]

    # --- CatBoost -----------------------------------------------------------
    # CatBoost receives the categorical columns as strings. This is done on a
    # copy so the integer codes produced by preprocessing stay intact for
    # XGBoost (re-encoding their string form would reorder them
    # lexicographically, e.g. "10" before "2").
    catboost_input = input_df.copy()
    for col in cat_features:
        catboost_input[col] = catboost_input[col].astype(str)

    input_pool = Pool(catboost_input, cat_features=cat_features)
    catboost_probs = catboost.predict_proba(input_pool)[:, 1]

    # --- XGBoost ------------------------------------------------------------
    xgb_training_features = [
        "age_level", "gender", "product", "campaign_id", "webpage_id",
        "product_category_1", "product_category_2", "user_group_id",
        "user_depth", "city_development_index", "var_1",
        "click_sum_age_sex_prod", "click_count_age_sex_prod",
        "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
        "click_sum_city_age_prod", "click_count_city_age_prod",
        "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod",
    ]

    missing_for_xgb = [col for col in xgb_training_features if col not in input_df.columns]
    if missing_for_xgb:
        st.error(f"Cannot run XGBoost; missing features: {missing_for_xgb}")
        st.stop()

    # NOTE(review): assumes the XGBoost model was trained on exactly these
    # columns in this order — confirm against the training script.
    xgb_probs = xgb.predict_proba(input_df[xgb_training_features])[:, 1]

    # --- Thresholding -------------------------------------------------------
    print("Probability Distributions Before Thresholding:")
    print("CatBoost:\n", pd.Series(catboost_probs).describe())
    print("XGBoost:\n", pd.Series(xgb_probs).describe())

    # CatBoost: flag rows at or above the 95th percentile of this batch.
    THRESHOLD = np.percentile(catboost_probs, 95)
    print(f"Adjusted CatBoost Threshold: {THRESHOLD:.3f}")

    # XGBoost: fixed probability cut-off.
    XGB_THRESHOLD = 0.7

    catboost_preds = (catboost_probs >= THRESHOLD).astype(int)
    xgb_preds = (xgb_probs >= XGB_THRESHOLD).astype(int)

    print("\nPost-threshold Distribution:")
    print(f"CatBoost 1s: {np.sum(catboost_preds)} / {len(catboost_preds)}")
    print(f"XGBoost 1s: {np.sum(xgb_preds)} / {len(xgb_preds)}")

    predictions_df = pd.DataFrame({
        "CatBoost": catboost_preds,
        "XGBoost": xgb_preds,
    })

    row_count = len(predictions_df)
    if predictions_df["CatBoost"].sum() == row_count or predictions_df["XGBoost"].sum() == row_count:
        print("Warning: Model is predicting only 1s! Consider adjusting thresholds.")

    # A row counts as a predicted click when either model says 1.
    predictions_df["is_click_predicted"] = predictions_df[["CatBoost", "XGBoost"]].max(axis=1)

    probabilities_df = pd.DataFrame({
        "CatBoost_Prob": catboost_probs,
        "XGBoost_Prob": xgb_probs,
    })

    st.success("Predictions completed! Download results below.")

    # Serve the CSVs from memory: fixed file names in the working directory
    # would be shared (and overwritten) by every concurrent session.
    clicked_only_df = predictions_df[predictions_df["is_click_predicted"] == 1]

    st.download_button(
        "Download Binary Predictions (0/1)",
        _csv_bytes(predictions_df),
        file_name="binary_predictions.csv",
        mime="text/csv",
    )
    st.download_button(
        "Download Clicked Predictions (Only 1s)",
        _csv_bytes(clicked_only_df),
        file_name="filtered_predictions.csv",
        mime="text/csv",
    )
    st.download_button(
        "Download Probability Predictions",
        _csv_bytes(probabilities_df),
        file_name="model_probabilities.csv",
        mime="text/csv",
    )