import os, shutil |
import streamlit as st |
import pandas as pd |
import numpy as np |
import joblib |
from huggingface_hub import hf_hub_download |
from sklearn.preprocessing import LabelEncoder, StandardScaler |
from catboost import Pool |
# Hugging Face repository that hosts the trained model artifacts.
MODEL_REPO = "chagu13/is_click_predictor"

# Local directory for model files; created at import time so it always exists.
MODEL_DIR = "models"
os.makedirs(MODEL_DIR, exist_ok=True)

# Presumably the file paths inside the Hugging Face repository (the
# `filename` argument of `download_model`) -- confirm against the repo layout.
CATBOOST_MODEL_FILENAME = "models/catboost_model.pkl"
XGB_MODEL_FILENAME = "models/xgb_model.pkl"
RF_MODEL_FILENAME = "models/rf_model.pkl"

# Local paths the models are loaded from.
CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, "catboost_model.pkl")
XGB_MODEL_PATH = os.path.join(MODEL_DIR, "xgb_model.pkl")
RF_MODEL_PATH = os.path.join(MODEL_DIR, "rf_model.pkl")

CATEGORICAL_COLUMNS = ["gender", "product", "campaign_id", "webpage_id"]

# NOTE(review): the assignment target of this list was missing in the source
# (only the list items and the closing bracket survived). NUMERICAL_COLUMNS is
# the presumed name, by analogy with CATEGORICAL_COLUMNS -- confirm.
NUMERICAL_COLUMNS = [
    "age_level", "city_development_index", "user_group_id", "user_depth", "var_1",
    "click_sum_age_sex_prod", "click_count_age_sex_prod",
    "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
    "click_sum_city_age_prod", "click_count_city_age_prod",
    "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod",
]
def _add_unique_counts(df, group_keys, suffix):
    """Merge per-group distinct ``campaign_id``/``webpage_id`` counts onto ``df``."""
    counts = (
        df.groupby(group_keys)
        .agg({"campaign_id": "nunique", "webpage_id": "nunique"})
        .reset_index()
    )
    counts.columns = [
        *group_keys,
        f"unique_campaigns_{suffix}",
        f"unique_webpages_{suffix}",
    ]
    return df.merge(counts, on=group_keys, how="left")


def preprocess_input(input_df, expected_feature_order):
    """Prepare an uploaded DataFrame for model inference.

    Steps, in order:
      1. Drop the ``DateTime`` column (if present) and duplicated column names,
         then replace missing values with 0.
      2. Add the number of distinct ``campaign_id`` / ``webpage_id`` values per
         (age_level, gender, product) and per
         (city_development_index, age_level, product) group, computed from the
         input data itself.
      3. Add the ``click_sum_*`` / ``click_count_*`` columns and ``is_click``
         filled with 0 when they are absent.
      4. Label-encode the categorical columns and standardize the remaining
         feature columns; both transformers are fitted on the input itself.
      5. Add any column of ``expected_feature_order`` that is still missing
         (filled with 0), drop everything else, and order the columns.

    Args:
        input_df: DataFrame to preprocess. It is not modified.
        expected_feature_order: Column names, in the order the model expects.

    Returns:
        A new DataFrame whose columns are exactly ``expected_feature_order``.

    Raises:
        KeyError: If a grouping or categorical column (``age_level``,
            ``gender``, ``product``, ``city_development_index``,
            ``campaign_id``, ``webpage_id``) is missing from ``input_df``.
    """
    # Every step below returns a new frame, so the caller's DataFrame is never
    # mutated (the previous in-place drop/fillna modified it).
    df = input_df.drop(columns=["DateTime"], errors="ignore")
    df = df.loc[:, ~df.columns.duplicated()]
    df = df.fillna(0)

    df = _add_unique_counts(df, ["age_level", "gender", "product"], "age_sex_prod")
    df = _add_unique_counts(
        df, ["city_development_index", "age_level", "product"], "city_age_prod"
    )
    df = df.fillna(0)

    # Click aggregates cannot be derived without labels, so default them to 0.
    click_aggregate_columns = (
        "click_sum_age_sex_prod", "click_count_age_sex_prod",
        "click_sum_city_age_prod", "click_count_city_age_prod",
    )
    for col in click_aggregate_columns:
        if col not in df.columns:
            print(f"Warning: Missing column {col}. Filling with 0.")
            df[col] = 0

    if "is_click" not in df.columns:
        print("Adding `is_click` column with all values set to 0.")
        df["is_click"] = 0

    features = ["age_level", "gender", "product", "campaign_id", "webpage_id",
                "product_category_1", "product_category_2", "user_group_id",
                "user_depth", "city_development_index", "var_1",
                "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
                "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod",
                "click_sum_age_sex_prod", "click_count_age_sex_prod",
                "click_sum_city_age_prod", "click_count_city_age_prod",
                "is_click"]
    categorical_columns = ["gender", "product", "campaign_id", "webpage_id"]

    for col in categorical_columns:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    # Scale only the feature columns that are actually present. Absent ones
    # (e.g. product_category_1) used to raise KeyError here, before the
    # "missing feature" fill below had a chance to add them.
    numerical_columns = [
        col for col in features
        if col not in categorical_columns and col in df.columns
    ]
    if numerical_columns:
        df[numerical_columns] = StandardScaler().fit_transform(df[numerical_columns])

    # Align the frame with the model's feature list.
    expected_columns = list(dict.fromkeys(expected_feature_order))
    extra_features = set(df.columns) - set(expected_columns)
    for col in expected_columns:
        if col not in df.columns:
            print(f"Warning: Missing feature {col}. Filling with 0.")
            df[col] = 0
    if extra_features:
        print(f"Warning: Dropping unexpected features: {extra_features}")

    # Selecting by the expected order both drops the extras and reorders.
    return df[list(expected_feature_order)]
def download_model(filename, local_path):
    """Fetch ``filename`` from the Hugging Face repo and place it at ``local_path``.

    The file is downloaded from ``MODEL_REPO`` into ``MODEL_DIR``. When the
    path reported by ``hf_hub_download`` is not the same string as
    ``local_path``, the downloaded file is moved there.

    Returns:
        ``local_path``, unchanged.
    """
    downloaded_path = hf_hub_download(
        repo_id=MODEL_REPO,
        filename=filename,
        local_dir=MODEL_DIR,
    )
    already_in_place = downloaded_path == local_path
    if not already_in_place:
        shutil.move(downloaded_path, local_path)
    return local_path
def load_models():
    """Download any missing model files, then load all three models.

    For each of the CatBoost, XGBoost and RandomForest models, the file is
    fetched with ``download_model`` when it does not yet exist at its local
    path, and is then deserialized with ``joblib.load``.

    Returns:
        tuple: ``(catboost_model, xgb_model, rf_model)`` on success, or
        ``(None, None, None)`` if downloading or loading raised.
    """
    # (display name, file name passed to download_model, local path)
    model_specs = (
        ("CatBoost", CATBOOST_MODEL_FILENAME, CATBOOST_MODEL_PATH),
        ("XGBoost", XGB_MODEL_FILENAME, XGB_MODEL_PATH),
        ("RandomForest", RF_MODEL_FILENAME, RF_MODEL_PATH),
    )
    try:
        print("π Checking and downloading models...")
        for display_name, repo_filename, local_path in model_specs:
            if not os.path.exists(local_path):
                print(f"π Downloading {display_name} model...")
                # The download itself was missing: only the message above was
                # printed, so loading always failed on a fresh checkout.
                download_model(repo_filename, local_path)

        print("π¦ Loading models...")
        # NOTE(security): joblib.load unpickles the file and can execute
        # arbitrary code; only load artifacts from a repository you trust.
        catboost_model = joblib.load(CATBOOST_MODEL_PATH)
        xgb_model = joblib.load(XGB_MODEL_PATH)
        rf_model = joblib.load(RF_MODEL_PATH)
        print("Models loaded successfully!")
        return catboost_model, xgb_model, rf_model
    except Exception as e:
        # Deliberately broad: callers rely on the (None, None, None) result
        # to detect any download or load failure.
        print(f"β Error loading models: {e}")
        return None, None, None
# ---------------------------------------------------------------------------
# Streamlit page: upload a CSV, run CatBoost and XGBoost on it, and offer the
# binary predictions and the probabilities as downloadable CSV files.
# ---------------------------------------------------------------------------
st.title("Is_Click Predictor - ML Model Inference")
st.info("Upload a CSV file, and the trained models will predict click probability.")

catboost, xgb, rf = load_models()
if catboost is None or xgb is None:
    # load_models() returns (None, None, None) on failure; without this guard
    # the attribute access below raised AttributeError on None.
    st.error("Models could not be loaded. Check the application logs for details.")
    st.stop()

expected_feature_order = catboost.feature_names_
print("Expected Feature Order:", expected_feature_order)

uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
if uploaded_file:
    input_df = pd.read_csv(uploaded_file)
    st.success("File uploaded successfully!")
    # preprocess_input already returns the columns in expected_feature_order.
    input_df = preprocess_input(input_df, expected_feature_order)
    st.subheader("Predictions in Progress...")

    cat_features = ["gender", "product", "campaign_id", "webpage_id"]

    # --- CatBoost: categorical features are passed as strings via a Pool ---
    for col in cat_features:
        input_df[col] = input_df[col].astype(str)
    input_pool = Pool(input_df, cat_features=cat_features)
    # Score the Pool (which declares the categorical columns) rather than the
    # raw frame, so the declared cat_features are actually used.
    catboost_probs = catboost.predict_proba(input_pool)[:, 1]

    # --- XGBoost: integer-encoded categoricals, fixed training feature list ---
    xgb_training_features = [
        "age_level", "gender", "product", "campaign_id", "webpage_id",
        "product_category_1", "product_category_2", "user_group_id",
        "user_depth", "city_development_index", "var_1",
        "click_sum_age_sex_prod", "click_count_age_sex_prod",
        "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
        "click_sum_city_age_prod", "click_count_city_age_prod",
        "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
    ]
    xgb_input = input_df[xgb_training_features].copy()
    # NOTE(review): these columns hold stringified integer codes at this
    # point, and LabelEncoder orders its classes as strings ("10" < "2"), so
    # the codes can differ from the ones assigned in preprocess_input.
    # Confirm which encoding the XGBoost model was trained with.
    for col in cat_features:
        xgb_input[col] = LabelEncoder().fit_transform(xgb_input[col].astype(str))
    # Probabilities must be computed on the same feature frame the model was
    # trained on; the full CatBoost-ordered frame was passed here before.
    xgb_probs = xgb.predict_proba(xgb_input)[:, 1]

    print("π Probability Distributions Before Thresholding:")
    print("CatBoost:\n", pd.Series(catboost_probs).describe())
    print("XGBoost:\n", pd.Series(xgb_probs).describe())

    # CatBoost threshold is the 95th percentile of its own probabilities on
    # this upload; XGBoost uses a fixed 0.7 cut-off.
    THRESHOLD = np.percentile(catboost_probs, 95)
    print(f"Adjusted CatBoost Threshold: {THRESHOLD:.3f}")
    catboost_preds = (catboost_probs >= THRESHOLD).astype(int)
    xgb_preds = (xgb_probs >= 0.7).astype(int)

    print("\nPost-threshold Distribution:")
    print(f"CatBoost 1s: {np.sum(catboost_preds)} / {len(catboost_preds)}")
    print(f"XGBoost 1s: {np.sum(xgb_preds)} / {len(xgb_preds)}")

    predictions_df = pd.DataFrame({
        "CatBoost": catboost_preds,
        "XGBoost": xgb_preds
    })
    total_rows = len(predictions_df)
    if predictions_df["CatBoost"].sum() == total_rows or predictions_df["XGBoost"].sum() == total_rows:
        print("β Warning: Model is predicting only 1s! Consider adjusting thresholds.")

    # A row counts as a predicted click when either model predicts 1.
    predictions_df["is_click_predicted"] = predictions_df.max(axis=1)

    probabilities_df = pd.DataFrame({
        "CatBoost_Prob": catboost_probs,
        "XGBoost_Prob": xgb_probs,
    })

    binary_predictions_path = "binary_predictions.csv"
    filtered_predictions_path = "filtered_predictions.csv"
    probabilities_path = "model_probabilities.csv"

    predictions_df.to_csv(binary_predictions_path, index=False)
    predictions_df[predictions_df["is_click_predicted"] == 1].to_csv(filtered_predictions_path, index=False)
    probabilities_df.to_csv(probabilities_path, index=False)

    st.success("Predictions completed! Download results below.")

    with open(binary_predictions_path, "rb") as f:
        st.download_button("Download Binary Predictions (0/1)", f, file_name="binary_predictions.csv")
    with open(filtered_predictions_path, "rb") as f:
        st.download_button("Download Clicked Predictions (Only 1s)", f, file_name="filtered_predictions.csv")
    with open(probabilities_path, "rb") as f:
        st.download_button("Download Probability Predictions", f, file_name="model_probabilities.csv")