import os
import shutil

import joblib
import numpy as np
import pandas as pd
import streamlit as st
from catboost import Pool
from huggingface_hub import hf_hub_download
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Hugging Face Model Repo
MODEL_REPO = "chagu13/is_click_predictor"
MODEL_DIR = "models"
os.makedirs(MODEL_DIR, exist_ok=True)
# Model filenames as stored inside the Hugging Face repo (note the models/ prefix)
CATBOOST_MODEL_FILENAME = "models/catboost_model.pkl"
XGB_MODEL_FILENAME = "models/xgb_model.pkl"
RF_MODEL_FILENAME = "models/rf_model.pkl"
# Local Paths
CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, "catboost_model.pkl")
XGB_MODEL_PATH = os.path.join(MODEL_DIR, "xgb_model.pkl")
RF_MODEL_PATH = os.path.join(MODEL_DIR, "rf_model.pkl")
# Define Features
CATEGORICAL_COLUMNS = ["gender", "product", "campaign_id", "webpage_id"]
NUMERICAL_COLUMNS = [
"age_level", "city_development_index", "user_group_id", "user_depth", "var_1",
"click_sum_age_sex_prod", "click_count_age_sex_prod",
"unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
"click_sum_city_age_prod", "click_count_city_age_prod",
"unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
]
FEATURE_COLUMNS = CATEGORICAL_COLUMNS + NUMERICAL_COLUMNS
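# For reference, a minimal sketch of the expected upload schema: the feature
# columns above plus the raw fields referenced later (DateTime and
# product_category_1/2). The sample row below is made up, not real data:
#
#   DateTime,gender,product,campaign_id,webpage_id,age_level,user_group_id,
#   user_depth,city_development_index,var_1,product_category_1,product_category_2
#   2017-07-02 00:00,Male,C,359520,13787,3,3,2,2,0,4,0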
def preprocess_input(input_df, expected_feature_order):
    """
    Preprocess the uploaded data for inference:
    - Removes duplicate columns
    - Computes aggregation features from the uploaded data alone
    - Label-encodes categorical variables
    - Standardizes numerical features
    - Adds an `is_click` column of zeros for schema compatibility
    - Reorders columns to match the model's expected input
    """
# Drop the DateTime column if it exists
if "DateTime" in input_df.columns:
input_df.drop(columns=["DateTime"], inplace=True)
# Remove duplicate columns
input_df = input_df.loc[:, ~input_df.columns.duplicated()]
input_df.fillna(0, inplace=True)
# Aggregate by age & gender vs product
age_sex_product_agg = input_df.groupby(["age_level", "gender", "product"]).agg({
"campaign_id": "nunique",
"webpage_id": "nunique"
}).reset_index()
    # Rename the aggregated columns
age_sex_product_agg.columns = ["age_level", "gender", "product",
"unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod"]
input_df = input_df.merge(age_sex_product_agg, on=["age_level", "gender", "product"], how="left")
# Aggregate by city, age, product
city_age_product_agg = input_df.groupby(["city_development_index", "age_level", "product"]).agg({
"campaign_id": "nunique",
"webpage_id": "nunique"
}).reset_index()
    # Rename the aggregated columns
city_age_product_agg.columns = ["city_development_index", "age_level", "product",
"unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"]
input_df = input_df.merge(city_age_product_agg, on=["city_development_index", "age_level", "product"], how="left")
input_df.fillna(0, inplace=True)
    # Ensure the expected click-aggregation columns exist; they cannot be
    # computed without training labels, so default them to 0
    missing_columns = ["click_sum_age_sex_prod", "click_count_age_sex_prod",
                       "click_sum_city_age_prod", "click_count_city_age_prod"]
    for col in missing_columns:
        if col not in input_df.columns:
            print(f"Warning: Missing column {col}. Filling with 0.")
            input_df[col] = 0
    # Add an `is_click` column of zeros so the frame matches the training schema
    if "is_click" not in input_df.columns:
        print("Adding `is_click` column with all values set to 0.")
        input_df["is_click"] = 0  # ignored at prediction time
# Feature List (Now includes `is_click`)
features = ["age_level", "gender", "product", "campaign_id", "webpage_id",
"product_category_1", "product_category_2", "user_group_id",
"user_depth", "city_development_index", "var_1",
"unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
"unique_campaigns_city_age_prod", "unique_webpages_city_age_prod",
"click_sum_age_sex_prod", "click_count_age_sex_prod",
"click_sum_city_age_prod", "click_count_city_age_prod",
"is_click"] # Included for compatibility
categorical_columns = ["gender", "product", "campaign_id", "webpage_id"]
    # ===========================
    # ENCODE CATEGORICAL FEATURES
    # ===========================
    # Note: encoders are fit on the uploaded data alone, so the integer codes
    # are not guaranteed to match those seen during training.
    label_encoders = {}
    for col in categorical_columns:
        le = LabelEncoder()
        input_df[col] = le.fit_transform(input_df[col].astype(str))
        label_encoders[col] = le  # keep encoder for reference
    # Standardize numerical features (only those actually present, to avoid a
    # KeyError when an expected column is missing from the upload)
    numerical_columns = [col for col in features
                         if col not in categorical_columns and col in input_df.columns]
    scaler = StandardScaler()
    input_df[numerical_columns] = scaler.fit_transform(input_df[numerical_columns])
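    # StandardScaler applies a per-column z-score, z = (x - mean) / std, fit on
    # the uploaded batch itself; e.g. a column [1, 2, 3] becomes approximately
    # [-1.22, 0.0, 1.22].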
# ===========================
# ENFORCE FEATURE ORDER
# ===========================
missing_features = set(expected_feature_order) - set(input_df.columns)
extra_features = set(input_df.columns) - set(expected_feature_order)
# Add missing features with default values
for col in missing_features:
print(f"Warning: Missing feature {col}. Filling with 0.")
input_df[col] = 0
# Drop unexpected features
if extra_features:
print(f"Warning: Dropping unexpected features: {extra_features}")
input_df = input_df.drop(columns=list(extra_features))
# Reorder columns to match the model's expected input
input_df = input_df[expected_feature_order]
return input_df
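# Illustrative only and never called: a minimal sketch of how preprocess_input
# is meant to be used, assuming a CatBoost-style model exposing feature_names_.
# All column values below are made up.
def _example_preprocess_usage(model):
    sample = pd.DataFrame({
        "gender": ["Male", "Female"],
        "product": ["C", "D"],
        "campaign_id": [359520, 405490],
        "webpage_id": [13787, 60305],
        "age_level": [3, 2],
        "user_group_id": [3, 8],
        "user_depth": [2, 3],
        "city_development_index": [2, 4],
        "var_1": [0, 1],
        "product_category_1": [4, 3],
        "product_category_2": [0, 0],
    })
    return preprocess_input(sample, model.feature_names_)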
def download_model(filename, local_path):
"""Download model from Hugging Face and move it to the correct location."""
temp_path = hf_hub_download(repo_id=MODEL_REPO, filename=filename, local_dir=MODEL_DIR)
# Ensure correct file placement
if temp_path != local_path:
shutil.move(temp_path, local_path)
return local_path
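# Note: with local_dir set, hf_hub_download keeps the repo-relative path, so a
# repo filename like "models/catboost_model.pkl" first lands at
# models/models/catboost_model.pkl; the shutil.move above flattens it into
# MODEL_DIR, where the loaders expect it.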
@st.cache_resource  # cache across Streamlit reruns so models are loaded once
def load_models():
    """Download (if needed) and load models from Hugging Face."""
    try:
        print("🔄 Checking and downloading models...")
        # Ensure models are downloaded and placed correctly
        if not os.path.exists(CATBOOST_MODEL_PATH):
            print("🚀 Downloading CatBoost model...")
            download_model(CATBOOST_MODEL_FILENAME, CATBOOST_MODEL_PATH)
        if not os.path.exists(XGB_MODEL_PATH):
            print("🚀 Downloading XGBoost model...")
            download_model(XGB_MODEL_FILENAME, XGB_MODEL_PATH)
        if not os.path.exists(RF_MODEL_PATH):
            print("🚀 Downloading RandomForest model...")
            download_model(RF_MODEL_FILENAME, RF_MODEL_PATH)
        print("📦 Loading models...")
        catboost_model = joblib.load(CATBOOST_MODEL_PATH)
        xgb_model = joblib.load(XGB_MODEL_PATH)
        rf_model = joblib.load(RF_MODEL_PATH)
        print("✅ Models loaded successfully!")
        return catboost_model, xgb_model, rf_model
    except Exception as e:
        print(f"❌ Error loading models: {e}")
        return None, None, None
# Streamlit UI
st.title("Is_Click Predictor - ML Model Inference")
st.info("Upload a CSV file, and the trained models will predict click probability.")
catboost, xgb, rf = load_models()
if catboost is None or xgb is None:
    st.error("Model download or loading failed. Check the logs and restart the app.")
    st.stop()
expected_feature_order = catboost.feature_names_
print("Expected Feature Order:", expected_feature_order)
# Upload File
uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
if uploaded_file:
input_df = pd.read_csv(uploaded_file)
st.success("File uploaded successfully!")
    # ✅ Compute aggregation features & preprocess
input_df = preprocess_input(input_df, expected_feature_order)
    # ✅ Make predictions
    st.subheader("Predictions in Progress...")
    # Categorical features (must match what was used during training)
    cat_features = ["gender", "product", "campaign_id", "webpage_id"]
    # CatBoost expects categorical values as strings, not floats
    for col in cat_features:
        input_df[col] = input_df[col].astype(str)
    # Ensure input_df has the column order the model expects
    input_df = input_df[expected_feature_order]
    input_pool = Pool(input_df, cat_features=cat_features)
    # Score through the Pool so the categorical columns are handled correctly;
    # binary predictions are derived from these probabilities further below
    catboost_probs = catboost.predict_proba(input_pool)[:, 1]
    # Re-encode categoricals as integers for the sklearn-style models.
    # Note: encoders are fit on the uploaded data only (training is done),
    # so the codes may not match the training-time encoding.
    label_encoders = {}
    for col in cat_features:
        le = LabelEncoder()
        le.fit(input_df[col])
        label_encoders[col] = le  # keep encoder for reference
        input_df[col] = le.transform(input_df[col])
# List of features used during training for XGBoost
xgb_training_features = [
"age_level", "gender", "product", "campaign_id", "webpage_id",
"product_category_1", "product_category_2", "user_group_id",
"user_depth", "city_development_index", "var_1",
"click_sum_age_sex_prod", "click_count_age_sex_prod",
"unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
"click_sum_city_age_prod", "click_count_city_age_prod",
"unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
]
    # Score XGBoost on exactly the features it was trained with; binary
    # predictions are derived from these probabilities further below
    xgb_probs = xgb.predict_proba(input_df[xgb_training_features])[:, 1]
    # RandomForest inference is currently disabled. To re-enable it, align
    # input_df with rf.feature_names_in_ (filling missing columns with 0 and
    # dropping extras) before calling rf.predict / rf.predict_proba.
    # rf_probs = rf.predict_proba(input_df)[:, 1]  # disabled with the block above
    # Convert probabilities into binary predictions via thresholds
    # 🔍 Debugging: inspect probability distributions before thresholding
    print("🔍 Probability Distributions Before Thresholding:")
    print("CatBoost:\n", pd.Series(catboost_probs).describe())
    print("XGBoost:\n", pd.Series(xgb_probs).describe())
    # ✅ CatBoost: derive the threshold dynamically from the probability
    # distribution (95th percentile)
    CATBOOST_THRESHOLD = np.percentile(catboost_probs, 95)
    print(f"✅ Adjusted CatBoost Threshold: {CATBOOST_THRESHOLD:.3f}")
    catboost_preds = (catboost_probs >= CATBOOST_THRESHOLD).astype(int)
    # XGBoost: keep a static threshold for comparison; adjust to control
    # false positives
    XGB_THRESHOLD = 0.7
    xgb_preds = (xgb_probs >= XGB_THRESHOLD).astype(int)
    # ✅ Debugging: count of 1s and 0s after thresholding
    print("\nPost-threshold Distribution:")
    print(f"CatBoost 1s: {np.sum(catboost_preds)} / {len(catboost_preds)}")
    print(f"XGBoost 1s: {np.sum(xgb_preds)} / {len(xgb_preds)}")
    # ✅ Assemble per-model binary predictions
    predictions_df = pd.DataFrame({
        "CatBoost": catboost_preds,
        "XGBoost": xgb_preds
    })
    # ✅ Sanity check: warn if a model predicts only positives
    if (predictions_df["CatBoost"].sum() == len(predictions_df)
            or predictions_df["XGBoost"].sum() == len(predictions_df)):
        print("⚠ Warning: Model is predicting only 1s! Consider adjusting thresholds.")
    # Ensemble rule: flag a click if at least one model predicts 1
    predictions_df["is_click_predicted"] = predictions_df.max(axis=1)
# Generate probability file
probabilities_df = pd.DataFrame({
"CatBoost_Prob": catboost_probs,
"XGBoost_Prob": xgb_probs,
# "RandomForest_Prob": rf_probs
})
# Save results
binary_predictions_path = "binary_predictions.csv"
filtered_predictions_path = "filtered_predictions.csv"
probabilities_path = "model_probabilities.csv"
predictions_df.to_csv(binary_predictions_path, index=False)
predictions_df[predictions_df["is_click_predicted"] == 1].to_csv(filtered_predictions_path, index=False)
probabilities_df.to_csv(probabilities_path, index=False)
st.success("Predictions completed! Download results below.")
# Download Buttons
with open(binary_predictions_path, "rb") as f:
st.download_button("Download Binary Predictions (0/1)", f, file_name="binary_predictions.csv")
with open(filtered_predictions_path, "rb") as f:
st.download_button("Download Clicked Predictions (Only 1s)", f, file_name="filtered_predictions.csv")
with open(probabilities_path, "rb") as f:
st.download_button("Download Probability Predictions", f, file_name="model_probabilities.csv")