"""Streamlit dashboard that predicts bank-account ownership with a saved XGBoost model.

The script loads a pre-trained classifier plus its preprocessing objects from
joblib files, lets the user enter one record in the sidebar for a single
prediction, and evaluates the model on a random sample drawn from
'micro_world_139countries.csv' (predictions, confusion matrix, feature
importances).
"""
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
import joblib
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier

# Page configuration (must run before any other Streamlit command)
st.set_page_config(page_title="Bank Account Prediction Dashboard", page_icon="💳")
st.title('Bank Account Prediction Dashboard')

# Feature columns in the order they are handed to the scaler and the model.
FEATURE_COLUMNS = [
    'remittances', 'educ', 'age', 'female', 'mobileowner', 'internetaccess',
    'pay_utilities', 'receive_transfers', 'receive_pension', 'economy',
    'regionwb',
]
TARGET_COLUMN = 'account'

# Columns that are cast straight to int (True/False -> 1/0).
BOOLEAN_COLUMNS = [
    'mobileowner', 'internetaccess', 'pay_utilities', 'receive_transfers',
    'receive_pension',
]

# Manual ordinal/binary encodings; unmapped values become -1 in process_data.
EDUC_MAPPING = {'None': 0, 'Primary': 1, 'Secondary': 2, 'Tertiary': 3}
GENDER_MAPPING = {'Male': 0, 'Female': 1}


# Load model and preprocessing objects
@st.cache_resource
def load_model_objects():
    """Load the classifier and its fitted preprocessing objects from disk.

    Cached with ``st.cache_resource`` so the joblib files are read once per
    server process instead of on every script rerun.

    Returns:
        Tuple ``(model_xgb, scaler, encoder_y, le_country_economy,
        le_regionwb)`` loaded from 'xgb_clf.joblib', 'scaler.joblib',
        'encoder.joblib', 'country_encoder.joblib' and
        'regionwb_encoder.joblib' respectively.
    """
    # NOTE(review): joblib.load unpickles arbitrary objects; only load
    # artifacts that come from a trusted source.
    model_xgb = joblib.load('xgb_clf.joblib')
    scaler = joblib.load('scaler.joblib')
    encoder_y = joblib.load('encoder.joblib')  # For target variable
    le_country_economy = joblib.load('country_encoder.joblib')
    le_regionwb = joblib.load('regionwb_encoder.joblib')
    return model_xgb, scaler, encoder_y, le_country_economy, le_regionwb


model_xgb, _scaler, _label_encoder, le_country_economy, le_regionwb = load_model_objects()


@st.cache_data
def load_data():
    """Read 'micro_world_139countries.csv' (ISO-8859-1 encoded) into a DataFrame."""
    # Load the actual data from the CSV file
    return pd.read_csv(
        'micro_world_139countries.csv',
        encoding='ISO-8859-1'
    )


@st.cache_data
def process_data(df, _scaler, _label_encoder, _country_encoder, _regionwb_encoder,
                 sample_size=5000):
    """Turn raw records into scaled model features and an encoded target.

    Args:
        df: DataFrame containing every column in ``FEATURE_COLUMNS`` plus
            'account'.
        _scaler: Fitted scaler; its ``transform`` is applied to the features.
        _label_encoder: Fitted encoder whose ``transform`` encodes 'account'.
        _country_encoder: Fitted encoder whose ``transform`` encodes 'economy'.
        _regionwb_encoder: Fitted encoder whose ``transform`` encodes
            'regionwb'.
        sample_size: Number of rows drawn with replacement (fixed seed 42)
            before processing. Pass ``None`` to process every row of ``df``
            unchanged, e.g. for a single user-entered record.

    Returns:
        Tuple ``(X_scaled, y)``: a DataFrame of scaled features with the
        original feature column names, and the encoded target values.

    Raises:
        ValueError: If no rows remain after dropping rows with missing values.

    The underscore-prefixed parameters are excluded from Streamlit's cache
    key; only ``df`` and ``sample_size`` are hashed.
    """
    columns = FEATURE_COLUMNS + [TARGET_COLUMN]

    # Select relevant columns and (optionally) sample
    if sample_size is None:
        # Copy so the assignments below never write into the caller's frame.
        sample_df = df[columns].copy()
    else:
        sample_df = df[columns].sample(
            n=sample_size, random_state=42, replace=True)

    # Drop rows with missing values in any used column
    sample_df = sample_df.dropna(subset=columns)
    if sample_df.empty:
        raise ValueError("No rows left after dropping missing values")

    # Encode 'economy' and 'regionwb' using the loaded encoders
    sample_df['economy'] = _country_encoder.transform(sample_df['economy'])
    sample_df['regionwb'] = _regionwb_encoder.transform(sample_df['regionwb'])

    # Manual encoding for 'educ' and 'female'; unknown labels map to -1
    sample_df['educ'] = sample_df['educ'].map(EDUC_MAPPING).fillna(-1).astype(int)
    sample_df['female'] = sample_df['female'].map(GENDER_MAPPING).fillna(-1).astype(int)

    # Convert boolean columns to integers
    for col in BOOLEAN_COLUMNS:
        sample_df[col] = sample_df[col].astype(int)

    # Separate features and target
    X = sample_df.drop(TARGET_COLUMN, axis=1)
    y = sample_df[TARGET_COLUMN]

    # Encode target variable
    y = _label_encoder.transform(y)

    # Scale features using the loaded scaler
    X_scaled = _scaler.transform(X)
    X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

    return X_scaled, y


# Load data
df = load_data()
df = df.drop('inc_q', axis=1, errors='ignore')  # Ensure 'inc_q' is dropped if it exists

# Adding a sidebar for user input
with st.sidebar:
    st.title("Input User Data for Prediction")
    with st.form("user_inputs"):
        remittances = st.number_input('Remittances', min_value=0, max_value=100000, step=100)
        educ = st.selectbox('Education Level', options=['None', 'Primary', 'Secondary', 'Tertiary'])
        age = st.number_input('Age', min_value=18, max_value=100, step=1)
        female = st.selectbox('Gender', options=['Male', 'Female'])
        mobileowner = st.radio('Owns a Mobile', options=[True, False])
        internetaccess = st.radio('Has Internet Access', options=[True, False])
        pay_utilities = st.radio('Pays Utilities Online', options=[True, False])
        receive_transfers = st.radio('Receives Transfers', options=[True, False])
        receive_pension = st.radio('Receives Pension', options=[True, False])
        economy = st.selectbox('Country', options=list(le_country_economy.classes_))
        regionwb = st.selectbox('Region', options=list(le_regionwb.classes_))
        # Placeholder target so the record has the same columns as the
        # dataset; its encoded value is discarded after processing.
        account = 1
        submit_button = st.form_submit_button("Predict")

# Processing user input for prediction
if submit_button:
    user_data = pd.DataFrame({
        'remittances': [remittances],
        'educ': [educ],
        'age': [age],
        'female': [female],
        'mobileowner': [mobileowner],
        'internetaccess': [internetaccess],
        'pay_utilities': [pay_utilities],
        'receive_transfers': [receive_transfers],
        'receive_pension': [receive_pension],
        'economy': [economy],
        'regionwb': [regionwb],
        'account': [account],
    })

    try:
        # sample_size=None: score the single record as-is instead of
        # resampling it into thousands of identical rows.
        processed_user_data, _ = process_data(
            user_data, _scaler, _label_encoder, le_country_economy, le_regionwb,
            sample_size=None,
        )
        prediction = model_xgb.predict(processed_user_data)
    except Exception as e:
        # UI boundary: surface any preprocessing/prediction failure to the user.
        st.sidebar.error(f"Error in processing data: {e}")
    else:
        result = 'Has Bank Account' if prediction[0] == 1 else 'Does Not Have Bank Account'
        st.sidebar.write(f'Prediction: {result}')

# Process the main dataset once; the result feeds both the preview table and
# the evaluation below.
processed_data, y_main = process_data(
    df, _scaler, _label_encoder, le_country_economy, le_regionwb)

# Display the processed data in the Streamlit app
st.write("Scaled Data:", processed_data)

# Main prediction logic
X = processed_data  # 'account' has been dropped in process_data
y = y_main

# Make predictions
predictions = model_xgb.predict(X)

# Show predictions
st.write("Predictions:")
st.write(predictions)

# Plotting a confusion matrix
st.subheader("Confusion Matrix")
cm = confusion_matrix(y, predictions)
cm_fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
st.pyplot(cm_fig)
# Release the figure so reruns do not accumulate open matplotlib figures.
plt.close(cm_fig)

# Feature importance
if st.button('Show Feature Importances'):
    feat_importances = pd.Series(model_xgb.feature_importances_, index=X.columns)
    st.bar_chart(feat_importances)