"""Streamlit app comparing a GAM and a neural network on housing prices.

The page lets the user pick one of two pre-trained model pairs (California
Housing or HUD Housing), enter feature values in the sidebar, and view the
predictions together with GAM partial-dependence plots and a LIME explanation
of the neural network.
"""

import pickle
import zipfile

import lime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
import streamlit.components.v1 as components
import torch
import torch.nn as nn
import torch.optim as optim
from lime import lime_tabular
from pygam import GammaGAM, LinearGAM, f, s
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    mean_squared_error,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from statsmodels.genmod.families import Gamma
from statsmodels.genmod.families.links import Log
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.tools import add_constant
from streamlit_option_menu import option_menu

# Sidebar labels for the HUD model inputs, in the order the inputs are
# assembled into the feature vector.
_HUD_TERMS = [
    'City/Suburban Status',
    'Census Region',
    'Area median income (average)',
    '# of bedrooms in unit',
    'Age of the house (years)',
    '# of rooms in unit',
    '# of Persons in Household',
    'Monthly utility cost',
]

# Numeric codes fed to the HUD models for the two categorical inputs.
_REGION_CODES = {
    'Northeast': 1,
    'Midwest': 2,
    'South': 3,
    'West': 4,
}
_METRO_CODES = {
    'Central cities of metropolitan areas': 1,
    'Inside metropolitan area, but not in central city': 2,
    'Inside metropolitan area, but not in central city - rural': 3,
    'Outside metropolitan areas, urbanized': 4,
    'Outside metropolitan areas, rural': 5,
}

# Labels for the California model inputs, in feature-vector order.
_CALIFORNIA_TERMS = [
    'Median Income',
    'House Age (years)',
    'Average Rooms',
    'Average Bedrooms',
    'Population (average of census block group per county)',
    'Average Occupancy',
    'Latitude',
    'Longitude',
]

_MODELS_OVERVIEW = """
## _Models_
### California Housing Dataset:
> This dataset is more concentrated and is far less sparse as the dataset includes a smaller population from 1990 and only in California
### HUD Housing Dataset:
> This dataset is far more sparse and is calculated over the entire country with data from all the different regions of the country
"""


class HousePriceModel(nn.Module):
    """Feed-forward regressor: input_size -> 128 -> 64 -> 32 -> 1.

    Hidden layers use LeakyReLU with a negative slope of 0.2. The layers live
    in ``self.model`` so that saved ``state_dict`` keys keep the ``model.*``
    prefix.
    """

    def __init__(self, input_size):
        """
        :param input_size: int
            Number of input features.
        """
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(128, 64),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(64, 32),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(32, 1),
        )

    def forward(self, x):
        """Run ``x`` through the layer stack and return the raw output."""
        return self.model(x)


def lime_component(X, input_data, dnn_model, gam_model, terms, y_min, y_max):
    """
    Generate and display a LIME (Local Interpretable Model-agnostic
    Explanations) component for the neural network.

    Renders the local linear surrogate as a markdown formula followed by the
    LIME bar chart.

    :param X: pandas.DataFrame
        The feature matrix used to train the explainer.
    :param input_data: list or numpy.array
        The input data point to explain.
    :param dnn_model: torch.nn.Module
        The trained neural network model.
    :param gam_model: object
        The trained Generalized Additive Model (unused here; kept so all
        components share one signature).
    :param terms: list
        List of feature names.
    :param y_min: float
        Minimum y-axis value (unused here).
    :param y_max: float
        Maximum y-axis value (unused here).
    :return: None
    """

    def nn_prediction(batch):
        # LIME passes a 2-D array of perturbed samples and expects numpy back.
        dnn_model.eval()
        with torch.no_grad():
            output = dnn_model(torch.FloatTensor(batch))
        return output.cpu().numpy()

    kernel_width = 3
    explainer = lime.lime_tabular.LimeTabularExplainer(
        X.values,
        mode='regression',
        feature_names=terms,
        kernel_width=kernel_width,
    )
    exp = explainer.explain_instance(
        np.array(input_data),
        nn_prediction,
        num_features=len(input_data),
    )

    intercept = exp.intercept[0]
    pieces = [f"> _**y =** {intercept:.4f}"]
    # as_list() is ordered by importance, not by feature index, so the label
    # must come from the returned description rather than from terms[position].
    for feature, importance in exp.as_list():
        sign = '+' if importance > 0 else '-'
        pieces.append(f" {sign} {abs(importance):.4f} * **{feature}**")
    pieces.append('_')
    st.markdown("".join(pieces))

    lime_graph = exp.as_pyplot_figure()
    st.pyplot(lime_graph)
    # Streamlit reruns the script on every interaction; release the figure so
    # open figures do not pile up.
    plt.close(lime_graph)


def spline_component(X, input_data, dnn_model, gam_model, terms, y_min, y_max):
    """
    Plot the GAM partial-dependence (spline) function of every term.

    One figure is drawn per entry of ``terms`` and placed alternately in two
    Streamlit columns. Each figure marks the user's input value and the
    partial dependence interpolated at that value.

    :param X: pandas.DataFrame
        The feature matrix used to train the explainer (unused here).
    :param input_data: list or numpy.array
        The input data point to explain.
    :param dnn_model: torch.nn.Module
        The trained neural network model (unused here).
    :param gam_model: object
        The trained Generalized Additive Model.
    :param terms: list
        List of feature names.
    :param y_min: float
        Minimum y-axis value
    :param y_max: float
        Maximum y-axis value
    :return: None
    """
    columns = st.columns(2)

    # NOTE(review): assumes GAM term i is fitted on feature column i - confirm
    # against the training script.
    for i, term in enumerate(terms):
        grid = gam_model.generate_X_grid(term=i)
        pdep, confi = gam_model.partial_dependence(term=i, X=grid, width=0.95)
        grid_x = grid[:, i]

        fig, ax = plt.subplots(figsize=(6, 6))

        # Plot partial dependence and its confidence band. Only the first
        # band line is labelled so the legend shows a single entry.
        ax.plot(grid_x, pdep, label='Partial Dependence')
        band_lines = ax.plot(grid_x, confi, c='r', ls='--')
        if band_lines:
            band_lines[0].set_label('Confidence Interval')

        # Find y-value corresponding to user input
        user_x = input_data[i]
        user_y = float(np.interp(user_x, grid_x, pdep))

        # Draw the guides in data coordinates. axvline/axhline take axes
        # fractions, which do not line up with the data range once the
        # default x-margins are applied.
        ax.vlines(user_x, y_min, user_y, colors='b', linestyles='--',
                  label='Model Input')
        ax.hlines(user_y, grid_x[0], user_x, colors='b', linestyles='--')
        ax.annotate(f'{user_y:.2f}', (user_x, user_y),
                    textcoords="offset points", xytext=(0, 20), ha='center')

        ax.set_title(f'{term}')
        ax.set_xlabel(term)
        ax.set_ylabel('Partial Dependence')
        ax.legend(loc='best', fontsize='x-small')

        # Set consistent y-axis limits
        ax.set_ylim(y_min, y_max)

        plt.tight_layout(pad=1.5)

        # Alternate between columns
        with columns[i % 2]:
            st.write(f"Spline function: {term}:")
            st.pyplot(fig)

        plt.close(fig)


def prediction_component(X, input_data, dnn_model, gam_model, terms, y_min, y_max, scale=1):
    """
    Generates the main component for the spline functions and LIME based upon
    the GAM and the neural network.

    Shows both model predictions (multiplied by ``scale``), then the spline
    plots, then the LIME explanation.

    :param X: pandas.DataFrame
        The feature matrix used to train the explainer.
    :param input_data: list or numpy.array
        The input data point to explain.
    :param dnn_model: torch.nn.Module
        The trained neural network model.
    :param gam_model: object
        The trained Generalized Additive Model
    :param terms: list
        List of feature names.
    :param y_min: float
        Minimum y-axis value
    :param y_max: float
        Maximum y-axis value
    :param scale: int
        The scale of the model's dataset's price
    :return: None
    """
    input_array = np.array(input_data).reshape(1, -1)

    # Inference only: skip autograd and turn the result into a plain float.
    dnn_model.eval()
    with torch.no_grad():
        dnn_prediction = dnn_model(torch.FloatTensor(input_array))[0][0].item()
    gam_prediction = gam_model.predict(input_array)[0]

    st.markdown("# Predictions")
    st.write(f"- _General Additive Model Prediction:_ ${gam_prediction*scale:.2f}")
    st.write(f"- _Neural Network Prediction:_ ${dnn_prediction*scale:.2f}")
    st.markdown("----------------")

    st.markdown("## Spline Functions")
    spline_component(X, input_data, dnn_model, gam_model, terms, y_min, y_max)
    st.markdown("----------------")

    st.markdown("## LIME Explanation for Neural Network")
    lime_component(X, input_data, dnn_model, gam_model, terms, y_min, y_max)


def _load_models(dnn_path, gam_path, input_size):
    """Load the neural-network weights and the pickled GAM from disk."""
    dnn_model = HousePriceModel(input_size)
    # map_location keeps loading working on CPU-only hosts even if the
    # weights were saved from a GPU.
    dnn_model.load_state_dict(torch.load(dnn_path, map_location='cpu'))
    dnn_model.eval()

    # NOTE: unpickling executes arbitrary code; only ship trusted model files.
    with open(gam_path, 'rb') as model_file:
        gam_model = pickle.load(model_file)
    return dnn_model, gam_model


def _render_results(X, input_data, dnn_model, gam_model, terms,
                    gam_mse, dnn_mse, y_min, y_max, scale=1):
    """Show the stored model losses followed by the prediction component."""
    st.markdown("# Model Loss")
    st.write(f"- _Mean Squared Error (Generalized Additive Model):_ {gam_mse:.3f}")
    st.write(f"- _Mean Squared Error (Neural Network Model):_ {dnn_mse:.3f}")
    st.markdown("----------------")
    prediction_component(X, input_data, dnn_model, gam_model, terms,
                         y_min=y_min, y_max=y_max, scale=scale)


def _show_hud_model():
    """Collect HUD inputs from the sidebar and render results on Predict."""
    terms = _HUD_TERMS
    dnn_model, gam_model = _load_models(
        'models/dnn_model_hud.pth', 'models/gam_model_hud.pkl', len(terms))
    # Losses are hard-coded here rather than recomputed.
    dnn_mse = 77202161664.0000
    gam_mse = 57308274833.9465

    input_data = [None] * len(terms)
    with st.sidebar:
        st.title("Model Inputs")
        metro = st.selectbox(f"{terms[0]}", list(_METRO_CODES))
        input_data[0] = _METRO_CODES[metro]
        # Labelled with terms[1]; the original reused terms[0] here.
        region = st.selectbox(f"{terms[1]}", list(_REGION_CODES))
        input_data[1] = _REGION_CODES[region]
        for idx, default in ((2, 84200), (3, 4), (4, 9), (5, 8), (6, 3), (7, 300)):
            input_data[idx] = st.number_input(f"{terms[idx]}", value=default)

    if st.button("Predict"):
        # Load the LIME background data only when it is needed.
        # NOTE(review): assumes the non-VALUE columns are ordered like terms.
        df = pd.read_csv('data/hud_dataset.csv', index_col=False)
        X = df.drop(columns=['VALUE'])
        _render_results(X, input_data, dnn_model, gam_model, terms,
                        gam_mse, dnn_mse, y_min=-500000, y_max=500000)


def _show_california_model():
    """Collect California inputs from the sidebar and render results on Predict."""
    terms = _CALIFORNIA_TERMS
    dnn_model, gam_model = _load_models(
        'models/dnn_model_california.pth', 'models/gam_model_california.pkl',
        len(terms))
    # Losses are hard-coded here rather than recomputed.
    dnn_mse = 0.9678
    gam_mse = 0.3081

    counties = pd.read_csv('data/california_counties.csv')
    distinct_counties = list(counties['County'].unique())

    input_data = [None] * len(terms)
    with st.sidebar:
        st.title("Model Inputs")
        # Labelled "County"; the original reused the Median Income label.
        county = st.selectbox("County", distinct_counties)
        county_vals = counties[counties['County'] == county].values[0]
        # NOTE(review): columns are read by position (2, 3, 4) and 442 looks
        # like a block-groups-per-county divisor - confirm against the CSV.
        population = county_vals[4] / 442
        lat = county_vals[2]
        long = county_vals[3]

        # The sidebar takes a raw amount that is divided by 10000 for the model.
        input_data[0] = st.number_input(f"{terms[0]}", value=84200) / 10000
        input_data[1] = st.number_input(f"{terms[1]}", value=4)
        input_data[2] = st.number_input(f"{terms[2]}", value=9)
        input_data[3] = st.number_input(f"{terms[3]}", value=8)
        input_data[4] = population
        input_data[5] = st.number_input(f"{terms[5]}", value=3)
        input_data[6] = lat
        input_data[7] = long

    if st.button("Predict"):
        # Fetch the LIME background data only when it is needed.
        housing = fetch_california_housing()
        X = pd.DataFrame(housing.data, columns=housing.feature_names)
        _render_results(X, input_data, dnn_model, gam_model, terms,
                        gam_mse, dnn_mse, y_min=-10, y_max=10, scale=100000)


def show():
    """Render the home page: dataset overview, model picker and predictions."""
    st.markdown("# _Explainable AI for Housing Costs_")
    st.markdown(_MODELS_OVERVIEW)
    st.markdown("----------------")
    model_versions = ['California Housing Dataset', 'HUD Housing Dataset']
    model_version = st.selectbox("Select model version", model_versions)
    st.markdown("----------------")
    st.write('Generate model predictions:')

    if model_version == 'HUD Housing Dataset':
        _show_hud_model()
    else:
        _show_california_model()


def main():
    """Configure the page, draw the top menu and dispatch to the chosen page."""
    st.set_page_config(page_title="Generalized Additive Models", page_icon="🚀")

    page = option_menu(
        menu_title=None,
        options=["Home", "About"],
        icons=["house", "book"],
        menu_icon="cast",
        default_index=0,
        orientation="horizontal",
    )

    if page == "Home":
        show()
    elif page == "About":
        st.switch_page("pages/about.py")

    st.info("This app is for explaining the problem domain using Generalized Additive Models")


if __name__ == '__main__':
    main()