Spaces:
Sleeping
Sleeping
suppressPackageStartupMessages(library(dplyr)) | |
suppressPackageStartupMessages(library(fastshap)) # for fast (approximate) Shapley values | |
suppressPackageStartupMessages(library(caret)) | |
#suppressPackageStartupMessages(library(doMC)) | |
#registerDoMC(cores = 10) | |
# Prediction wrapper passed to fastshap::explain(): returns, for every row of
# `newdata`, the probability the caret model `object` assigns to class "G".
p_function_G <- function(object, newdata) {
  class_probs <- caret::predict.train(object, newdata = newdata, type = "prob")
  class_probs[, "G"]
}
# Prediction wrapper passed to fastshap::explain(): returns, for every row of
# `newdata`, the probability the caret model `object` assigns to class "GM".
p_function_GM <- function(object, newdata) {
  class_probs <- caret::predict.train(object, newdata = newdata, type = "prob")
  class_probs[, "GM"]
}
# Prediction wrapper passed to fastshap::explain(): returns, for every row of
# `newdata`, the probability the caret model `object` assigns to class "R".
p_function_R <- function(object, newdata) {
  class_probs <- caret::predict.train(object, newdata = newdata, type = "prob")
  class_probs[, "R"]
}
# Prediction wrapper passed to fastshap::explain(): returns, for every row of
# `newdata`, the probability the caret model `object` assigns to class "W".
p_function_W <- function(object, newdata) {
  class_probs <- caret::predict.train(object, newdata = newdata, type = "prob")
  class_probs[, "W"]
}
# DEPRECATED: kept for reference; calculate_shap() is the maintained variant.
#
# Computes approximate SHAP values with fastshap::explain() for the rows of
# each true class ("G", "GM", "R", "W"), using the probability of that same
# class as the model output, and stacks the per-class results.
#
# Arguments:
#   dataset  data with an `Activity` column (the true class); rows containing
#            NA are dropped. Every other column is passed to the model.
#   model    model accepted by caret::predict.train().
#   nsim     number of Monte Carlo repetitions per feature (default 10).
#
# Returns the row-bound SHAP values in class order G, GM, R, W, with an extra
# `class` column naming the explained class. The original row order is NOT
# restored.
calculate_shap_deprecated <- function(dataset, model, nsim = 10) {
  # library(doParallel)
  # registerDoParallel(8)
  trainset <- dataset %>%
    na.omit() %>%
    as.data.frame()

  # Take the labels from the NA-filtered rows so they index `trainset`
  # correctly. Reading them from the unfiltered Activity column misaligns
  # labels and rows whenever a row is dropped for an NA in another column.
  trainset_y <- trainset[["Activity"]]
  trainset <- trainset %>% select(-Activity)

  # One prediction wrapper per class; list order fixes the output order.
  class_wrappers <- list(
    G = p_function_G,
    GM = p_function_GM,
    R = p_function_R,
    W = p_function_W
  )

  # Compute fast (approximate) Shapley values using `nsim` Monte Carlo
  # repetitions, one class at a time.
  shap_by_class <- lapply(names(class_wrappers), function(class_label) {
    message(" - Calculating SHAP values for class ", class_label)
    shap_class <- fastshap::explain(
      model,
      X = trainset,
      pred_wrapper = class_wrappers[[class_label]],
      nsim = nsim,
      newdata = trainset[which(trainset_y == class_label), ]
      # adjust = TRUE
    )
    shap_class$class <- class_label
    shap_class
  })

  do.call(rbind, shap_by_class)
}
#' Calculate SHAP values for every observation, by true class
#'
#' For each of the classes `G`, `GM`, `R` and `W`, the rows of `dataset` whose
#' `Activity` equals that class are explained with `fastshap::explain()`,
#' using the model's probability for that same class as the output. The
#' per-class results are stacked and then returned in the same row order as
#' the (NA-filtered) dataset.
#'
#' The returned data frame also contains the `Anim` and `predictions` columns.
#' Notice that SHAP values are calculated with respect to the class (ground
#' truth, `Activity`) and not the prediction. The `predictions` column is only
#' kept for filtering and analysis. The function `calculate_shap_class()` can
#' be used for calculating SHAP values on predictions.
#'
#' @param dataset A dataset containing the feature columns plus `Activity`,
#'   `Anim` and `predictions`. Rows containing `NA` are dropped. The feature
#'   columns are used for permutation during SHAP calculation and, filtered by
#'   class, as the rows to explain.
#' @param model A model accepted by `caret::predict.train()`.
#' @param nsim Number of Monte Carlo simulations.
#'
#' @return A data frame holding the output of `fastshap::explain()` for every
#'   row whose `Activity` is one of `G`, `GM`, `R`, `W`, plus the columns
#'   `class` (the explained class), `Anim` and `predictions`. Classes with no
#'   rows are skipped.
#' @export
calculate_shap <- function(dataset, model, nsim = 10) {
  trainset <- dataset %>%
    na.omit() %>%
    as.data.frame()

  # Read labels and metadata from the NA-filtered rows so that they stay
  # aligned with the feature rows. Labels taken from the unfiltered Activity
  # column go out of step whenever a row is dropped for an NA elsewhere.
  trainset_y <- trainset[["Activity"]]
  anim_all <- trainset[["Anim"]]
  predictions_all <- trainset[["predictions"]]

  # Only the feature columns are shown to the model.
  features <- trainset %>% select(-Activity, -Anim, -predictions)

  # One prediction wrapper per class; list order fixes the processing order.
  class_wrappers <- list(
    G = p_function_G,
    GM = p_function_GM,
    R = p_function_R,
    W = p_function_W
  )

  # Row positions (within `features`) belonging to each class.
  rows_by_class <- lapply(
    names(class_wrappers),
    function(class_label) which(trainset_y == class_label)
  )
  names(rows_by_class) <- names(class_wrappers)

  # A class without rows has nothing to explain, and tagging a zero-row
  # result with `$class <-` would fail, so such classes are skipped.
  rows_by_class <- rows_by_class[lengths(rows_by_class) > 0]
  if (length(rows_by_class) == 0) {
    stop("`dataset` has no complete rows with Activity in G, GM, R or W",
         call. = FALSE)
  }

  # Compute fast (approximate) Shapley values using `nsim` Monte Carlo
  # repetitions, one class at a time.
  shap_by_class <- lapply(names(rows_by_class), function(class_label) {
    message(" - Calculating SHAP values for class ", class_label)
    shap_class <- fastshap::explain(
      model,
      X = features,
      pred_wrapper = class_wrappers[[class_label]],
      nsim = nsim,
      newdata = features[rows_by_class[[class_label]], , drop = FALSE]
      # adjust = TRUE
    )
    shap_class <- as.data.frame(shap_class)
    shap_class$class <- class_label
    shap_class
  })
  shap_values <- do.call(rbind, shap_by_class)

  # `id` is the original row position of every stacked row; it is used both
  # to pick the matching metadata and to restore the original order.
  id <- unlist(rows_by_class, use.names = FALSE)
  shap_values <- shap_values %>%
    tibble::add_column(
      Anim = anim_all[id],
      predictions = predictions_all[id]
    )

  shap_values[order(id), ]
}
#' Calculate SHAP values for a given PREDICTED class
#'
#' Explains every row of `new_data` with `fastshap::explain()`, using
#' `function_class` as the prediction wrapper and the feature columns of
#' `dataset` as background data.
#'
#' @param dataset the dataset used for permutation during SHAP calculation.
#'   Rows containing `NA` are dropped and the `Activity`, `predictions` and
#'   `Anim` columns are removed before use.
#' @param new_data the new data we want to calculate SHAP for. Must contain
#'   the `Activity`, `predictions` and `Anim` columns, which are removed
#'   before the model sees the data and re-attached to the result.
#' @param model the model used for explanation
#' @param nsim the number of Monte Carlo simulations
#' @param function_class a wrapper function to obtain only a particular class
#' @param class_name the name of the class. Only used in the progress
#'   message.
#'
#' @return A data frame holding the output of `fastshap::explain()` for every
#'   row of `new_data`, plus the columns `class` (the `Activity` value of each
#'   row), `Anim` and `predictions`.
#' @export
#'
#' @examples
#' \dontrun{
#' # Calculate the SHAP values for class G on new data
#' shap_values_G <- calculate_shap_class(
#'   dataset,
#'   new_data = newdata,
#'   model = goat_model,
#'   nsim = 100,
#'   function_class = p_function_G,
#'   class_name = "G")
#' }
calculate_shap_class <- function(dataset, new_data, model, nsim = 10,
                                 function_class, class_name = "G") {
  # Background data for the permutations: complete rows, features only.
  trainset <- dataset %>%
    na.omit() %>%
    as.data.frame() %>%
    select(-Activity, -predictions, -Anim)

  # Set the metadata aside, then strip it so only features reach the model.
  anim_values <- new_data[["Anim"]]
  activity_values <- new_data[["Activity"]]
  prediction_values <- new_data[["predictions"]]
  new_data_class <- new_data %>% select(-Anim, -Activity, -predictions)

  # Compute fast (approximate) Shapley values using `nsim` Monte Carlo
  # repetitions.
  message(" - Calculating SHAP values for class ", class_name)
  shap_values <- fastshap::explain(
    model,
    X = trainset,
    pred_wrapper = function_class,
    nsim = nsim,
    newdata = new_data_class
  )

  # Coerce to a plain data frame first, as calculate_shap() does, so that
  # `$<-` and add_column() behave the same whatever container explain()
  # returns.
  shap_values <- as.data.frame(shap_values)

  # NOTE(review): `class` stores each row's true Activity, not `class_name`
  # -- confirm this is what downstream consumers expect.
  shap_values$class <- activity_values

  shap_values %>%
    tibble::add_column(
      Anim = anim_values,
      predictions = prediction_values
    )
}
# Stacked bar chart of the mean absolute SHAP value of every feature, with
# one bar segment per class.
#
# Arguments:
#   shap_values  SHAP data frame with a `class` column, e.g. the result of
#                calculate_shap().
#
# Returns a ggplot object.
#
# NOTE(review): reshape2::melt() is called without `id.vars`, so the id
# columns are chosen by melt() itself; presumably `class` (and any other
# metadata column) must be non-numeric to avoid being treated as a feature --
# confirm against the data actually passed in.
shap_summary_plot <- function(shap_values) {
  mean_abs_shap <- shap_values %>%
    reshape2::melt() %>%
    group_by(class, variable) %>%
    summarise(mean = mean(abs(value))) %>%
    arrange(desc(mean))

  # ggplot2 functions are namespace-qualified because this file never
  # attaches ggplot2; this matches the ggdark:: / reshape2:: calls here.
  summary_plot <- ggplot2::ggplot(mean_abs_shap) +
    ggdark::dark_theme_classic() +
    ggplot2::geom_col(
      ggplot2::aes(
        y = variable,
        x = mean,
        group = class,
        fill = class
      ),
      position = "stack"
    ) +
    ggplot2::xlab("Mean(|Shap Value|) Average impact on model output magnitude")

  summary_plot
}
# Beeswarm (sina) plot of SHAP values per feature, one facet per class, with
# points coloured by the rescaled feature value.
#
# Arguments:
#   shap_values  SHAP data frame with a `class` column.
#   dataset      the data the SHAP values were computed from; must contain an
#                `Activity` column.
#
# Returns a ggplot object.
#
# Relies on `range01()`, which is not defined in this file.
# NOTE(review): feature values are attached with cbind(), i.e. purely by row
# position, so both melted tables must have the same rows in the same order
# -- confirm that callers guarantee this.
shap_beeswarm_plot <- function(shap_values, dataset) {
  shap_long <- shap_values %>% reshape2::melt()

  # Long-format feature values, rescaled per feature by range01() so that a
  # single colour scale can be shared across features.
  feature_long <- dataset %>%
    mutate(class = Activity) %>%
    select(-Activity) %>%
    reshape2::melt() %>%
    group_by(variable) %>%
    mutate(value_scale = range01(value))

  plot_data <- cbind(shap_long, feature_value = feature_long$value_scale)
  # filter(class == "GM") can be applied to plot_data to show a single class

  # ggplot2 functions are namespace-qualified because this file never
  # attaches ggplot2.
  beeswarm_plot <- ggplot2::ggplot(plot_data) +
    ggplot2::facet_wrap(~class) +
    # ggdark::dark_theme_bw() +
    ggplot2::theme_classic() +
    ggplot2::geom_hline(yintercept = 0, color = "red", size = 0.5) +
    ggforce::geom_sina(
      ggplot2::aes(x = variable, y = value, color = feature_value),
      size = 0.5,
      bins = 4,
      alpha = 0.9,
      shape = 15
    ) +
    # Only one colour scale can be active: the earlier yellow-to-red scale
    # was always replaced by this one, so it has been dropped.
    ggplot2::scale_colour_gradient(
      low = "skyblue",
      high = "orange",
      na.value = NA
    ) +
    ggplot2::xlab("Feature") +
    ggplot2::ylab("SHAP value") +
    ggplot2::theme(
      axis.text.x = ggplot2::element_text(angle = 45, vjust = 0.5, hjust = 1)
    )

  beeswarm_plot
}