vxbrandon's picture
End of training
79d92d5 verified
raw
history blame
35.6 kB
from transformers import TrainerCallback, Trainer
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import PeftModel
from datasets import Dataset
from transformers.utils import is_sagemaker_mp_enabled, is_sagemaker_dp_enabled
from typing import Any, Dict, Union, Optional, Tuple
from torch.nn import MSELoss
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
import time
import os
import copy
from transformers.models.mistral.modeling_mistral import (
MistralMLP,
MistralAttention,
MistralModel,
MistralDecoderLayer,
MistralConfig,
MISTRAL_ATTENTION_CLASSES,
MistralRMSNorm,
MistralForCausalLM,
)
from experiments.models.sparse_mistral.svd_router import (
low_rank_approximation,
SparsePredictor,
)
from utils.utils import (
print_size_of_model,
is_running_deepspeed,
is_mainprocess,
get_datetime,
ds_print,
)
class SparseSFTTTrainer(SFTTrainer):
def __init__(self, *args, **kwargs):
self.regularization_coefficient = kwargs.pop("regularization_coefficient", 10)
self.use_sparse_regularization = kwargs.pop("use_sparse_regularization", False)
self.use_spm_loss = False
self.freeze_original_weights = False
self.regularization_type = kwargs.pop(
"regularization_type", "L1 positive activation"
)
assert self.regularization_type in [
"L2 activation",
"L1 positive activation",
], f"Invalid regularization type: {self.regularization_type}"
self.sparse_layers = []
self.sparse_decoder_layers = []
super(SparseSFTTTrainer, self).__init__(*args, **kwargs)
def initialize_sparse_silu_layers(self, model):
self.sparse_layers = [
m for m in model.modules() if isinstance(m, MistralSparseSiluMLP)
]
def initialize_sparse_decoder_layers(self, model):
self.sparse_decoder_layers = [
m for m in model.modules() if isinstance(m, SparseMistralDecoderLayer)
]
def training_step(
self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]
) -> torch.Tensor:
"""
Override the huggingface's training_step function to add a regularization term.
A regularization term is computed with intermediate values, which are freed after "backward()."
You need to set `retain_graph=True` inside `backward` function to keep the values.
"""
model.train()
inputs = self._prepare_inputs(inputs)
with self.compute_loss_context_manager():
loss = self.compute_loss(model, inputs)
if self.args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu parallel training
if not self.freeze_original_weights:
if loss is not None:
self.accelerator.backward(loss, retain_graph=False)
if self.use_sparse_regularization:
regularization_loss = self.compute_regularization(model)
if self.args.n_gpu > 1:
regularization_loss = regularization_loss.mean()
if regularization_loss is not None:
self.accelerator.backward(regularization_loss, retain_graph=True)
loss += regularization_loss
if self.use_spm_loss:
spm_loss = self.compute_spm_loss(model)
if self.args.n_gpu > 1:
spm_loss = spm_loss.mean()
if spm_loss is not None:
self.accelerator.backward(spm_loss, retain_graph=False)
loss += spm_loss
return loss.detach() / self.args.gradient_accumulation_steps
def compute_regularization(self, model):
"""
Compute a sparse regularization loss for SiLU
"""
loss = 0
if len(self.sparse_layers) == 0:
self.initialize_sparse_silu_layers(model)
num_layers = len(self.sparse_layers)
for module in self.sparse_layers:
if module.activation_norm is not None:
loss += module.activation_norm
loss /= num_layers
loss *= self.regularization_coefficient
if self.state.global_step % 20 == 0 and loss != 0:
print("Negative relularizer loss: ", loss.item())
return loss
def compute_spm_loss(self, model):
loss = 0
if len(self.sparse_decoder_layers) == 0:
self.initialize_sparse_decoder_layers(model)
for module in self.sparse_decoder_layers:
if module.distill_loss != None:
loss += module.distill_loss
if self.state.global_step % 20 == 0 and loss != 0:
print("Sparse Predictor Distillation loss: ", loss.item())
return loss
# def compute_loss(self, model, inputs, return_outputs=False):
# loss = super().compute_loss(model, inputs, return_outputs)
#
# if is_sagemaker_mp_enabled():
# import smdistributed.modelparallel.torch as smp
# @smp.step()
# def smp_forward_backward(model, inputs, gradient_accumulation_steps=1):
# outputs = model(**inputs)
# loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
# loss /= gradient_accumulation_steps
# model.backward(loss)
# return loss
#
# loss_mb = smp_forward_backward(
# model, inputs, self.args.gradient_accumulation_steps
# )
# if self.use_sparse_regularization:
# return loss_mb.reduce_mean().detach().to(
# self.args.device
# ) + self.regularization_coefficient * self.compute_regularization(model)
# else:
# return loss_mb.reduce_mean().detach().to(self)
#
# if return_outputs:
# classification_loss, outputs = loss
# else:
# classification_loss = loss
#
# loss = classification_loss
# if self.use_sparse_regularization:
# regularization_loss = self.compute_regularization(model)
# loss += self.regularization_coefficient * regularization_loss
#
# return (loss, outputs) if return_outputs else loss
class SparseTrainer(Trainer):
def __init__(self, *args, **kwargs):
self.regularization_coefficient = kwargs.pop("regularization_coefficient", 10)
self.use_sparse_regularization = kwargs.pop("use_sparse_regularization", False)
self.use_spm_loss = False
self.freeze_original_weights = False
self.regularization_type = kwargs.pop(
"regularization_type", "L1 positive activation"
)
assert self.regularization_type in [
"L2 activation",
"L1 positive activation",
], f"Invalid regularization type: {self.regularization_type}"
self.sparse_layers = []
self.sparse_decoder_layers = []
super(SparseTrainer, self).__init__(*args, **kwargs)
def initialize_sparse_silu_layers(self, model):
self.sparse_layers = [
m for m in model.modules() if isinstance(m, MistralSparseSiluMLP)
]
def initialize_sparse_decoder_layers(self, model):
self.sparse_decoder_layers = [
m for m in model.modules() if isinstance(m, SparseMistralDecoderLayer)
]
def training_step(
self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]
) -> torch.Tensor:
"""
Override the huggingface's training_step function to add a regularization term.
A regularization term is computed with intermediate values, which are freed after "backward()."
You need to set `retain_graph=True` inside `backward` function to keep the values.
"""
model.train()
inputs = self._prepare_inputs(inputs)
with self.compute_loss_context_manager():
loss = self.compute_loss(model, inputs)
if self.args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu parallel training
if not self.freeze_original_weights:
if loss is not None:
self.accelerator.backward(loss, retain_graph=False)
if self.use_sparse_regularization:
regularization_loss = self.compute_regularization(model)
if self.args.n_gpu > 1:
regularization_loss = regularization_loss.mean()
if regularization_loss is not None:
self.accelerator.backward(regularization_loss, retain_graph=True)
loss += regularization_loss
if self.use_spm_loss:
spm_loss = self.compute_spm_loss(model)
if self.args.n_gpu > 1:
spm_loss = spm_loss.mean()
if spm_loss is not None:
self.accelerator.backward(spm_loss, retain_graph=False)
loss += spm_loss
return loss.detach() / self.args.gradient_accumulation_steps
def compute_regularization(self, model):
"""
Compute a sparse regularization loss for SiLU
"""
loss = 0
if len(self.sparse_layers) == 0:
self.initialize_sparse_silu_layers(model)
num_layers = len(self.sparse_layers)
for module in self.sparse_layers:
if module.activation_norm is not None:
loss += module.activation_norm
loss /= num_layers
loss *= self.regularization_coefficient
if self.state.global_step % 20 == 0 and loss != 0:
print("Negative relularizer loss: ", loss.item())
return loss
def compute_spm_loss(self, model):
loss = 0
if len(self.sparse_decoder_layers) == 0:
self.initialize_sparse_decoder_layers(model)
for module in self.sparse_decoder_layers:
if module.distill_loss != None:
loss += module.distill_loss
if self.state.global_step % 20 == 0 and loss != 0:
print("Sparse Predictor Distillation loss: ", loss.item())
return loss
class SparseSiLU(nn.SiLU):
def __init__(self, threshold):
super(SparseSiLU, self).__init__()
self.threshold = threshold
self.m = nn.Threshold(self.threshold, 0)
def set_new_threshold(self, threshold):
self.threshold = threshold
self.m = nn.Threshold(threshold, 0)
def forward(self, x):
act = super(SparseSiLU, self).forward(x)
return self.m(act) - self.m(-act)
class MistralSparseSiluMLP(MistralMLP):
def __init__(self, config, *args, **kwargs):
super().__init__(config)
self.swish_outputs = None
self.relu = nn.ReLU()
self.kill_sparse_swish_outputs = False
self.dead_percentage = 0
self.is_stats = False
self.visit_counts = 0
# Hyperparameters to tune
self.dead_threshold = kwargs.pop("dead_threshold", 0)
self.use_sparse_regularization = kwargs.pop("use_sparse_regularization", True)
self.regularization_type = kwargs.pop(
"regularization_type", "L1 regularization"
)
self.regularization_threshold = kwargs.pop("regularization_threshold", 0.5)
self.use_relu = kwargs.pop("use_relu", False)
self.activation_norm = None
# Activation Histograms
self.is_collect_histogram = False
num_bins = 1000
self.histogram_bins = torch.linspace(-1, 1, num_bins - 2)
self.histogram_bins = torch.cat(
[torch.tensor([-torch.inf]), self.histogram_bins, torch.tensor([torch.inf])]
)
self.pre_act_hist_counts = torch.zeros(num_bins - 1)
self.post_act_hist_counts = torch.zeros(num_bins - 1)
self.t = 0
self.count = 0
self.agg_sparsity = 0
# Sparse activation function
self.sparse_act_fn = SparseSiLU(threshold=self.dead_threshold)
def activate_stats(self, is_collect_histogram: bool = True):
self.is_stats = True
self.dead_percentage = 0
self.visit_counts = 0
self.is_collect_histogram = is_collect_histogram
self.histogram_counts = torch.zeros(2000) # .to(self.down_proj.weight.device)
def deactivate_stats(self):
self.is_stats = False
def collect_stats(self, pre_activation, post_activation):
start_time = time.time()
pre_activation = pre_activation.float().cpu().detach()
post_activation = post_activation.float().cpu().detach()
# self.histogram_bins=self.histogram_bins.to(pre_activation.device).type(pre_activation.dtype)
self.pre_act_hist_counts += torch.histogram(
pre_activation, bins=self.histogram_bins
)[0]
self.post_act_hist_counts += torch.histogram(
torch.abs(post_activation), bins=self.histogram_bins
)[0]
self.t += time.time() - start_time
if self.visit_counts % 30 == 0:
print(f"Time taken to collect stats: {self.t}s.")
def forward(
self,
x,
sp_mask: torch.tensor = None,
):
"""
If kill_sparse_swish_outputs is set to False, this layer functions exactly like a normal MLP layer.
"""
if sp_mask != None: # When sparse mask is given
return self.down_proj(
self.sparse_act_fn(self.gate_proj(x) * sp_mask) * self.up_proj(x)
) # Todo: This doesn't accelerate runtime (instead slowing down)
elif self.use_relu:
post_act = self.relu(self.gate_proj(x))
self.count += 1
if self.count <= 1:
print("USING RELU!!!!")
if self.is_stats:
dead_neurons = post_act == 0
dead_percentage = dead_neurons.float().mean()
agg_sparsity = dead_neurons.all(dim=0).float().mean()
self.dead_percentage = (
self.dead_percentage * self.visit_counts + dead_percentage
) / (self.visit_counts + 1)
self.agg_sparsity = (
self.agg_sparsity * self.visit_counts + agg_sparsity
) / (self.visit_counts + 1)
self.visit_counts += 1
return self.down_proj(post_act * self.up_proj(x))
else:
self.count += 1
if self.count <= 1:
print("USING SparseSILU!!!!")
pre_act = self.gate_proj(x)
post_act = self.act_fn(pre_act)
if self.kill_sparse_swish_outputs:
dead_neurons = post_act.abs() <= self.dead_threshold
# print("pre act sparsity: ", (pre_act==0).float().mean())
dead_percentage = dead_neurons.float().mean()
agg_sparsity = dead_neurons.all(dim=0).float().mean()
if self.is_stats:
self.dead_percentage = (
self.dead_percentage * self.visit_counts + dead_percentage
) / (self.visit_counts + 1)
self.agg_sparsity = (
self.agg_sparsity * self.visit_counts + agg_sparsity
) / (self.visit_counts + 1)
self.visit_counts += 1
self.a = dead_percentage
# print(self.agg_sparsity)
# Collect histogram stats
if (
self.is_collect_histogram
and pre_act.eq(0).float().mean() < 0.99
): # Padded dataset
self.collect_stats(pre_act, post_act)
post_act[dead_neurons] = 0
out = self.down_proj(post_act * self.up_proj(x))
if self.use_sparse_regularization:
if self.regularization_type == "L1 regularization":
self.activation_norm = torch.abs(post_act)[
post_act < self.regularization_threshold
].mean()
elif self.regularization_type == "L2 regularization":
self.activation_norm = torch.sqrt(
torch.square(post_act)[post_act < self.regularization_threshold]
).mean()
return out
class SparseMistralDecoderLayer(MistralDecoderLayer):
def __init__(
self,
config: MistralConfig,
layer_idx: int,
decoder_layer: MistralDecoderLayer,
init_svd: bool = True,
*args,
**kwargs,
):
assert isinstance(
decoder_layer.mlp, MistralSparseSiluMLP
), f"{type(decoder_layer.mlp)} should MistralSparseSiluMLP."
super().__init__(config, layer_idx)
self.hidden_size = config.hidden_size
self.intermediate_size = config.intermediate_size
self.init_svd = init_svd
self.self_attn = decoder_layer.self_attn
self.mlp = decoder_layer.mlp
self.input_layernorm = decoder_layer.input_layernorm
self.post_attention_layernorm = decoder_layer.post_attention_layernorm
# Sparse predictor for mlp (initialized with SVD decomposed matrix)
self.low_rank = kwargs.pop("low_rank", 64)
self.sparse_act_func = decoder_layer.mlp.sparse_act_fn
print(
f"Setting {layer_idx}th mlp layer's sparse predictor... svd init: {init_svd}"
)
self.sp_mlp = low_rank_approximation(
decoder_layer.mlp.gate_proj,
act_func=self.sparse_act_func,
init_svd=init_svd,
)
self.use_async = kwargs.pop("use_async", False)
self.use_sparse_predictor = False
self.distill_loss = None
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
**kwargs,
) -> Tuple[
torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
]:
print("hidden_states shape: ", hidden_states.shape)
if "padding_mask" in kwargs:
warnings.warn(
"Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
)
residual = hidden_states
sp_mask = None
if self.use_async:
sp_mask = self.sp_mlp(hidden_states)
hidden_states = self.input_layernorm(hidden_states)
# Self Attention
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
)
hidden_states = residual + hidden_states
# Fully Connected
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
if not self.use_async:
sp_mask = self.sp_mlp(hidden_states)
# Compute distillation loss
gating_output = self.mlp.sparse_act_fn(self.mlp.gate_proj(hidden_states))
loss_func = MSELoss()
self.distill_loss = loss_func(sp_mask, gating_output)
# Convert sp mask into binary form
sp_mask = sp_mask > 0
if self.training:
sp_mask = None
# if not self.use_sparse_predictor:
# sp_mask = None
hidden_states = self.mlp(hidden_states, sp_mask)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights,)
if use_cache:
outputs += (present_key_value,)
return outputs
class SparseMistralConfig(MistralConfig):
model_type = "sparse_mistral"
def __init__(self, **kwargs):
super().__init__(**kwargs)
class SparseMistralforCausalLM(MistralForCausalLM):
config_class = SparseMistralConfig
def __init__(self, config):
super().__init__(config)
self.config = config
if config.use_sparse_model:
self.apply_sparse_mlp()
if config.thresholds is not None:
for idx, m in enumerate(self.model.layers):
if isinstance(m.mlp, MistralSparseSiluMLP):
m.mlp.dead_threshold = config.thresholds[idx]
m.mlp.sparse_act_fn.set_new_threshold(m.mlp.dead_threshold)
m.mlp.kill_sparse_swish_outputs = True
m.mlp.use_relu = config.use_relu
if config.use_sparse_predictor:
self.apply_sparse_predictor(init_svd=config.init_svd)
def apply_sparse_mlp(self):
apply_mistral_sparse_silu_mlp(
self,
config=self.config,
use_sparse_regularization=self.config.use_sparse_regularization,
)
def apply_sparse_predictor(self, init_svd: bool = True):
apply_mistral_sparse_decoder_layer(self, config=self.config, init_svd=init_svd)
class GracefulRegularizationScheduler(TrainerCallback):
def __init__(
self,
num_warmup_steps=40,
is_enabled: bool = False,
model_name: str = "mistral",
test_dataset: Dataset = None,
targeted_sparsity: float = 0.5,
keep_regularization_with_kill: bool = False,
):
"""Scheduler for regularizing the model first before applying the dead threshold.
:param num_warmup_steps: number of training steps required to reach the dead threshold, defaults to 40
:param increment_ratio: by how much to increase the dead threshold.
For example, 0.5 means "increase the threshold by 0.5 * desired threshold
"""
self.num_warmup_steps = num_warmup_steps
self.is_enabled = is_enabled
self.model_name = model_name
self.test_dataset = test_dataset
self.targeted_sparsity = targeted_sparsity
self.keep_regularization_with_kill = keep_regularization_with_kill
self.act_hist_path = (
f"/matx/u/vxbrando/histograms/warm_up_reg_{targeted_sparsity}/act_hist.pt"
)
if self.is_enabled:
print("GracefulRegularizationScheduler is enabled.")
self.trainer = None
def set_trainer(self, trainer):
self.trainer = trainer
def on_step_end(self, args, state, control, **kwargs):
if not self.is_enabled:
return
model = kwargs["model"]
if isinstance(model, PeftModel):
base_model = model.get_base_model()
else:
base_model = model
if state.global_step == 1:
ds_print("Setting an initial reg threshold to 0.1")
set_regularization_threshold(base_model, 0.1)
# if state.global_step >= self.num_warmup_steps and state.global_step % 50 == 0:
if state.global_step == self.num_warmup_steps:
activate_stats(base_model)
enable_sparse_silu(base_model)
self.trainer.evaluate()
save_act_hist(base_model, self.act_hist_path)
set_sparse_threshold(base_model, self.targeted_sparsity, True)
deactivate_stats(base_model)
self.trainer.use_sparse_regularization = self.keep_regularization_with_kill
# set_layer_specific_regularization(model.get_base_model())
print_dead_neuron_stats(model.get_base_model())
if state.global_step % 2000 == 0:
if is_mainprocess():
ds_print(
f"Saving to /scr/lukeai/{self.model_name}_{state.global_step}.pt",
)
torch.save(
model.state_dict(),
f"/scr/lukeai/{self.model_name}_{state.global_step}.pt",
)
class GradualSparsificationScheduler(TrainerCallback):
def __init__(
self,
num_warmup_steps=40,
increment_ratio=0.5,
is_enabled: bool = False,
model_name: str = "mistral",
):
"""Scheduler for gradually increasing a dead threshold until it reaches the desired threshold.
:param num_warmup_steps: number of training steps required to reach the dead threshold, defaults to 40
:param increment_ratio: by how much to increase the dead threshold.
For example, 0.5 means "increase the threshold by 0.5 * desired threshold
"""
self.num_warmup_steps = num_warmup_steps
self.increment_ratio = increment_ratio
self.step_size = int(num_warmup_steps * increment_ratio)
self.is_enabled = is_enabled
self.model_name = model_name
def on_step_end(self, args, state, control, **kwargs):
model = kwargs["model"]
if not self.is_enabled:
if state.global_step <= 10:
for module in model.modules():
if isinstance(module, MistralSparseSiluMLP):
module.current_dead_threshold = module.dead_threshold
return
current_dead_threshold = 0
desired_dead_threshold = 0
if is_mainprocess():
ds_print(state.global_step)
if state.global_step % self.step_size == 2:
for module in model.modules():
if isinstance(module, MistralSparseSiluMLP):
desired_dead_threshold = copy.deepcopy(module.dead_threshold)
current_dead_threshold = module.current_dead_threshold
current_dead_threshold += (
self.increment_ratio * desired_dead_threshold
)
module.current_dead_threshold = min(
desired_dead_threshold, current_dead_threshold
)
if is_running_deepspeed and is_mainprocess():
ds_print(
state.global_step,
current_dead_threshold,
desired_dead_threshold,
)
if state.global_step % 2000 == 0:
if is_running_deepspeed and is_mainprocess():
ds_print(
f"Saving to /matx/u/lukeai/{self.model_name}_{state.global_step - 2}.pt",
)
torch.save(
model.state_dict(),
f"/matx/u/lukeai/{self.model_name}_{state.global_step - 2}.pt",
)
def get_sparse_mistral_config(
config: MistralConfig,
use_sparse_model=False,
use_sparse_predictor=False,
use_sparse_regularization=False,
thresholds=None,
):
new_config = SparseMistralConfig()
new_config.__dict__.update(config.__dict__)
config = new_config
config.use_sparse_model = use_sparse_model
config.use_sparse_predictor = use_sparse_predictor
config.use_sparse_regularization = use_sparse_regularization
config.thresholds = thresholds
return config
def apply_mistral_sparse_silu_mlp(
model,
config,
use_sparse_regularization: bool = False,
):
# counts = 0
for layer in model.model.layers:
# counts += 1
# if counts < 4:
# continue
original_mlp = layer.mlp
new_mlp = MistralSparseSiluMLP(
config, use_sparse_regularization=use_sparse_regularization
)
new_mlp.gate_proj = original_mlp.gate_proj
new_mlp.up_proj = original_mlp.up_proj
new_mlp.down_proj = original_mlp.down_proj
layer.mlp = new_mlp
def apply_mistral_sparse_decoder_layer(
model,
config,
init_svd: bool = True,
):
assert isinstance(model.model, MistralModel), "model.model must be a MistralModel."
new_layers = []
for layer_idx, layer in enumerate(model.model.layers):
if isinstance(layer.mlp, MistralSparseSiluMLP):
new_layers.append(
SparseMistralDecoderLayer(
config=config,
layer_idx=layer_idx,
decoder_layer=layer,
init_svd=init_svd,
)
)
print(f"{layer_idx}th mlp layer activation: {layer.mlp.sparse_act_fn}")
else:
new_layers.append(layer)
model.model.layers = nn.ModuleList(new_layers)
def enable_sparse_predictor(
model,
):
for layer_idx, layer in enumerate(model.model.layers):
if isinstance(layer, MistralDecoderLayer):
layer.use_sparse_predictor = True
def disable_sparse_predictor(
model,
):
for layer_idx, layer in enumerate(model.model.layers):
if isinstance(layer, MistralDecoderLayer):
layer.use_sparse_predictor = False
def activate_stats(model, is_collect_histogram: bool = True):
for layer in model.model.layers:
if isinstance(layer.mlp, MistralSparseSiluMLP):
layer.mlp.activate_stats(is_collect_histogram=is_collect_histogram)
def deactivate_stats(model):
for layer in model.model.layers:
if isinstance(layer.mlp, MistralSparseSiluMLP):
layer.mlp.deactivate_stats()
def enable_sparse_silu(model):
print("Enabling SparseSilu")
for i, layer in enumerate(model.model.layers):
if isinstance(layer.mlp, MistralSparseSiluMLP):
layer.mlp.kill_sparse_swish_outputs = True
def print_dead_neuron_stats(model):
total_sparsity = 0
counts = 0
for i, layer in enumerate(model.model.layers):
if isinstance(layer.mlp, MistralSparseSiluMLP):
dead_percentage = layer.mlp.dead_percentage * 100
agg_sparsity = layer.mlp.agg_sparsity * 100
print(f"layer {i} sparsity: {dead_percentage:.3f}%")
print(f"layer {i} agg sparsity: {agg_sparsity:.3f}%")
total_sparsity += dead_percentage
counts += 1
print(f"Total sparsity: {total_sparsity/counts: .3f}%")
return total_sparsity / counts
def get_sparse_layers(model: MistralModel):
sparse_layers = [
m.mlp for m in model.layers() if isinstance(m.mlp, MistralSparseSiluMLP)
]
return sparse_layers
def get_threshold(
bin_edges: torch.tensor, histogram_counts: torch.tensor, sparsity_level: float
): # Only for L1 Regularization
assert (
len(bin_edges.shape) == len(histogram_counts.shape) == 1
), "bin_edges and histogram are expected to be 1-dimensional."
histogram_counts /= histogram_counts.sum()
threshold_idx = torch.searchsorted(
histogram_counts.cumsum(0), sparsity_level, side="right"
)
return bin_edges[threshold_idx]
def set_regularization_threshold(model, threshold: float = 0.1):
for i, layer in enumerate(model.model.layers):
if (
isinstance(layer.mlp, MistralSparseSiluMLP) and layer.mlp.is_stats
): # Can set the threshold only the relevant statistics is collected.
layer.mlp.regularization_threshold = threshold # TODO: find better param
def set_sparse_threshold(model, sparsity_level: float, use_relu: bool = False):
for i, layer in enumerate(model.model.layers):
if (
isinstance(layer.mlp, MistralSparseSiluMLP) and layer.mlp.is_stats
): # Can set the threshold only the relevant statistics is collected.
if use_relu:
layer.mlp.sparse_act_fn = nn.ReLU()
layer.mlp.use_relu = True
else:
layer.mlp.dead_threshold = get_threshold(
layer.mlp.histogram_bins,
layer.mlp.post_act_hist_counts,
sparsity_level,
)
layer.mlp.sparse_act_fn.set_new_threshold(layer.mlp.dead_threshold)
layer.mlp.regularization_threshold = (
layer.mlp.dead_threshold * 1.2
) # TODO: find better param
def plot_histogram(
bin_edges,
histogram_counts: torch.tensor,
title: str = "Activation Distribution",
fig_dir: str = "figures",
):
plt.bar(
bin_edges[:-1], histogram_counts, width=np.diff(bin_edges), edgecolor="black"
)
plt.title(title)
plt.xlabel("Activation Value")
plt.ylabel("Frequency")
os.makedirs(fig_dir, exist_ok=True)
plt.savefig(f"{fig_dir}/{title}.png")
# plt.show()
plt.clf()
def plot_act(model, fig_dir: str = "figures"):
for i, layer in enumerate(model.model.layers):
if (
isinstance(layer.mlp, MistralSparseSiluMLP) and layer.mlp.is_stats
): # Can set the threshold only the relevant statistics is collected.
plot_title = f"Layer: {i} Pre-Activation Distribution"
plot_histogram(
layer.mlp.histogram_bins, layer.mlp.pre_act_hist_counts, plot_title
)
plot_title = f"Layer: {i} Post-Activation Absolute Distribution"
plot_histogram(
layer.mlp.histogram_bins, layer.mlp.post_act_hist_counts, plot_title
)
def save_act_hist(
model, filename="/scr/jay/models/mistral/pre_finetune/cola_act_hist.pt"
):
os.makedirs(os.path.dirname(filename), exist_ok=True)
act_dict = {}
for i, layer in enumerate(model.model.layers):
if (
isinstance(layer.mlp, MistralSparseSiluMLP) and layer.mlp.is_stats
): # Can set the threshold only the relevant statistics is collected.
act_dict[i] = (
layer.mlp.histogram_bins,
layer.mlp.pre_act_hist_counts,
layer.mlp.post_act_hist_counts,
)
print("Saving activation histograms...\n\n\n")
torch.save(act_dict, filename)
def load_act_hist(
model, filename="/scr/jay/models/mistral/pre_finetune/cola_act_hist.pt"
):
assert os.path.exists(
filename
), f"{filename} does not exist when loading pre/post-activation histogram of SparseMistralSiluMLP."
print("Loading activation histograms...\n\n\n")
act_dict = torch.load(filename)
for i, layer in enumerate(model.model.layers):
if (
isinstance(layer.mlp, MistralSparseSiluMLP) and layer.mlp.is_stats
): # Can set the threshold only the relevant statistics is collected.
(
layer.mlp.histogram_bins,
layer.mlp.pre_act_hist_counts,
layer.mlp.post_act_hist_counts,
) = act_dict[i]
def enable_last_k_modules(model, start_module_idx: int):
assert 32 > start_module_idx >= 0
new_modules = []
new_idx = 0
for idx in range(start_module_idx, len(model.model.original_layers)):
module = model.model.original_layers[idx]
module.layer_idx = new_idx
module.self_attn.layer_idx = new_idx
new_modules.append(module)
new_idx += 1
print(module.layer_idx)
model.model.layers = nn.ModuleList(new_modules)
def enable_first_k_modules(model, end_module_idx: int):
assert 32 > end_module_idx >= 0
new_modules = []
new_idx = 0
for idx in range(0, end_module_idx + 1):
module = model.model.original_layers[idx]
module.layer_idx = new_idx
module.self_attn.layer_idx = new_idx
new_modules.append(module)
new_idx += 1
print(module.layer_idx)
model.model.layers = nn.ModuleList(new_modules)