gpt2_vs_pythia / GPT2_L0_sparsity_coefficient_0.0012_transformer.h.0.attn_config.json
{
"seed": 42,
"model_name_or_path": "gpt2",
"revision": "",
"hook_point": "transformer.h.0.attn",
"dataset_name_or_path": "jbrinkma/pile-100k",
"activation_size": -1,
"add_bos_token": false,
"evaluation_batches": 10,
"expansion_factor": 4,
"b_dec_init_method": "",
"use_pre_encoder_bias": true,
"tied": false,
"n_steps": -1,
"device": "cuda",
"batch_size": 32,
"ctx_length": 256,
"lr": 0.001,
"min_lr": 0.0,
"lr_warmup_steps": 5000,
"sparsity_coefficient": 0.0012000000000000001,
"evaluation_interval": 200,
"beta1": 0.9,
"beta2": 0.999,
"l1_sqrt": false,
"cos_sim_reg": false,
"cos_sim_alpha": 0.0,
"decoder_normalization": true,
"decoder_norm_smaller_than_one": false,
"l1_with_norm": false,
"sqrt_mse": false,
"dynamic_weighting": false,
"l1_warmup_steps": 2000,
"target_l0": 40,
"n_tokens_in_feature_cache": 500000.0,
"use_ghost_grads": false,
"use_neuron_resampling": false,
"resampling_steps": -1,
"output_dir": "outputs",
"cache_dir": "cache",
"checkpoint_interval": 200,
"use_wandb": true,
"wandb_entity": "jannikbrinkmann",
"wandb_project": "best-sae",
"wandb_name": "GPT2_L0_sparsity_coefficient_0.0012_20240323115650953664",
"wandb_group": "L0_L1_Sweeps"
}
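
A minimal sketch of how this config could be fetched and inspected. Assumptions: the repo id `jbrinkma/gpt2_vs_pythia` matches this page, and the GPT-2 hook point `transformer.h.0.attn` emits 768-dimensional activations (so `activation_size: -1` resolves to 768 and `expansion_factor: 4` implies a 3072-feature dictionary).

```python
import json

from huggingface_hub import hf_hub_download

# Assumed repo id; pass repo_type="dataset" instead if the file lives in a dataset repo.
config_path = hf_hub_download(
    repo_id="jbrinkma/gpt2_vs_pythia",
    filename="GPT2_L0_sparsity_coefficient_0.0012_transformer.h.0.attn_config.json",
)

with open(config_path) as f:
    config = json.load(f)

# activation_size == -1 means "infer from the model"; GPT-2's residual/attention
# width is 768, so expansion_factor * 768 gives the SAE dictionary size.
activation_size = 768 if config["activation_size"] == -1 else config["activation_size"]
dictionary_size = activation_size * config["expansion_factor"]

print(f"hook point:      {config['hook_point']}")
print(f"sparsity coeff:  {config['sparsity_coefficient']}")
print(f"dictionary size: {dictionary_size}")
```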