{
"seed": 42,
"model_name_or_path": "gpt2",
"revision": "",
"hook_point": "transformer.h.0.attn",
"dataset_name_or_path": "jbrinkma/pile-100k",
"activation_size": -1,
"add_bos_token": false,
"evaluation_batches": 10,
"expansion_factor": 4,
"b_dec_init_method": "",
"use_pre_encoder_bias": true,
"tied": false,
"n_steps": -1,
"device": "cuda",
"batch_size": 32,
"ctx_length": 256,
  "lr": 0.007525,
"min_lr": 0.0,
"lr_warmup_steps": 5000,
"sparsity_coefficient": 0.003,
"evaluation_interval": 200,
"beta1": 0.9,
"beta2": 0.999,
"l1_sqrt": false,
"cos_sim_reg": false,
"cos_sim_alpha": 0.0,
"decoder_normalization": true,
"decoder_norm_smaller_than_one": false,
"l1_with_norm": false,
"sqrt_mse": false,
"dynamic_weighting": false,
"l1_warmup_steps": 2000,
"target_l0": 40,
  "n_tokens_in_feature_cache": 500000,
"use_ghost_grads": false,
"use_neuron_resampling": false,
"resampling_steps": -1,
"output_dir": "outputs",
"cache_dir": "cache",
"checkpoint_interval": 200,
"use_wandb": true,
"wandb_entity": "jannikbrinkmann",
"wandb_project": "best-sae",
"wandb_name": "GPT2_L0_lr_0.007525_20240323115650954291",
"wandb_group": "L0_L1_Sweeps"
}