nostalgebraist's picture
un-tar'd version of nostalgebraist/nostalgebraist-autoresponder-6_1b
1c11e50
raw
history blame
3.74 kB
{
"constructor_args": {
"params": {
"layer_nums": [
6
],
"n_head": 64,
"mlp_ratio": 1.5,
"attn_dropout": 0,
"res_dropout": 0,
"init_gain": 1.0,
"init_gain_logit_head": 0.02,
"classic_behavior_attn_init": true,
"proj_ratio": 1,
"use_final_mlp": true,
"n_blocks": 1,
"mlp_ratio_blocks": 4,
"n_head_blocks": 16,
"qk_dim_blocks": 4096,
"qk_dim_final": 4096,
"v_dim_final": 4096,
"rotary_blocks": false,
"rotary_dim_blocks": 32,
"init_gain_blocks": 1.0,
"use_block_out_gain": false,
"init_gain_blocks_out": 1.0,
"gain_scale_blocks_out": 1.0,
"tune_base_block_attn": true,
"tune_base_block_mlp": true,
"mlp_only_blocks": false,
"no_orth_init_in_final_mlp": false
},
"opt_params": {
"epochs": 2,
"batch_size": 16,
"base_lr": 0.00014959221354488223,
"weight_decay": 0.025,
"min_lr_frac": 0.1,
"warmup_ratio": 0.05,
"decay_ratio": 0.75,
"adam_beta1": 0.9,
"adam_beta2": 0.995,
"classic_behavior_lr_sched": false,
"block_lr": 3e-05,
"no_weight_decay_in_blocks": true
},
"params_extras": {
"use_proj": false
},
"device": "cuda:0",
"length": 2048,
"regression_target": false,
"calibrate": true,
"calibration_val_size": 0.11111111111111112,
"calibration_split_type": "tts",
"calibration_val_seed": null,
"shuffle_seed": null,
"evaluate_during_training": true,
"huber_delta": 1.0,
"flooding": false,
"flood_level": 0.0,
"cleanup_on_exception": false,
"show_running_loss": true,
"use_amp_training": true,
"use_amp_inference": true,
"pad_to_mult": 8,
"display_interval_secs": 3,
"partial_forward_type": "tfu",
"use_wandb": true,
"wandb_init_args": {
"tags": [
"autoreviewer"
],
"config": {
"acti_dropout": 0,
"adam_beta1": 0.9,
"adam_beta2": 0.995,
"attn_dropout": 0,
"base_lr": 0.00014959221354488223,
"batch_size": 16,
"calibrate": true,
"calibrate_prefixes_separately": false,
"calibration_split_type": "tts",
"calibration_val_size": 0.11111111111111112,
"classic_init": true,
"epochs": 2,
"evaluate_during_training": true,
"flood_level": 0.0,
"flooding": false,
"grad_clip": 1000.0,
"init_gain": 1.0,
"init_gain_logit_head": 0.02,
"layer_nums": [
6
],
"length": 2048,
"min_lr_frac": 0.1,
"mlp_n_layer": 1,
"mlp_ratio": 1.5,
"n_head": 64,
"orth_init": true,
"res_dropout": 0,
"resid_mlp": true,
"selector_style_attn": true,
"supervise_logits": false,
"supervise_only_logit_diff": false,
"use_mlp": true,
"use_only_logit_diff": false,
"warm_resets": false,
"warmup_ratio": 0.05,
"weight_decay": 0.025,
"classic_behavior_lr_sched": false,
"classic_behavior_attn_init": true,
"use_amp_training": true,
"pad_to_mult": 8,
"proj_ratio": 1,
"params_extras": {
"use_proj": false
},
"n_blocks": 1,
"tune_base_block_attn": true,
"tune_base_block_mlp": true,
"grad_acc_steps": 2,
"decay_ratio": 0.75,
"block_lr": 3e-05,
"no_weight_decay_in_blocks": true,
"init_gain_blocks": 1.0,
"init_gain_blocks_out": 1.0,
"gain_scale_blocks_out": 1.0,
"qk_dim_blocks": 4096,
"v_dim_final": 4096,
"rotary_dim_blocks": 32,
"rotary_blocks": false,
"use_block_out_gain": false,
"mlp_ratio_blocks": 4,
"no_orth_init_in_final_mlp": false,
"n_head_blocks": 16,
"qk_dim_final": 4096,
"use_final_mlp": true,
"mlp_only_blocks": false
}
},
"use_galileo": false,
"galileo_separate_runs_for_epochs": false,
"blocks_inference_device_attn": "cuda:0",
"blocks_inference_device_mlp": "cuda:0",
"grad_acc_steps": 2
}
}