|
{ |
|
"constructor_args": { |
|
"params": { |
|
"layer_nums": [ |
|
6 |
|
], |
|
"n_head": 64, |
|
"mlp_ratio": 1.5, |
|
"attn_dropout": 0, |
|
"res_dropout": 0, |
|
"init_gain": 1.0, |
|
"init_gain_logit_head": 0.02, |
|
"classic_behavior_attn_init": true, |
|
"proj_ratio": 1, |
|
"use_final_mlp": true, |
|
"n_blocks": 1, |
|
"mlp_ratio_blocks": 4, |
|
"n_head_blocks": 16, |
|
"qk_dim_blocks": 4096, |
|
"qk_dim_final": 4096, |
|
"v_dim_final": 4096, |
|
"rotary_blocks": false, |
|
"rotary_dim_blocks": 32, |
|
"init_gain_blocks": 1.0, |
|
"use_block_out_gain": false, |
|
"init_gain_blocks_out": 1.0, |
|
"gain_scale_blocks_out": 1.0, |
|
"tune_base_block_attn": true, |
|
"tune_base_block_mlp": true, |
|
"mlp_only_blocks": false, |
|
"no_orth_init_in_final_mlp": false |
|
}, |
|
"opt_params": { |
|
"epochs": 2, |
|
"batch_size": 16, |
|
"base_lr": 0.00014959221354488223, |
|
"weight_decay": 0.025, |
|
"min_lr_frac": 0.1, |
|
"warmup_ratio": 0.05, |
|
"decay_ratio": 0.75, |
|
"adam_beta1": 0.9, |
|
"adam_beta2": 0.995, |
|
"classic_behavior_lr_sched": false, |
|
"block_lr": 3e-05, |
|
"no_weight_decay_in_blocks": true |
|
}, |
|
"params_extras": { |
|
"use_proj": false |
|
}, |
|
"device": "cuda:0", |
|
"length": 2048, |
|
"regression_target": false, |
|
"calibrate": true, |
|
"calibration_val_size": 0.11111111111111112, |
|
"calibration_split_type": "tts", |
|
"calibration_val_seed": null, |
|
"shuffle_seed": null, |
|
"evaluate_during_training": true, |
|
"huber_delta": 1.0, |
|
"flooding": false, |
|
"flood_level": 0.0, |
|
"cleanup_on_exception": false, |
|
"show_running_loss": true, |
|
"use_amp_training": true, |
|
"use_amp_inference": true, |
|
"pad_to_mult": 8, |
|
"display_interval_secs": 3, |
|
"partial_forward_type": "tfu", |
|
"use_wandb": true, |
|
"wandb_init_args": { |
|
"tags": [ |
|
"autoreviewer" |
|
], |
|
"config": { |
|
"acti_dropout": 0, |
|
"adam_beta1": 0.9, |
|
"adam_beta2": 0.995, |
|
"attn_dropout": 0, |
|
"base_lr": 0.00014959221354488223, |
|
"batch_size": 16, |
|
"calibrate": true, |
|
"calibrate_prefixes_separately": false, |
|
"calibration_split_type": "tts", |
|
"calibration_val_size": 0.11111111111111112, |
|
"classic_init": true, |
|
"epochs": 2, |
|
"evaluate_during_training": true, |
|
"flood_level": 0.0, |
|
"flooding": false, |
|
"grad_clip": 1000.0, |
|
"init_gain": 1.0, |
|
"init_gain_logit_head": 0.02, |
|
"layer_nums": [ |
|
6 |
|
], |
|
"length": 2048, |
|
"min_lr_frac": 0.1, |
|
"mlp_n_layer": 1, |
|
"mlp_ratio": 1.5, |
|
"n_head": 64, |
|
"orth_init": true, |
|
"res_dropout": 0, |
|
"resid_mlp": true, |
|
"selector_style_attn": true, |
|
"supervise_logits": false, |
|
"supervise_only_logit_diff": false, |
|
"use_mlp": true, |
|
"use_only_logit_diff": false, |
|
"warm_resets": false, |
|
"warmup_ratio": 0.05, |
|
"weight_decay": 0.025, |
|
"classic_behavior_lr_sched": false, |
|
"classic_behavior_attn_init": true, |
|
"use_amp_training": true, |
|
"pad_to_mult": 8, |
|
"proj_ratio": 1, |
|
"params_extras": { |
|
"use_proj": false |
|
}, |
|
"n_blocks": 1, |
|
"tune_base_block_attn": true, |
|
"tune_base_block_mlp": true, |
|
"grad_acc_steps": 2, |
|
"decay_ratio": 0.75, |
|
"block_lr": 3e-05, |
|
"no_weight_decay_in_blocks": true, |
|
"init_gain_blocks": 1.0, |
|
"init_gain_blocks_out": 1.0, |
|
"gain_scale_blocks_out": 1.0, |
|
"qk_dim_blocks": 4096, |
|
"v_dim_final": 4096, |
|
"rotary_dim_blocks": 32, |
|
"rotary_blocks": false, |
|
"use_block_out_gain": false, |
|
"mlp_ratio_blocks": 4, |
|
"no_orth_init_in_final_mlp": false, |
|
"n_head_blocks": 16, |
|
"qk_dim_final": 4096, |
|
"use_final_mlp": true, |
|
"mlp_only_blocks": false |
|
} |
|
}, |
|
"use_galileo": false, |
|
"galileo_separate_runs_for_epochs": false, |
|
"blocks_inference_device_attn": "cuda:0", |
|
"blocks_inference_device_mlp": "cuda:0", |
|
"grad_acc_steps": 2 |
|
} |
|
} |