nostalgebraist
/

nostalgebraist-autoresponder-6_1b-unpacked

Model card Files Files and versions Community

nostalgebraist-autoresponder-6_1b-unpacked / draft_autoreviewer /x11p2 /v1 /metadata.json

nostalgebraist

un-tar'd version of nostalgebraist/nostalgebraist-autoresponder-6_1b

1c11e50 over 1 year ago

raw

history blame

3.74 kB

	{
	"constructor_args": {
	"params": {
	"layer_nums": [
	6
	],
	"n_head": 64,
	"mlp_ratio": 1.5,
	"attn_dropout": 0,
	"res_dropout": 0,
	"init_gain": 1.0,
	"init_gain_logit_head": 0.02,
	"classic_behavior_attn_init": true,
	"proj_ratio": 1,
	"use_final_mlp": true,
	"n_blocks": 1,
	"mlp_ratio_blocks": 4,
	"n_head_blocks": 16,
	"qk_dim_blocks": 4096,
	"qk_dim_final": 4096,
	"v_dim_final": 4096,
	"rotary_blocks": false,
	"rotary_dim_blocks": 32,
	"init_gain_blocks": 1.0,
	"use_block_out_gain": false,
	"init_gain_blocks_out": 1.0,
	"gain_scale_blocks_out": 1.0,
	"tune_base_block_attn": true,
	"tune_base_block_mlp": true,
	"mlp_only_blocks": false,
	"no_orth_init_in_final_mlp": false
	},
	"opt_params": {
	"epochs": 2,
	"batch_size": 16,
	"base_lr": 0.00014959221354488223,
	"weight_decay": 0.025,
	"min_lr_frac": 0.1,
	"warmup_ratio": 0.05,
	"decay_ratio": 0.75,
	"adam_beta1": 0.9,
	"adam_beta2": 0.995,
	"classic_behavior_lr_sched": false,
	"block_lr": 3e-05,
	"no_weight_decay_in_blocks": true
	},
	"params_extras": {
	"use_proj": false
	},
	"device": "cuda:0",
	"length": 2048,
	"regression_target": false,
	"calibrate": true,
	"calibration_val_size": 0.11111111111111112,
	"calibration_split_type": "tts",
	"calibration_val_seed": null,
	"shuffle_seed": null,
	"evaluate_during_training": true,
	"huber_delta": 1.0,
	"flooding": false,
	"flood_level": 0.0,
	"cleanup_on_exception": false,
	"show_running_loss": true,
	"use_amp_training": true,
	"use_amp_inference": true,
	"pad_to_mult": 8,
	"display_interval_secs": 3,
	"partial_forward_type": "tfu",
	"use_wandb": true,
	"wandb_init_args": {
	"tags": [
	"autoreviewer"
	],
	"config": {
	"acti_dropout": 0,
	"adam_beta1": 0.9,
	"adam_beta2": 0.995,
	"attn_dropout": 0,
	"base_lr": 0.00014959221354488223,
	"batch_size": 16,
	"calibrate": true,
	"calibrate_prefixes_separately": false,
	"calibration_split_type": "tts",
	"calibration_val_size": 0.11111111111111112,
	"classic_init": true,
	"epochs": 2,
	"evaluate_during_training": true,
	"flood_level": 0.0,
	"flooding": false,
	"grad_clip": 1000.0,
	"init_gain": 1.0,
	"init_gain_logit_head": 0.02,
	"layer_nums": [
	6
	],
	"length": 2048,
	"min_lr_frac": 0.1,
	"mlp_n_layer": 1,
	"mlp_ratio": 1.5,
	"n_head": 64,
	"orth_init": true,
	"res_dropout": 0,
	"resid_mlp": true,
	"selector_style_attn": true,
	"supervise_logits": false,
	"supervise_only_logit_diff": false,
	"use_mlp": true,
	"use_only_logit_diff": false,
	"warm_resets": false,
	"warmup_ratio": 0.05,
	"weight_decay": 0.025,
	"classic_behavior_lr_sched": false,
	"classic_behavior_attn_init": true,
	"use_amp_training": true,
	"pad_to_mult": 8,
	"proj_ratio": 1,
	"params_extras": {
	"use_proj": false
	},
	"n_blocks": 1,
	"tune_base_block_attn": true,
	"tune_base_block_mlp": true,
	"grad_acc_steps": 2,
	"decay_ratio": 0.75,
	"block_lr": 3e-05,
	"no_weight_decay_in_blocks": true,
	"init_gain_blocks": 1.0,
	"init_gain_blocks_out": 1.0,
	"gain_scale_blocks_out": 1.0,
	"qk_dim_blocks": 4096,
	"v_dim_final": 4096,
	"rotary_dim_blocks": 32,
	"rotary_blocks": false,
	"use_block_out_gain": false,
	"mlp_ratio_blocks": 4,
	"no_orth_init_in_final_mlp": false,
	"n_head_blocks": 16,
	"qk_dim_final": 4096,
	"use_final_mlp": true,
	"mlp_only_blocks": false
	}
	},
	"use_galileo": false,
	"galileo_separate_runs_for_epochs": false,
	"blocks_inference_device_attn": "cuda:0",
	"blocks_inference_device_mlp": "cuda:0",
	"grad_acc_steps": 2
	}
	}