|
{ |
|
"_name_or_path": "gpt2", |
|
"activation_function": "leaky_relu", |
|
"architectures": [ |
|
"GPT2LMHeadModel" |
|
], |
|
"attn_block_resid_gain": 1, |
|
"attn_block_skip_gain": 1, |
|
"attn_mat_resid_gain": 1, |
|
"attn_mat_skip_gain": 0, |
|
"attn_pdrop": 0, |
|
"bos_token_id": 0, |
|
"centre_attn": false, |
|
"centre_attn_gain": 1.0, |
|
"embd_pdrop": 0, |
|
"eos_token_id": 0, |
|
"first_layer_value_resid_gain": null, |
|
"initializer_range": 0.02, |
|
"key_init_std": null, |
|
"last_layer_proj_resid_gain": null, |
|
"layer_norm_epsilon": 1e-05, |
|
"lrelu_neg_slope": 0, |
|
"mlp_block_resid_gain": 1, |
|
"mlp_block_skip_gain": 1, |
|
"mlp_proj_init_std": false, |
|
"model_type": "gpt2", |
|
"n_ctx": 128, |
|
"n_embd": 117, |
|
"n_head": 9, |
|
"n_inner": 468, |
|
"n_layer": 12, |
|
"n_positions": 1024, |
|
"norm_position": "pre", |
|
"norm_type": "rmsnorm", |
|
"output_attentions": false, |
|
"parallel_layers": false, |
|
"proj_init_type": "normal", |
|
"proj_resid_gain": 1.0, |
|
"proj_skip_gain": null, |
|
"query_init_std": null, |
|
"reorder_and_upcast_attn": false, |
|
"resid_pdrop": 0, |
|
"scale_attn_by_inverse_layer_idx": false, |
|
"scale_attn_weights": true, |
|
"summary_activation": null, |
|
"summary_first_dropout": 0.1, |
|
"summary_proj_to_labels": true, |
|
"summary_type": "cls_index", |
|
"summary_use_proj": true, |
|
"task_specific_params": { |
|
"text-generation": { |
|
"do_sample": true, |
|
"max_length": 50 |
|
} |
|
}, |
|
"tie_valproj_init": null, |
|
"torch_dtype": "float32", |
|
"trainable_attn_block_gains": false, |
|
"trainable_attn_mat_gains": false, |
|
"trainable_mlp_block_gains": false, |
|
"trainable_proj_gains": false, |
|
"trainable_value_gains": false, |
|
"transformers_version": "4.38.1", |
|
"use_cache": true, |
|
"val_init_type": "normal", |
|
"val_proj_init_std": null, |
|
"value_resid_gain": 1, |
|
"value_skip_gain": 0, |
|
"vocab_size": 50000 |
|
} |
|
|