{ "architectures": [ "MegatronGPTForCausalLM" ], "attention_dropout": 0.0, "auto_map": { "AutoConfig": "configuration_megatron_gpt.MegatronGPTConfig", "AutoModelForCausalLM": "modeling_megatron_gpt.MegatronGPTForCausalLM" }, "bias": true, "bos_token_id": 2, "classifier_dropout": 0.0, "eos_token_id": 2, "hidden_act": "gelu", "hidden_dropout": 0.0, "hidden_size": 4096, "initializer_range": 0.01, "intermediate_size": 10880, "layer_norm_eps": 1e-05, "max_position_embeddings": 2048, "model_type": "megatron_gpt", "normalization": "layernorm1p", "normalize_attention_scores": true, "num_attention_heads": 32, "num_hidden_layers": 32, "pad_token_id": 0, "rope_scaling": null, "rotary_emb_base": 10000, "rotary_pct": 0.5, "self_attention_relative_position_bias": true, "tie_word_embeddings": false, "tokenizer_class": "GPT2Tokenizer", "torch_dtype": "float16", "transformers_version": "4.31.0", "use_cache": true, "vocab_size": 56064 }