num_layers: 36 # For perceiver models, this is the number of cross-attention blocks. Each layer has 1 cross-attention and "num_self_attention_per_cross_attention" self-attention layers.
hidden_size: 768
ffn_hidden_size: 2048 # Transformer FFN hidden size. Usually 4 * hidden_size. Since we use SwiGLU, which uses an extra projection weight matrix, we use 2/3 * 4 * hidden_size (see https://arxiv.org/abs/2002.05202).
num_attention_heads: 12
init_method_std: 0.02 # Standard deviation of the zero-mean normal distribution used for weight initialization.
hidden_dropout: 0.0 # Dropout probability for the transformer hidden state. "Dropout is set to 0 during pretraining" - UL2 paper.
attention_dropout: 0.0 # Dropout probability in the attention layer. "Dropout is set to 0 during pretraining" - UL2 paper.
ffn_dropout: 0.0 # Dropout probability in the feed-forward layer. "Dropout is set to 0 during pretraining" - UL2 paper.
position_embedding_type: 'relative' # Position embedding type. Options: ['learned_absolute', 'relative', 'alibi'].
relative_attention_num_buckets: 32 # Number of buckets used to compute the relative position bias.
relative_attention_max_distance: 128 # Maximum relative distance represented by the buckets; larger distances fall into the last bucket.
relative_position_bias_self_attention_only: True # Whether to use the relative position bias for self-attention only.
kv_channels: null # Projection weight dimension in multi-head attention. Set to hidden_size // num_attention_heads if null.
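# Derived sizes implied by the values above (a sanity check, not additional settings):
#   ffn_hidden_size = 2/3 * 4 * hidden_size = 2/3 * 4 * 768 = 2048 (SwiGLU adds a third projection matrix, so the usual 4x FFN width is scaled by 2/3)
#   kv_channels     = hidden_size // num_attention_heads = 768 // 12 = 64 per attention head (the default when kv_channels is null)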
apply_query_key_layer_scaling: True # Scale Q * K^T by 1 / layer-number.
layernorm_epsilon: 1e-5
persist_layer_norm: True # Use the persistent fused layer norm kernel.
bias_activation_fusion: False # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce.
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition.
bias: False # Whether to use bias terms in all weight matrices.
normalization: 'rmsnorm' # Normalization layer to use. Options: 'layernorm', 'rmsnorm'.
arch: 'transformer' # Options: ['transformer', 'perceiver'].
activation: 'swiglu' # Options: ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu'].
headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
transformer_block_type: 'pre_ln' # Options: ['pre_ln', 'post_ln', 'normformer'].
hidden_steps: 32 # Number of latent vectors to use for perceiver encoders.
num_self_attention_per_cross_attention: 1 # Number of self-attention layers for every cross-attention layer.
openai_gelu: False # Use OpenAI's GELU instead of the default GELU implementation.
onnx_safe: False # Use work-arounds for known problems with the Torch ONNX exporter.
fp32_residual_connection: False # Use FP32 for residual connections.
activations_checkpoint_method: null # Options: 'uniform', 'block'.
activations_checkpoint_num_layers: 1
activations_checkpoint_granularity: null # 'selective' or 'full'. Selective checkpointing: https://github.com/NVIDIA/NeMo/pull/4380
megatron_legacy: False # Whether to use the legacy Megatron model. This affects the way q, k, v are partitioned from the mixed q, k, v layer in ParallelAttention. This needs to be True for models converted from HF.
normalize_attention_scores: True # Whether to scale the attention scores Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to set this to True.
num_moe_experts: 1 # When >1, FFNs are changed to MoE layers.
moe_frequency: 1 # Every Nth FFN layer will be made MoE.
moe_dropout: 0.0 # Dropout value for MoE layers.
# https://github.com/NVIDIA/NeMo/blob/main/scripts/nlp_language_modeling/hf_t5v1_1_base_config.yaml
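# Illustrative example (not part of the referenced config): enabling mixture-of-experts
# would mean overriding the MoE knobs above along these lines, e.g.
#   num_moe_experts: 8 # each MoE FFN layer would have 8 experts
#   moe_frequency: 2   # every 2nd FFN layer would become an MoE layer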