num_layers: 36 # For perceiver models, this is the number of cross-attention blocks. Each layer has 1 cross-attention and "num_self_attention_per_cross_attention" self-attention layers.
hidden_size: 768
ffn_hidden_size: 2048 # Transformer FFN hidden size. Usually 4 * hidden_size. Since we use SwiGLU, which uses an extra projection weight matrix, we use 2/3 * 4 * hidden_size (see https://arxiv.org/abs/2002.05202)
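# Worked example (using the hidden_size set above, not part of the original config): 2/3 * 4 * 768 = 2048, which matches ffn_hidden_size.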
num_attention_heads: 12
init_method_std: 0.02 # Standard deviation of the zero-mean normal distribution used for weight initialization.
hidden_dropout: 0.0 # Dropout probability for the transformer hidden state. "Dropout is set to 0 during pretraining" - UL2 paper
attention_dropout: 0.0 # Dropout probability in the attention layer. "Dropout is set to 0 during pretraining" - UL2 paper
ffn_dropout: 0.0 # Dropout probability in the feed-forward layer. "Dropout is set to 0 during pretraining" - UL2 paper
position_embedding_type: 'relative' # Position embedding type. Options ['learned_absolute', 'relative', 'alibi']
relative_attention_num_buckets: 32 # Number of buckets for computing the relative position bias
relative_attention_max_distance: 128 # Maximum relative distance represented by the buckets; larger distances map to the last bucket.
relative_position_bias_self_attention_only: True # Whether to use the relative position bias for self-attention only.
kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
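# Worked example (using the values set above, not part of the original config): when left as null, kv_channels resolves to 768 // 12 = 64 per attention head.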
apply_query_key_layer_scaling: True # Scale Q * K^T by 1 / layer-number.
layernorm_epsilon: 1e-5
persist_layer_norm: True # Use the persistent fused layer norm kernel.
bias_activation_fusion: False # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition.
bias: False # Whether to use bias terms in all weight matrices.
normalization: 'rmsnorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm'
arch: 'transformer' # Options: ['transformer', 'perceiver']
activation: 'swiglu' # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu']
headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
hidden_steps: 32 # Number of latent vectors to use for perceiver encoders
num_self_attention_per_cross_attention: 1 # Number of self-attention layers for every cross-attention layer.
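# Illustration (hypothetical, since arch is 'transformer' here): with arch: 'perceiver', num_layers: 36 and
# num_self_attention_per_cross_attention: 1, the encoder would stack 36 cross-attention layers, each followed by
# 1 self-attention layer, operating on the 32 latent vectors given by hidden_steps.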
openai_gelu: False # Use OpenAI's GELU implementation instead of the default GELU
onnx_safe: False # Use work-arounds for known problems with the Torch ONNX exporter.
fp32_residual_connection: False # Use FP32 for residual connections.
activations_checkpoint_method: null # 'uniform', 'block'
activations_checkpoint_num_layers: 1
activations_checkpoint_granularity: null # SELECTIVE: https://github.com/NVIDIA/NeMo/pull/4380
megatron_legacy: False # Whether to use the legacy Megatron model. This affects the way q,k,v is partitioned from the mixed q,k,v layer in ParallelAttention. This needs to be True for models converted from HF.
normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to set this to True.
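# Worked example (assuming kv_channels resolves to 64 as above, not part of the original config): with this set to True,
# attention scores are scaled by 1 / sqrt(64) = 1/8.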
num_moe_experts: 1 # When >1, FFNs are changed to MoE layers
moe_frequency: 1 # Every Nth FFN layer will be made MoE
moe_dropout: 0.0 # Dropout value for MoE layers
# https://github.com/NVIDIA/NeMo/blob/main/scripts/nlp_language_modeling/hf_t5v1_1_base_config.yaml