num_layers: 36 # For perceiver models, this is the number of cross-attention blocks. Each layer has 1 cross-attention and "num_self_attention_per_cross_attention" self-attention layers.
hidden_size: 768
ffn_hidden_size: 2048 # Transformer FFN hidden size. Usually 4 * hidden_size. Since we use SwiGLU, which uses an extra projection weight matrix, we use 2/3 * 4 * hidden_size (see https://arxiv.org/abs/2002.05202)
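# Worked example (using the hidden_size set above, not part of the original config): 2/3 * 4 * 768 = 2048, which matches ffn_hidden_size.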
num_attention_heads: 12
init_method_std: 0.02 # Standard deviation of the zero-mean normal distribution used for weight initialization.
hidden_dropout: 0.0 # Dropout probability for the transformer hidden state. "Dropout is set to 0 during pretraining" - UL2 paper
attention_dropout: 0.0 # Dropout probability in the attention layer. "Dropout is set to 0 during pretraining" - UL2 paper
ffn_dropout: 0.0 # Dropout probability in the feed-forward layer. "Dropout is set to 0 during pretraining" - UL2 paper
position_embedding_type: 'relative' # Position embedding type. Options ['learned_absolute', 'relative', 'alibi']
relative_attention_num_buckets: 32 # Number of buckets for computing the relative position bias
relative_attention_max_distance: 128 # Maximum relative distance represented by the buckets; larger distances map to the last bucket.
relative_position_bias_self_attention_only: True # Whether to use the relative position bias for self-attention only.
kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
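# Worked example (using the values set above, not part of the original config): when left as null, kv_channels resolves to 768 // 12 = 64 per attention head.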
apply_query_key_layer_scaling: True # Scale Q * K^T by 1 / layer-number.
layernorm_epsilon: 1e-5
persist_layer_norm: True # Use the persistent fused layer norm kernel.
bias_activation_fusion: False # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition.
bias: False # Whether to use bias terms in all weight matrices.
normalization: 'rmsnorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm'
arch: 'transformer' # Options: ['transformer', 'perceiver']
activation: 'swiglu' # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu']
headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
hidden_steps: 32 # Number of latent vectors to use for perceiver encoders
num_self_attention_per_cross_attention: 1 # Number of self-attention layers for every cross-attention layer.
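# Illustration (hypothetical, since arch is 'transformer' here): with arch: 'perceiver', num_layers: 36 and
# num_self_attention_per_cross_attention: 1, the encoder would stack 36 cross-attention layers, each followed by
# 1 self-attention layer, operating on the 32 latent vectors given by hidden_steps.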
openai_gelu: False # Use OpenAI's GELU implementation instead of the default GELU
onnx_safe: False # Use work-arounds for known problems with the Torch ONNX exporter.
fp32_residual_connection: False # Use FP32 for residual connections.
activations_checkpoint_method: null # 'uniform', 'block'
activations_checkpoint_num_layers: 1
activations_checkpoint_granularity: null # SELECTIVE: https://github.com/NVIDIA/NeMo/pull/4380
megatron_legacy: False # Whether to use the legacy Megatron model. This affects the way q,k,v is partitioned from the mixed q,k,v layer in ParallelAttention. This needs to be True for models converted from HF.
normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to set this to True.
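# Worked example (assuming kv_channels resolves to 64 as above, not part of the original config): with this set to True,
# attention scores are scaled by 1 / sqrt(64) = 1/8.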
num_moe_experts: 1 # When >1, FFNs are changed to MoE layers
moe_frequency: 1 # Every Nth FFN layer will be made MoE
moe_dropout: 0.0 # Dropout value for MoE layers
# https://github.com/NVIDIA/NeMo/blob/main/scripts/nlp_language_modeling/hf_t5v1_1_base_config.yaml