# m2-bert-80M-32k / config.yaml
# Dan Fu
# 32K partial checkpoint
# afc7050
# raw
# history blame
# 925 Bytes
# Note that some of the fields in this template haven't been filled in yet.
# Please resolve any `null` fields before launching!
precision: amp_bf16
max_seq_len: 32768

# Tokenizer for dataset creation
tokenizer_name: bert-base-uncased

# Base model config
# NOTE(review): the nesting below was reconstructed. With every key at
# column 0, `model` and `model_config` would be null and `tokenizer_name`
# would be defined twice in one mapping. Confirm against the consumer's schema.
model:
  name: bert
  # `${tokenizer_name}` interpolates the top-level `tokenizer_name` key, so
  # both fields follow whatever tokenizer is chosen above.
  pretrained_model_name: ${tokenizer_name}
  tokenizer_name: ${tokenizer_name}
  model_config:
    num_attention_heads: 12
    num_hidden_layers: 12
    attention_probs_dropout_prob: 0.0
    # Same value as `max_seq_len` above; presumably must be changed together
    # with it (and with `long_conv_l_max`) -- TODO confirm.
    max_position_embeddings: 32768

    monarch_mixer_sequence_mixing: true
    long_conv_l_max: 32768
    # Floats are written with an explicit mantissa dot: YAML 1.1 resolvers
    # may read a dotless `1e-3` as a string rather than a float.
    long_conv_kernel_learning_rate: 1.0e-3
    hyena_lr_pos_emb: 1.0e-5
    hyena_w: 10
    hyena_wd: 0.1
    hyena_emb_dim: 5
    hyena_filter_order: 128
    hyena_training_additions: false

    bidirectional: true
    residual_long_conv: true

    use_glu_mlp: true
    use_monarch_mlp: true
    monarch_mlp_nblocks: 4
    use_positional_encodings: true