precision: amp_bf16
max_seq_len: 32768

tokenizer_name: bert-base-uncased

model:
  name: bert
  pretrained_model_name: ${tokenizer_name}
  tokenizer_name: ${tokenizer_name}
  model_config:
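    # BERT-base-sized backbone: 12 layers and 12 heads, with attention
    # dropout disabled and position embeddings extended to the 32k context.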
    num_attention_heads: 12
    num_hidden_layers: 12
    attention_probs_dropout_prob: 0.0
    max_position_embeddings: 32768
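    # Sequence mixing: Monarch Mixer swaps self-attention for gated long
    # convolutions (kernels up to max_seq_len long) generated by Hyena-style
    # implicit filters; the hyena_* keys size the filter MLP and set its
    # learning rates and weight decay.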
    monarch_mixer_sequence_mixing: True
    long_conv_l_max: 32768
    long_conv_kernel_learning_rate: 1e-3
    hyena_lr_pos_emb: 1e-5
    hyena_w: 10
    hyena_wd: 0.1
    hyena_emb_dim: 5
    hyena_filter_order: 128
    hyena_training_additions: False
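    # Non-causal, BERT-style mixing: the convolutions run in both directions,
    # with an additional residual long-convolution branch alongside them.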
    bidirectional: true
    residual_long_conv: true
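    # Dimension mixing: gated (GLU) MLPs whose dense projections are replaced
    # by structured Monarch matrices factored into 4 blocks.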
    use_glu_mlp: True
    use_monarch_mlp: True
    monarch_mlp_nblocks: 4
    use_positional_encodings: True
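The ${tokenizer_name} entries use OmegaConf-style interpolation, so both model fields resolve to the top-level bert-base-uncased value at load time. Below is a minimal sketch of how such a config can be consumed, assuming it is saved as m2-bert-32k.yaml (a hypothetical filename) and loaded with OmegaConf, as MosaicML-Composer-based training scripts typically do:

# load_config.py -- minimal sketch, assuming OmegaConf (pip install omegaconf)
from omegaconf import OmegaConf

cfg = OmegaConf.load("m2-bert-32k.yaml")  # hypothetical path to the YAML above

# ${tokenizer_name} resolves against the top-level key at access time,
# so both fields come back as "bert-base-uncased".
print(cfg.model.pretrained_model_name)
print(cfg.model.tokenizer_name)

# Resolve all interpolations into a plain dict, e.g. to pass model_config
# straight into a model constructor.
model_config = OmegaConf.to_container(cfg.model.model_config, resolve=True)
assert model_config["long_conv_l_max"] == cfg.max_seq_len == 32768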