Dan Fu committed on
Commit
afc7050
1 Parent(s): 6d98b24

32K partial checkpoint

Browse files
Files changed (6) hide show
  1. README.md +13 -0
  2. config.json +4 -0
  3. config.yaml +38 -0
  4. model.bin +3 -0
  5. model.pt +3 -0
  6. version.txt +1 -0
README.md CHANGED
@@ -1,3 +1,16 @@
1
  ---
2
  license: apache-2.0
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: apache-2.0
3
+ language:
4
+ - en
5
+ pipeline_tag: text-classification
6
  ---
7
+
8
+ # Monarch Mixer-BERT
9
+
10
+ The 80M checkpoint for M2-BERT-base from the paper [Monarch Mixer: A Simple Sub-Quadratic GEMM-Based Architecture](https://arxiv.org/abs/2310.12109).
11
+ This model has been pretrained with a sequence length of 32K.
12
+ Note (11/3 evening): this is a partial checkpoint; training had not finished at the time of upload.
13
+
14
+ This model was trained by Dan Fu, Jon Saad-Falcon, and Simran Arora.
15
+
16
+ Check out our [GitHub](https://github.com/HazyResearch/m2/tree/main) for instructions on how to download and fine-tune it!
config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "model_type": "m2_bert"
3
+ }
4
+
config.yaml ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Note that some of the fields in this template haven't been filled in yet.
2
+ # Please resolve any `null` fields before launching!
3
+
4
+ precision: amp_bf16
5
+ max_seq_len: 32768
6
+
7
+ # Tokenizer for dataset creation
8
+ tokenizer_name: bert-base-uncased
9
+
10
+ # Base model config
11
+ model:
12
+ name: bert
13
+ pretrained_model_name: ${tokenizer_name}
14
+ tokenizer_name: ${tokenizer_name}
15
+ model_config:
16
+ num_attention_heads: 12
17
+ num_hidden_layers: 12
18
+ attention_probs_dropout_prob: 0.0
19
+ max_position_embeddings: 32768
20
+
21
+ monarch_mixer_sequence_mixing: True
22
+ long_conv_l_max: 32768
23
+ long_conv_kernel_learning_rate: 1e-3
24
+ hyena_lr_pos_emb: 1e-5
25
+ hyena_w: 10
26
+ hyena_wd: 0.1
27
+ hyena_emb_dim: 5
28
+ hyena_filter_order: 128
29
+ hyena_training_additions: False
30
+
31
+ bidirectional: true
32
+ residual_long_conv: true
33
+
34
+ use_glu_mlp: True
35
+ use_monarch_mlp: True
36
+ monarch_mlp_nblocks: 4
37
+ use_positional_encodings: True
38
+
model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5f6da4ea57ab1b407363530efba622552c121dc29b439cc9b202f042108d7d2
3
+ size 440736801
model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:069a59ecf30222fa1e67f68b76f7155966a875ac2ab060f1cb2d1213015e3596
3
+ size 1315397236
version.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 1