JonasGeiping committed
Commit
b2ff355
1 Parent(s): c27e877

Upload arch_budget_hours_24.json with huggingface_hub
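The commit message says the file was pushed with huggingface_hub. A minimal sketch of how such an upload is typically done with the library's top-level `upload_file` helper; the `repo_id` below is a placeholder, since this page does not name the target repository:

```python
# Sketch: push a local config file to a Hub repo with huggingface_hub.
# The repo_id is a placeholder -- substitute the repository behind this commit.
from huggingface_hub import upload_file

upload_file(
    path_or_fileobj="arch_budget_hours_24.json",  # local file to upload
    path_in_repo="arch_budget_hours_24.json",     # destination path in the repo
    repo_id="<user>/<repo>",                      # placeholder repository id
    commit_message="Upload arch_budget_hours_24.json with huggingface_hub",
)
```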

Files changed (1)
  1. arch_budget_hours_24.json +62 -0
arch_budget_hours_24.json ADDED
@@ -0,0 +1,62 @@
+{
+  "architectures": [
+    "ScriptableMaskedLM"
+  ],
+  "num_transformer_layers": 16,
+  "hidden_size": 768,
+  "intermed_size": 3072,
+  "hidden_dropout_prob": 0.1,
+  "norm": "LayerNorm",
+  "norm_eps": 1e-12,
+  "norm_scheme": "pre",
+  "nonlin": "GELUglu",
+  "tie_weights": true,
+  "sparse_prediction": true,
+  "decoder_bias": false,
+  "loss": "cross-entropy",
+  "z_loss_factor": 0,
+  "gradient_checkpointing": false,
+  "layer_fusion": true,
+  "embedding": {
+    "vocab_size": null,
+    "pos_embedding": "scaled-sinusoidal",
+    "dropout_prob": 0.1,
+    "pad_token_id": 0,
+    "max_seq_length": 128,
+    "embedding_dim": 768,
+    "normalization": true
+  },
+  "attention": {
+    "type": "flash-attention-impl",
+    "causal_attention": false,
+    "num_attention_heads": 12,
+    "dropout_prob": 0.1,
+    "skip_output_projection": false,
+    "qkv_bias": false,
+    "rotary_embedding": false,
+    "seq_op_in_fp32": false,
+    "sequence_op": "torch-softmax",
+    "high_level_fusion": false,
+    "low_level_fusion": true
+  },
+  "init": {
+    "type": "normal",
+    "std": 0.02
+  },
+  "ffn_layer_frequency": 1,
+  "deepnorm_scaling": false,
+  "skip_head_transform": true,
+  "layer_drop_theta": null,
+  "use_bias": false,
+  "final_norm": true,
+  "recurrent_layers": null,
+  "layer_macro_type": "transformer",
+  "num_labels": null,
+  "classification_head": {
+    "pooler": "avg",
+    "include_ff_layer": true,
+    "head_dim": 1024,
+    "nonlin": "Tanh",
+    "classifier_dropout": 0.1
+  }
+}
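To sanity-check the uploaded config, it can be pulled back down and parsed with standard tooling. A sketch using `hf_hub_download`, with the same placeholder `repo_id` as above:

```python
# Sketch: fetch the uploaded config from the Hub and inspect a few fields.
# The repo_id is again a placeholder for the repository behind this commit.
import json

from huggingface_hub import hf_hub_download

config_path = hf_hub_download(
    repo_id="<user>/<repo>",               # placeholder repository id
    filename="arch_budget_hours_24.json",
)

with open(config_path) as f:
    config = json.load(f)

# Core dimensions of the ScriptableMaskedLM described by this config.
print(config["num_transformer_layers"])            # 16
print(config["hidden_size"])                       # 768
print(config["attention"]["num_attention_heads"])  # 12
```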