itay-nakash committed on
Commit
4360ad9
1 Parent(s): a717674

Upload arch_budget_hours_24.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. arch_budget_hours_24.json +148 -0
arch_budget_hours_24.json ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ScriptableCrammedBERT"
4
+ ],
5
+ "num_transformer_layers": 16,
6
+ "hidden_size": 768,
7
+ "intermed_size": 3072,
8
+ "hidden_dropout_prob": 0.1,
9
+ "norm": "LayerNorm",
10
+ "norm_eps": 1e-12,
11
+ "norm_scheme": "pre",
12
+ "nonlin": "GELUglu",
13
+ "tie_weights": true,
14
+ "decoder_bias": false,
15
+ "sparse_prediction": 0.25,
16
+ "loss": "cross-entropy",
17
+ "objective_layout": "MLM",
18
+ "embedding": {
19
+ "vocab_size": null,
20
+ "pos_embedding": "scaled-sinusoidal",
21
+ "dropout_prob": 0.1,
22
+ "pad_token_id": 0,
23
+ "max_seq_length": 128,
24
+ "embedding_dim": 768,
25
+ "normalization": true,
26
+ "stable_low_precision": false
27
+ },
28
+ "residual_add": "interp_avg",
29
+ "unet_add": true,
30
+ "attention": {
31
+ "type": "self-attention",
32
+ "kernel_stride_by_layer": [
33
+ [
34
+ 1,
35
+ 1
36
+ ],
37
+ [
38
+ 3,
39
+ 2,
40
+ 1,
41
+ "replicate"
42
+ ],
43
+ [
44
+ 3,
45
+ 2,
46
+ 1,
47
+ "replicate"
48
+ ],
49
+ [
50
+ 3,
51
+ 2,
52
+ 1,
53
+ "replicate"
54
+ ],
55
+ [
56
+ 3,
57
+ 2,
58
+ 1,
59
+ "replicate"
60
+ ],
61
+ [
62
+ 3,
63
+ 2,
64
+ 1,
65
+ "replicate"
66
+ ],
67
+ [
68
+ 3,
69
+ 2,
70
+ 1,
71
+ "replicate"
72
+ ],
73
+ [
74
+ 1,
75
+ 1
76
+ ],
77
+ [
78
+ 1,
79
+ 0.5,
80
+ 1,
81
+ "zeros"
82
+ ],
83
+ [
84
+ 3,
85
+ 0.5,
86
+ 1,
87
+ "zeros"
88
+ ],
89
+ [
90
+ 3,
91
+ 0.5,
92
+ 1,
93
+ "zeros"
94
+ ],
95
+ [
96
+ 3,
97
+ 0.5,
98
+ 1,
99
+ "zeros"
100
+ ],
101
+ [
102
+ 3,
103
+ 0.5,
104
+ 1,
105
+ "zeros"
106
+ ],
107
+ [
108
+ 3,
109
+ 0.5,
110
+ 1,
111
+ "zeros"
112
+ ],
113
+ [
114
+ 3,
115
+ 1
116
+ ],
117
+ [
118
+ 1,
119
+ 1
120
+ ]
121
+ ],
122
+ "causal_attention": false,
123
+ "num_attention_heads": 12,
124
+ "dropout_prob": 0.1,
125
+ "unet_drop_skip": 0.0,
126
+ "skip_output_projection": false,
127
+ "qkv_bias": false,
128
+ "rotary_embedding": false,
129
+ "seq_op_in_fp32": false,
130
+ "sequence_op": "torch-softmax"
131
+ },
132
+ "init": {
133
+ "type": "normal",
134
+ "std": 0.02
135
+ },
136
+ "ffn_layer_frequency": 1,
137
+ "skip_head_transform": true,
138
+ "use_bias": false,
139
+ "final_norm": true,
140
+ "num_labels": null,
141
+ "classification_head": {
142
+ "pooler": "avg",
143
+ "include_ff_layer": true,
144
+ "head_dim": 1024,
145
+ "nonlin": "Tanh",
146
+ "classifier_dropout": 0.1
147
+ }
148
+ }