d-matrix committed on
Commit
e66e7f5
1 Parent(s): aa52ebb

adding activations monkey-patched version

Browse files
Files changed (2) hide show
  1. FALLBACK.yaml +446 -447
  2. activations.py +251 -0
FALLBACK.yaml CHANGED
@@ -1,447 +1,446 @@
1
- model:
2
- lm_head:
3
- accum_format: SAME
4
- approximation_function: NONE
5
- input_format: SAME
6
- instance: Linear
7
- output_format: SAME
8
- weight_format: SAME
9
- weight_sparseness: DENSE
10
- transformer.drop:
11
- approximation_function: NONE
12
- input_format: SAME
13
- instance: Dropout
14
- output_format: SAME
15
- transformer.h.0.attn.attn_dropout:
16
- approximation_function: NONE
17
- input_format: SAME
18
- instance: Dropout
19
- output_format: BFP[8|8]{64,-1}(SN)
20
- transformer.h.0.attn.c_attn:
21
- approximation_function: NONE
22
- bias_format: SAME
23
- input_format: BFP[8|8]{64,-1}(SN)
24
- instance: HFTransformersConv1D
25
- output_format: BFP[8|8]{64,-1}(SN)
26
- weight_format: BFP[8|8]{64,0}(SN)
27
- weight_sparseness: DENSE
28
- transformer.h.0.attn.c_proj:
29
- approximation_function: NONE
30
- bias_format: SAME
31
- input_format: BFP[8|8]{64,-1}(SN)
32
- instance: HFTransformersConv1D
33
- output_format: SAME
34
- weight_format: BFP[8|8]{64,0}(SN)
35
- weight_sparseness: DENSE
36
- transformer.h.0.attn.resid_dropout:
37
- approximation_function: NONE
38
- input_format: SAME
39
- instance: Dropout
40
- output_format: SAME
41
- transformer.h.0.attn.softmax:
42
- approximation_function: SOFTMAX(base2,float16)
43
- input_format: SAME
44
- instance: Softmax
45
- output_format: SAME
46
- transformer.h.0.ln_1:
47
- approximation_function: LAYERNORM(fallback,4,float16)
48
- bias_format: SAME
49
- input_format: SAME
50
- instance: LayerNorm
51
- output_format: SAME
52
- weight_format: SAME
53
- transformer.h.0.ln_2:
54
- approximation_function: LAYERNORM(fallback,4,float16)
55
- bias_format: SAME
56
- input_format: SAME
57
- instance: LayerNorm
58
- output_format: SAME
59
- weight_format: SAME
60
- transformer.h.0.mlp.act:
61
- approximation_function: GELU(vsimd)
62
- input_format: SAME
63
- instance: GELU
64
- output_format: SAME
65
- transformer.h.0.mlp.c_fc:
66
- approximation_function: NONE
67
- bias_format: SAME
68
- input_format: BFP[8|8]{64,-1}(SN)
69
- instance: HFTransformersConv1D
70
- output_format: SAME
71
- weight_format: BFP[8|8]{64,0}(SN)
72
- weight_sparseness: DENSE
73
- transformer.h.0.mlp.c_proj:
74
- approximation_function: NONE
75
- bias_format: SAME
76
- input_format: BFP[8|8]{64,-1}(SN)
77
- instance: HFTransformersConv1D
78
- output_format: SAME
79
- weight_format: BFP[8|8]{64,0}(SN)
80
- weight_sparseness: DENSE
81
- transformer.h.0.mlp.dropout:
82
- approximation_function: NONE
83
- input_format: SAME
84
- instance: Dropout
85
- output_format: SAME
86
- transformer.h.1.attn.attn_dropout:
87
- approximation_function: NONE
88
- input_format: SAME
89
- instance: Dropout
90
- output_format: BFP[8|8]{64,-1}(SN)
91
- transformer.h.1.attn.c_attn:
92
- approximation_function: NONE
93
- bias_format: SAME
94
- input_format: BFP[8|8]{64,-1}(SN)
95
- instance: HFTransformersConv1D
96
- output_format: BFP[8|8]{64,-1}(SN)
97
- weight_format: BFP[8|8]{64,0}(SN)
98
- weight_sparseness: DENSE
99
- transformer.h.1.attn.c_proj:
100
- approximation_function: NONE
101
- bias_format: SAME
102
- input_format: BFP[8|8]{64,-1}(SN)
103
- instance: HFTransformersConv1D
104
- output_format: SAME
105
- weight_format: BFP[8|8]{64,0}(SN)
106
- weight_sparseness: DENSE
107
- transformer.h.1.attn.resid_dropout:
108
- approximation_function: NONE
109
- input_format: SAME
110
- instance: Dropout
111
- output_format: SAME
112
- transformer.h.1.attn.softmax:
113
- approximation_function: SOFTMAX(base2,float16)
114
- input_format: SAME
115
- instance: Softmax
116
- output_format: SAME
117
- transformer.h.1.ln_1:
118
- approximation_function: LAYERNORM(fallback,4,float16)
119
- bias_format: SAME
120
- input_format: SAME
121
- instance: LayerNorm
122
- output_format: SAME
123
- weight_format: SAME
124
- transformer.h.1.ln_2:
125
- approximation_function: LAYERNORM(fallback,4,float16)
126
- bias_format: SAME
127
- input_format: SAME
128
- instance: LayerNorm
129
- output_format: SAME
130
- weight_format: SAME
131
- transformer.h.1.mlp.act:
132
- approximation_function: GELU(vsimd)
133
- input_format: SAME
134
- instance: GELU
135
- output_format: SAME
136
- transformer.h.1.mlp.c_fc:
137
- approximation_function: NONE
138
- bias_format: SAME
139
- input_format: BFP[8|8]{64,-1}(SN)
140
- instance: HFTransformersConv1D
141
- output_format: SAME
142
- weight_format: BFP[8|8]{64,0}(SN)
143
- weight_sparseness: DENSE
144
- transformer.h.1.mlp.c_proj:
145
- approximation_function: NONE
146
- bias_format: SAME
147
- input_format: BFP[8|8]{64,-1}(SN)
148
- instance: HFTransformersConv1D
149
- output_format: SAME
150
- weight_format: BFP[8|8]{64,0}(SN)
151
- weight_sparseness: DENSE
152
- transformer.h.1.mlp.dropout:
153
- approximation_function: NONE
154
- input_format: SAME
155
- instance: Dropout
156
- output_format: SAME
157
- transformer.h.2.attn.attn_dropout:
158
- approximation_function: NONE
159
- input_format: SAME
160
- instance: Dropout
161
- output_format: BFP[8|8]{64,-1}(SN)
162
- transformer.h.2.attn.c_attn:
163
- approximation_function: NONE
164
- bias_format: SAME
165
- input_format: BFP[8|8]{64,-1}(SN)
166
- instance: HFTransformersConv1D
167
- output_format: BFP[8|8]{64,-1}(SN)
168
- weight_format: BFP[8|8]{64,0}(SN)
169
- weight_sparseness: DENSE
170
- transformer.h.2.attn.c_proj:
171
- approximation_function: NONE
172
- bias_format: SAME
173
- input_format: BFP[8|8]{64,-1}(SN)
174
- instance: HFTransformersConv1D
175
- output_format: SAME
176
- weight_format: BFP[8|8]{64,0}(SN)
177
- weight_sparseness: DENSE
178
- transformer.h.2.attn.resid_dropout:
179
- approximation_function: NONE
180
- input_format: SAME
181
- instance: Dropout
182
- output_format: SAME
183
- transformer.h.2.attn.softmax:
184
- approximation_function: SOFTMAX(base2,float16)
185
- input_format: SAME
186
- instance: Softmax
187
- output_format: SAME
188
- transformer.h.2.ln_1:
189
- approximation_function: LAYERNORM(fallback,4,float16)
190
- bias_format: SAME
191
- input_format: SAME
192
- instance: LayerNorm
193
- output_format: SAME
194
- weight_format: SAME
195
- transformer.h.2.ln_2:
196
- approximation_function: LAYERNORM(fallback,4,float16)
197
- bias_format: SAME
198
- input_format: SAME
199
- instance: LayerNorm
200
- output_format: SAME
201
- weight_format: SAME
202
- transformer.h.2.mlp.act:
203
- approximation_function: GELU(vsimd)
204
- input_format: SAME
205
- instance: GELU
206
- output_format: SAME
207
- transformer.h.2.mlp.c_fc:
208
- approximation_function: NONE
209
- bias_format: SAME
210
- input_format: BFP[8|8]{64,-1}(SN)
211
- instance: HFTransformersConv1D
212
- output_format: SAME
213
- weight_format: BFP[8|8]{64,0}(SN)
214
- weight_sparseness: DENSE
215
- transformer.h.2.mlp.c_proj:
216
- approximation_function: NONE
217
- bias_format: SAME
218
- input_format: BFP[8|8]{64,-1}(SN)
219
- instance: HFTransformersConv1D
220
- output_format: SAME
221
- weight_format: BFP[8|8]{64,0}(SN)
222
- weight_sparseness: DENSE
223
- transformer.h.2.mlp.dropout:
224
- approximation_function: NONE
225
- input_format: SAME
226
- instance: Dropout
227
- output_format: SAME
228
- transformer.h.3.attn.attn_dropout:
229
- approximation_function: NONE
230
- input_format: SAME
231
- instance: Dropout
232
- output_format: BFP[8|8]{64,-1}(SN)
233
- transformer.h.3.attn.c_attn:
234
- approximation_function: NONE
235
- bias_format: SAME
236
- input_format: BFP[8|8]{64,-1}(SN)
237
- instance: HFTransformersConv1D
238
- output_format: BFP[8|8]{64,-1}(SN)
239
- weight_format: BFP[8|8]{64,0}(SN)
240
- weight_sparseness: DENSE
241
- transformer.h.3.attn.c_proj:
242
- approximation_function: NONE
243
- bias_format: SAME
244
- input_format: BFP[8|8]{64,-1}(SN)
245
- instance: HFTransformersConv1D
246
- output_format: SAME
247
- weight_format: BFP[8|8]{64,0}(SN)
248
- weight_sparseness: DENSE
249
- transformer.h.3.attn.resid_dropout:
250
- approximation_function: NONE
251
- input_format: SAME
252
- instance: Dropout
253
- output_format: SAME
254
- transformer.h.3.attn.softmax:
255
- approximation_function: SOFTMAX(base2,float16)
256
- input_format: SAME
257
- instance: Softmax
258
- output_format: SAME
259
- transformer.h.3.ln_1:
260
- approximation_function: LAYERNORM(fallback,4,float16)
261
- bias_format: SAME
262
- input_format: SAME
263
- instance: LayerNorm
264
- output_format: SAME
265
- weight_format: SAME
266
- transformer.h.3.ln_2:
267
- approximation_function: LAYERNORM(fallback,4,float16)
268
- bias_format: SAME
269
- input_format: SAME
270
- instance: LayerNorm
271
- output_format: SAME
272
- weight_format: SAME
273
- transformer.h.3.mlp.act:
274
- approximation_function: GELU(vsimd)
275
- input_format: SAME
276
- instance: GELU
277
- output_format: SAME
278
- transformer.h.3.mlp.c_fc:
279
- approximation_function: NONE
280
- bias_format: SAME
281
- input_format: BFP[8|8]{64,-1}(SN)
282
- instance: HFTransformersConv1D
283
- output_format: SAME
284
- weight_format: BFP[8|8]{64,0}(SN)
285
- weight_sparseness: DENSE
286
- transformer.h.3.mlp.c_proj:
287
- approximation_function: NONE
288
- bias_format: SAME
289
- input_format: BFP[8|8]{64,-1}(SN)
290
- instance: HFTransformersConv1D
291
- output_format: SAME
292
- weight_format: BFP[8|8]{64,0}(SN)
293
- weight_sparseness: DENSE
294
- transformer.h.3.mlp.dropout:
295
- approximation_function: NONE
296
- input_format: SAME
297
- instance: Dropout
298
- output_format: SAME
299
- transformer.h.4.attn.attn_dropout:
300
- approximation_function: NONE
301
- input_format: SAME
302
- instance: Dropout
303
- output_format: BFP[8|8]{64,-1}(SN)
304
- transformer.h.4.attn.c_attn:
305
- approximation_function: NONE
306
- bias_format: SAME
307
- input_format: BFP[8|8]{64,-1}(SN)
308
- instance: HFTransformersConv1D
309
- output_format: BFP[8|8]{64,-1}(SN)
310
- weight_format: BFP[8|8]{64,0}(SN)
311
- weight_sparseness: DENSE
312
- transformer.h.4.attn.c_proj:
313
- approximation_function: NONE
314
- bias_format: SAME
315
- input_format: BFP[8|8]{64,-1}(SN)
316
- instance: HFTransformersConv1D
317
- output_format: SAME
318
- weight_format: BFP[8|8]{64,0}(SN)
319
- weight_sparseness: DENSE
320
- transformer.h.4.attn.resid_dropout:
321
- approximation_function: NONE
322
- input_format: SAME
323
- instance: Dropout
324
- output_format: SAME
325
- transformer.h.4.attn.softmax:
326
- approximation_function: SOFTMAX(base2,float16)
327
- input_format: SAME
328
- instance: Softmax
329
- output_format: SAME
330
- transformer.h.4.ln_1:
331
- approximation_function: LAYERNORM(fallback,4,float16)
332
- bias_format: SAME
333
- input_format: SAME
334
- instance: LayerNorm
335
- output_format: SAME
336
- weight_format: SAME
337
- transformer.h.4.ln_2:
338
- approximation_function: LAYERNORM(fallback,4,float16)
339
- bias_format: SAME
340
- input_format: SAME
341
- instance: LayerNorm
342
- output_format: SAME
343
- weight_format: SAME
344
- transformer.h.4.mlp.act:
345
- approximation_function: GELU(vsimd)
346
- input_format: SAME
347
- instance: GELU
348
- output_format: SAME
349
- transformer.h.4.mlp.c_fc:
350
- approximation_function: NONE
351
- bias_format: SAME
352
- input_format: BFP[8|8]{64,-1}(SN)
353
- instance: HFTransformersConv1D
354
- output_format: SAME
355
- weight_format: BFP[8|8]{64,0}(SN)
356
- weight_sparseness: DENSE
357
- transformer.h.4.mlp.c_proj:
358
- approximation_function: NONE
359
- bias_format: SAME
360
- input_format: BFP[8|8]{64,-1}(SN)
361
- instance: HFTransformersConv1D
362
- output_format: SAME
363
- weight_format: BFP[8|8]{64,0}(SN)
364
- weight_sparseness: DENSE
365
- transformer.h.4.mlp.dropout:
366
- approximation_function: NONE
367
- input_format: SAME
368
- instance: Dropout
369
- output_format: SAME
370
- transformer.h.5.attn.attn_dropout:
371
- approximation_function: NONE
372
- input_format: SAME
373
- instance: Dropout
374
- output_format: BFP[8|8]{64,-1}(SN)
375
- transformer.h.5.attn.c_attn:
376
- approximation_function: NONE
377
- bias_format: SAME
378
- input_format: BFP[8|8]{64,-1}(SN)
379
- instance: HFTransformersConv1D
380
- output_format: BFP[8|8]{64,-1}(SN)
381
- weight_format: BFP[8|8]{64,0}(SN)
382
- weight_sparseness: DENSE
383
- transformer.h.5.attn.c_proj:
384
- approximation_function: NONE
385
- bias_format: SAME
386
- input_format: BFP[8|8]{64,-1}(SN)
387
- instance: HFTransformersConv1D
388
- output_format: SAME
389
- weight_format: BFP[8|8]{64,0}(SN)
390
- weight_sparseness: DENSE
391
- transformer.h.5.attn.resid_dropout:
392
- approximation_function: NONE
393
- input_format: SAME
394
- instance: Dropout
395
- output_format: SAME
396
- transformer.h.5.attn.softmax:
397
- approximation_function: SOFTMAX(base2,float16)
398
- input_format: SAME
399
- instance: Softmax
400
- output_format: SAME
401
- transformer.h.5.ln_1:
402
- approximation_function: LAYERNORM(fallback,4,float16)
403
- bias_format: SAME
404
- input_format: SAME
405
- instance: LayerNorm
406
- output_format: SAME
407
- weight_format: SAME
408
- transformer.h.5.ln_2:
409
- approximation_function: LAYERNORM(fallback,4,float16)
410
- bias_format: SAME
411
- input_format: SAME
412
- instance: LayerNorm
413
- output_format: SAME
414
- weight_format: SAME
415
- transformer.h.5.mlp.act:
416
- approximation_function: GELU(vsimd)
417
- input_format: SAME
418
- instance: GELU
419
- output_format: SAME
420
- transformer.h.5.mlp.c_fc:
421
- approximation_function: NONE
422
- bias_format: SAME
423
- input_format: BFP[8|8]{64,-1}(SN)
424
- instance: HFTransformersConv1D
425
- output_format: SAME
426
- weight_format: BFP[8|8]{64,0}(SN)
427
- weight_sparseness: DENSE
428
- transformer.h.5.mlp.c_proj:
429
- approximation_function: NONE
430
- bias_format: SAME
431
- input_format: BFP[8|8]{64,-1}(SN)
432
- instance: HFTransformersConv1D
433
- output_format: SAME
434
- weight_format: BFP[8|8]{64,0}(SN)
435
- weight_sparseness: DENSE
436
- transformer.h.5.mlp.dropout:
437
- approximation_function: NONE
438
- input_format: SAME
439
- instance: Dropout
440
- output_format: SAME
441
- transformer.ln_f:
442
- approximation_function: LAYERNORM(fallback,4,float16)
443
- bias_format: SAME
444
- input_format: SAME
445
- instance: LayerNorm
446
- output_format: SAME
447
- weight_format: SAME
 
1
+ lm_head:
2
+ accum_format: SAME
3
+ approximation_function: NONE
4
+ input_format: SAME
5
+ instance: Linear
6
+ output_format: SAME
7
+ weight_format: SAME
8
+ weight_sparseness: DENSE
9
+ transformer.drop:
10
+ approximation_function: NONE
11
+ input_format: SAME
12
+ instance: Dropout
13
+ output_format: SAME
14
+ transformer.h.0.attn.attn_dropout:
15
+ approximation_function: NONE
16
+ input_format: SAME
17
+ instance: Dropout
18
+ output_format: BFP[8|8]{64,-1}(SN)
19
+ transformer.h.0.attn.c_attn:
20
+ approximation_function: NONE
21
+ bias_format: SAME
22
+ input_format: BFP[8|8]{64,-1}(SN)
23
+ instance: HFTransformersConv1D
24
+ output_format: BFP[8|8]{64,-1}(SN)
25
+ weight_format: BFP[8|8]{64,0}(SN)
26
+ weight_sparseness: DENSE
27
+ transformer.h.0.attn.c_proj:
28
+ approximation_function: NONE
29
+ bias_format: SAME
30
+ input_format: BFP[8|8]{64,-1}(SN)
31
+ instance: HFTransformersConv1D
32
+ output_format: SAME
33
+ weight_format: BFP[8|8]{64,0}(SN)
34
+ weight_sparseness: DENSE
35
+ transformer.h.0.attn.resid_dropout:
36
+ approximation_function: NONE
37
+ input_format: SAME
38
+ instance: Dropout
39
+ output_format: SAME
40
+ transformer.h.0.attn.softmax:
41
+ approximation_function: SOFTMAX(base2,float16)
42
+ input_format: SAME
43
+ instance: Softmax
44
+ output_format: SAME
45
+ transformer.h.0.ln_1:
46
+ approximation_function: LAYERNORM(fallback,4,float16)
47
+ bias_format: SAME
48
+ input_format: SAME
49
+ instance: LayerNorm
50
+ output_format: SAME
51
+ weight_format: SAME
52
+ transformer.h.0.ln_2:
53
+ approximation_function: LAYERNORM(fallback,4,float16)
54
+ bias_format: SAME
55
+ input_format: SAME
56
+ instance: LayerNorm
57
+ output_format: SAME
58
+ weight_format: SAME
59
+ transformer.h.0.mlp.act:
60
+ approximation_function: GELU(poly2,float16)
61
+ input_format: SAME
62
+ instance: GELU
63
+ output_format: SAME
64
+ transformer.h.0.mlp.c_fc:
65
+ approximation_function: NONE
66
+ bias_format: SAME
67
+ input_format: BFP[8|8]{64,-1}(SN)
68
+ instance: HFTransformersConv1D
69
+ output_format: SAME
70
+ weight_format: BFP[8|8]{64,0}(SN)
71
+ weight_sparseness: DENSE
72
+ transformer.h.0.mlp.c_proj:
73
+ approximation_function: NONE
74
+ bias_format: SAME
75
+ input_format: BFP[8|8]{64,-1}(SN)
76
+ instance: HFTransformersConv1D
77
+ output_format: SAME
78
+ weight_format: BFP[8|8]{64,0}(SN)
79
+ weight_sparseness: DENSE
80
+ transformer.h.0.mlp.dropout:
81
+ approximation_function: NONE
82
+ input_format: SAME
83
+ instance: Dropout
84
+ output_format: SAME
85
+ transformer.h.1.attn.attn_dropout:
86
+ approximation_function: NONE
87
+ input_format: SAME
88
+ instance: Dropout
89
+ output_format: BFP[8|8]{64,-1}(SN)
90
+ transformer.h.1.attn.c_attn:
91
+ approximation_function: NONE
92
+ bias_format: SAME
93
+ input_format: BFP[8|8]{64,-1}(SN)
94
+ instance: HFTransformersConv1D
95
+ output_format: BFP[8|8]{64,-1}(SN)
96
+ weight_format: BFP[8|8]{64,0}(SN)
97
+ weight_sparseness: DENSE
98
+ transformer.h.1.attn.c_proj:
99
+ approximation_function: NONE
100
+ bias_format: SAME
101
+ input_format: BFP[8|8]{64,-1}(SN)
102
+ instance: HFTransformersConv1D
103
+ output_format: SAME
104
+ weight_format: BFP[8|8]{64,0}(SN)
105
+ weight_sparseness: DENSE
106
+ transformer.h.1.attn.resid_dropout:
107
+ approximation_function: NONE
108
+ input_format: SAME
109
+ instance: Dropout
110
+ output_format: SAME
111
+ transformer.h.1.attn.softmax:
112
+ approximation_function: SOFTMAX(base2,float16)
113
+ input_format: SAME
114
+ instance: Softmax
115
+ output_format: SAME
116
+ transformer.h.1.ln_1:
117
+ approximation_function: LAYERNORM(fallback,4,float16)
118
+ bias_format: SAME
119
+ input_format: SAME
120
+ instance: LayerNorm
121
+ output_format: SAME
122
+ weight_format: SAME
123
+ transformer.h.1.ln_2:
124
+ approximation_function: LAYERNORM(fallback,4,float16)
125
+ bias_format: SAME
126
+ input_format: SAME
127
+ instance: LayerNorm
128
+ output_format: SAME
129
+ weight_format: SAME
130
+ transformer.h.1.mlp.act:
131
+ approximation_function: GELU(poly2,float16)
132
+ input_format: SAME
133
+ instance: GELU
134
+ output_format: SAME
135
+ transformer.h.1.mlp.c_fc:
136
+ approximation_function: NONE
137
+ bias_format: SAME
138
+ input_format: BFP[8|8]{64,-1}(SN)
139
+ instance: HFTransformersConv1D
140
+ output_format: SAME
141
+ weight_format: BFP[8|8]{64,0}(SN)
142
+ weight_sparseness: DENSE
143
+ transformer.h.1.mlp.c_proj:
144
+ approximation_function: NONE
145
+ bias_format: SAME
146
+ input_format: BFP[8|8]{64,-1}(SN)
147
+ instance: HFTransformersConv1D
148
+ output_format: SAME
149
+ weight_format: BFP[8|8]{64,0}(SN)
150
+ weight_sparseness: DENSE
151
+ transformer.h.1.mlp.dropout:
152
+ approximation_function: NONE
153
+ input_format: SAME
154
+ instance: Dropout
155
+ output_format: SAME
156
+ transformer.h.2.attn.attn_dropout:
157
+ approximation_function: NONE
158
+ input_format: SAME
159
+ instance: Dropout
160
+ output_format: BFP[8|8]{64,-1}(SN)
161
+ transformer.h.2.attn.c_attn:
162
+ approximation_function: NONE
163
+ bias_format: SAME
164
+ input_format: BFP[8|8]{64,-1}(SN)
165
+ instance: HFTransformersConv1D
166
+ output_format: BFP[8|8]{64,-1}(SN)
167
+ weight_format: BFP[8|8]{64,0}(SN)
168
+ weight_sparseness: DENSE
169
+ transformer.h.2.attn.c_proj:
170
+ approximation_function: NONE
171
+ bias_format: SAME
172
+ input_format: BFP[8|8]{64,-1}(SN)
173
+ instance: HFTransformersConv1D
174
+ output_format: SAME
175
+ weight_format: BFP[8|8]{64,0}(SN)
176
+ weight_sparseness: DENSE
177
+ transformer.h.2.attn.resid_dropout:
178
+ approximation_function: NONE
179
+ input_format: SAME
180
+ instance: Dropout
181
+ output_format: SAME
182
+ transformer.h.2.attn.softmax:
183
+ approximation_function: SOFTMAX(base2,float16)
184
+ input_format: SAME
185
+ instance: Softmax
186
+ output_format: SAME
187
+ transformer.h.2.ln_1:
188
+ approximation_function: LAYERNORM(fallback,4,float16)
189
+ bias_format: SAME
190
+ input_format: SAME
191
+ instance: LayerNorm
192
+ output_format: SAME
193
+ weight_format: SAME
194
+ transformer.h.2.ln_2:
195
+ approximation_function: LAYERNORM(fallback,4,float16)
196
+ bias_format: SAME
197
+ input_format: SAME
198
+ instance: LayerNorm
199
+ output_format: SAME
200
+ weight_format: SAME
201
+ transformer.h.2.mlp.act:
202
+ approximation_function: GELU(poly2,float16)
203
+ input_format: SAME
204
+ instance: GELU
205
+ output_format: SAME
206
+ transformer.h.2.mlp.c_fc:
207
+ approximation_function: NONE
208
+ bias_format: SAME
209
+ input_format: BFP[8|8]{64,-1}(SN)
210
+ instance: HFTransformersConv1D
211
+ output_format: SAME
212
+ weight_format: BFP[8|8]{64,0}(SN)
213
+ weight_sparseness: DENSE
214
+ transformer.h.2.mlp.c_proj:
215
+ approximation_function: NONE
216
+ bias_format: SAME
217
+ input_format: BFP[8|8]{64,-1}(SN)
218
+ instance: HFTransformersConv1D
219
+ output_format: SAME
220
+ weight_format: BFP[8|8]{64,0}(SN)
221
+ weight_sparseness: DENSE
222
+ transformer.h.2.mlp.dropout:
223
+ approximation_function: NONE
224
+ input_format: SAME
225
+ instance: Dropout
226
+ output_format: SAME
227
+ transformer.h.3.attn.attn_dropout:
228
+ approximation_function: NONE
229
+ input_format: SAME
230
+ instance: Dropout
231
+ output_format: BFP[8|8]{64,-1}(SN)
232
+ transformer.h.3.attn.c_attn:
233
+ approximation_function: NONE
234
+ bias_format: SAME
235
+ input_format: BFP[8|8]{64,-1}(SN)
236
+ instance: HFTransformersConv1D
237
+ output_format: BFP[8|8]{64,-1}(SN)
238
+ weight_format: BFP[8|8]{64,0}(SN)
239
+ weight_sparseness: DENSE
240
+ transformer.h.3.attn.c_proj:
241
+ approximation_function: NONE
242
+ bias_format: SAME
243
+ input_format: BFP[8|8]{64,-1}(SN)
244
+ instance: HFTransformersConv1D
245
+ output_format: SAME
246
+ weight_format: BFP[8|8]{64,0}(SN)
247
+ weight_sparseness: DENSE
248
+ transformer.h.3.attn.resid_dropout:
249
+ approximation_function: NONE
250
+ input_format: SAME
251
+ instance: Dropout
252
+ output_format: SAME
253
+ transformer.h.3.attn.softmax:
254
+ approximation_function: SOFTMAX(base2,float16)
255
+ input_format: SAME
256
+ instance: Softmax
257
+ output_format: SAME
258
+ transformer.h.3.ln_1:
259
+ approximation_function: LAYERNORM(fallback,4,float16)
260
+ bias_format: SAME
261
+ input_format: SAME
262
+ instance: LayerNorm
263
+ output_format: SAME
264
+ weight_format: SAME
265
+ transformer.h.3.ln_2:
266
+ approximation_function: LAYERNORM(fallback,4,float16)
267
+ bias_format: SAME
268
+ input_format: SAME
269
+ instance: LayerNorm
270
+ output_format: SAME
271
+ weight_format: SAME
272
+ transformer.h.3.mlp.act:
273
+ approximation_function: GELU(poly2,float16)
274
+ input_format: SAME
275
+ instance: GELU
276
+ output_format: SAME
277
+ transformer.h.3.mlp.c_fc:
278
+ approximation_function: NONE
279
+ bias_format: SAME
280
+ input_format: BFP[8|8]{64,-1}(SN)
281
+ instance: HFTransformersConv1D
282
+ output_format: SAME
283
+ weight_format: BFP[8|8]{64,0}(SN)
284
+ weight_sparseness: DENSE
285
+ transformer.h.3.mlp.c_proj:
286
+ approximation_function: NONE
287
+ bias_format: SAME
288
+ input_format: BFP[8|8]{64,-1}(SN)
289
+ instance: HFTransformersConv1D
290
+ output_format: SAME
291
+ weight_format: BFP[8|8]{64,0}(SN)
292
+ weight_sparseness: DENSE
293
+ transformer.h.3.mlp.dropout:
294
+ approximation_function: NONE
295
+ input_format: SAME
296
+ instance: Dropout
297
+ output_format: SAME
298
+ transformer.h.4.attn.attn_dropout:
299
+ approximation_function: NONE
300
+ input_format: SAME
301
+ instance: Dropout
302
+ output_format: BFP[8|8]{64,-1}(SN)
303
+ transformer.h.4.attn.c_attn:
304
+ approximation_function: NONE
305
+ bias_format: SAME
306
+ input_format: BFP[8|8]{64,-1}(SN)
307
+ instance: HFTransformersConv1D
308
+ output_format: BFP[8|8]{64,-1}(SN)
309
+ weight_format: BFP[8|8]{64,0}(SN)
310
+ weight_sparseness: DENSE
311
+ transformer.h.4.attn.c_proj:
312
+ approximation_function: NONE
313
+ bias_format: SAME
314
+ input_format: BFP[8|8]{64,-1}(SN)
315
+ instance: HFTransformersConv1D
316
+ output_format: SAME
317
+ weight_format: BFP[8|8]{64,0}(SN)
318
+ weight_sparseness: DENSE
319
+ transformer.h.4.attn.resid_dropout:
320
+ approximation_function: NONE
321
+ input_format: SAME
322
+ instance: Dropout
323
+ output_format: SAME
324
+ transformer.h.4.attn.softmax:
325
+ approximation_function: SOFTMAX(base2,float16)
326
+ input_format: SAME
327
+ instance: Softmax
328
+ output_format: SAME
329
+ transformer.h.4.ln_1:
330
+ approximation_function: LAYERNORM(fallback,4,float16)
331
+ bias_format: SAME
332
+ input_format: SAME
333
+ instance: LayerNorm
334
+ output_format: SAME
335
+ weight_format: SAME
336
+ transformer.h.4.ln_2:
337
+ approximation_function: LAYERNORM(fallback,4,float16)
338
+ bias_format: SAME
339
+ input_format: SAME
340
+ instance: LayerNorm
341
+ output_format: SAME
342
+ weight_format: SAME
343
+ transformer.h.4.mlp.act:
344
+ approximation_function: GELU(poly2,float16)
345
+ input_format: SAME
346
+ instance: GELU
347
+ output_format: SAME
348
+ transformer.h.4.mlp.c_fc:
349
+ approximation_function: NONE
350
+ bias_format: SAME
351
+ input_format: BFP[8|8]{64,-1}(SN)
352
+ instance: HFTransformersConv1D
353
+ output_format: SAME
354
+ weight_format: BFP[8|8]{64,0}(SN)
355
+ weight_sparseness: DENSE
356
+ transformer.h.4.mlp.c_proj:
357
+ approximation_function: NONE
358
+ bias_format: SAME
359
+ input_format: BFP[8|8]{64,-1}(SN)
360
+ instance: HFTransformersConv1D
361
+ output_format: SAME
362
+ weight_format: BFP[8|8]{64,0}(SN)
363
+ weight_sparseness: DENSE
364
+ transformer.h.4.mlp.dropout:
365
+ approximation_function: NONE
366
+ input_format: SAME
367
+ instance: Dropout
368
+ output_format: SAME
369
+ transformer.h.5.attn.attn_dropout:
370
+ approximation_function: NONE
371
+ input_format: SAME
372
+ instance: Dropout
373
+ output_format: BFP[8|8]{64,-1}(SN)
374
+ transformer.h.5.attn.c_attn:
375
+ approximation_function: NONE
376
+ bias_format: SAME
377
+ input_format: BFP[8|8]{64,-1}(SN)
378
+ instance: HFTransformersConv1D
379
+ output_format: BFP[8|8]{64,-1}(SN)
380
+ weight_format: BFP[8|8]{64,0}(SN)
381
+ weight_sparseness: DENSE
382
+ transformer.h.5.attn.c_proj:
383
+ approximation_function: NONE
384
+ bias_format: SAME
385
+ input_format: BFP[8|8]{64,-1}(SN)
386
+ instance: HFTransformersConv1D
387
+ output_format: SAME
388
+ weight_format: BFP[8|8]{64,0}(SN)
389
+ weight_sparseness: DENSE
390
+ transformer.h.5.attn.resid_dropout:
391
+ approximation_function: NONE
392
+ input_format: SAME
393
+ instance: Dropout
394
+ output_format: SAME
395
+ transformer.h.5.attn.softmax:
396
+ approximation_function: SOFTMAX(base2,float16)
397
+ input_format: SAME
398
+ instance: Softmax
399
+ output_format: SAME
400
+ transformer.h.5.ln_1:
401
+ approximation_function: LAYERNORM(fallback,4,float16)
402
+ bias_format: SAME
403
+ input_format: SAME
404
+ instance: LayerNorm
405
+ output_format: SAME
406
+ weight_format: SAME
407
+ transformer.h.5.ln_2:
408
+ approximation_function: LAYERNORM(fallback,4,float16)
409
+ bias_format: SAME
410
+ input_format: SAME
411
+ instance: LayerNorm
412
+ output_format: SAME
413
+ weight_format: SAME
414
+ transformer.h.5.mlp.act:
415
+ approximation_function: GELU(poly2,float16)
416
+ input_format: SAME
417
+ instance: GELU
418
+ output_format: SAME
419
+ transformer.h.5.mlp.c_fc:
420
+ approximation_function: NONE
421
+ bias_format: SAME
422
+ input_format: BFP[8|8]{64,-1}(SN)
423
+ instance: HFTransformersConv1D
424
+ output_format: SAME
425
+ weight_format: BFP[8|8]{64,0}(SN)
426
+ weight_sparseness: DENSE
427
+ transformer.h.5.mlp.c_proj:
428
+ approximation_function: NONE
429
+ bias_format: SAME
430
+ input_format: BFP[8|8]{64,-1}(SN)
431
+ instance: HFTransformersConv1D
432
+ output_format: SAME
433
+ weight_format: BFP[8|8]{64,0}(SN)
434
+ weight_sparseness: DENSE
435
+ transformer.h.5.mlp.dropout:
436
+ approximation_function: NONE
437
+ input_format: SAME
438
+ instance: Dropout
439
+ output_format: SAME
440
+ transformer.ln_f:
441
+ approximation_function: LAYERNORM(fallback,4,float16)
442
+ bias_format: SAME
443
+ input_format: SAME
444
+ instance: LayerNorm
445
+ output_format: SAME
446
+ weight_format: SAME
 
activations.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from collections import OrderedDict
17
+
18
+ import torch
19
+ from packaging import version
20
+ from torch import Tensor, nn
21
+
22
+ from .utils import logging
23
+
24
+
25
# Module-level logger, obtained from the package-local `.utils.logging` helper imported above.
logger = logging.get_logger(__name__)
26
+
27
+
28
class PytorchGELUTanh(nn.Module):
    """
    A fast C implementation of the tanh approximation of the GeLU activation function. See
    https://arxiv.org/abs/1606.08415.

    This implementation is equivalent to NewGELU and FastGELU but much faster. However, it is not an exact numerical
    match due to rounding errors.

    Raises:
        ImportError: on construction, when the installed torch version is older than 1.12.0.
    """

    def __init__(self):
        super().__init__()
        # `forward` passes `approximate="tanh"`; construction is refused on torch versions below 1.12.0.
        if version.parse(torch.__version__) < version.parse("1.12.0"):
            raise ImportError(
                f"You are using torch=={torch.__version__}, but torch>=1.12.0 is required to use "
                "PytorchGELUTanh. Please upgrade torch."
            )

    def forward(self, input: Tensor) -> Tensor:
        """Return `nn.functional.gelu(input, approximate="tanh")`."""
        return nn.functional.gelu(input, approximate="tanh")
47
+
48
+
49
class NewGELUActivation(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
    the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
    """

    def forward(self, input: Tensor) -> Tensor:
        """Evaluate 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3))) elementwise."""
        return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))
57
+
58
+
59
class GELUActivation(nn.Module):
    """
    Original implementation of the GELU activation function in the Google BERT repo when initially created. For
    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). This is now written in C in
    nn.functional. Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415

    Args:
        use_gelu_python: when True, `forward` evaluates the erf formula in Python (`_gelu_python`); otherwise it
            calls `nn.functional.gelu`.
    """

    def __init__(self, use_gelu_python: bool = False):
        super().__init__()
        # The implementation is chosen once, at construction time; `forward` only dispatches to it.
        if use_gelu_python:
            self.act = self._gelu_python
        else:
            self.act = nn.functional.gelu

    def _gelu_python(self, input: Tensor) -> Tensor:
        """Evaluate x * 0.5 * (1 + erf(x / sqrt(2))) elementwise."""
        return input * 0.5 * (1.0 + torch.erf(input / math.sqrt(2.0)))

    def forward(self, input: Tensor) -> Tensor:
        """Apply the implementation selected in `__init__` to `input`."""
        return self.act(input)
79
+
80
+
81
class FastGELUActivation(nn.Module):
    """
    Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs
    """

    def forward(self, input: Tensor) -> Tensor:
        """Evaluate 0.5 * x * (1 + tanh(x * 0.7978845608 * (1 + 0.044715 * x * x))) elementwise."""
        return 0.5 * input * (1.0 + torch.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input)))
88
+
89
+
90
class QuickGELUActivation(nn.Module):
    """
    Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
    """

    def forward(self, input: Tensor) -> Tensor:
        """Evaluate x * sigmoid(1.702 * x) elementwise."""
        return input * torch.sigmoid(1.702 * input)
97
+
98
+
99
class ClippedGELUActivation(nn.Module):
    """
    Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purposes,
    as it allows mapping negative values in the GeLU spectrum. For more information on this trick, please refer to
    https://arxiv.org/abs/2004.09602.

    Gaussian Error Linear Unit. Original implementation of the gelu activation function in the Google BERT repo when
    initially created.

    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://arxiv.org/abs/1606.08415

    Args:
        min: lower clipping bound.
        max: upper clipping bound.

    Raises:
        ValueError: if `min` is strictly greater than `max` (equal bounds are accepted).
    """

    def __init__(self, min: float, max: float):
        # Bounds are validated before the nn.Module machinery is initialised.
        if min > max:
            raise ValueError(f"min should be < max (got min: {min}, max: {max})")

        super().__init__()
        self.min = min
        self.max = max

    def forward(self, x: Tensor) -> Tensor:
        """Apply the module-level `gelu` callable to `x`, then clip the result to [self.min, self.max]."""
        # `gelu` is looked up in module globals at call time, not captured at class definition.
        return torch.clip(gelu(x), self.min, self.max)
122
+
123
+
124
class AccurateGELUActivation(nn.Module):
    """
    Applies GELU approximation that is faster than default and more accurate than QuickGELU. See:
    https://github.com/hendrycks/GELUs

    Implemented along with MEGA (Moving Average Equipped Gated Attention)
    """

    def __init__(self):
        super().__init__()
        # sqrt(2 / pi) is computed once here and reused by every `forward` call.
        self.precomputed_constant = math.sqrt(2 / math.pi)

    def forward(self, input: Tensor) -> Tensor:
        """Evaluate 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3))) elementwise."""
        return 0.5 * input * (1 + torch.tanh(self.precomputed_constant * (input + 0.044715 * torch.pow(input, 3))))
138
+
139
+
140
class SiLUActivation(nn.Module):
    """
    See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
    Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
    Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
    Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
    later.
    """

    def forward(self, input: Tensor) -> Tensor:
        """Return `nn.functional.silu(input)`."""
        return nn.functional.silu(input)
151
+
152
+
153
class MishActivation(nn.Module):
    """
    See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also
    visit the official repository for the paper: https://github.com/digantamisra98/Mish
    """

    def __init__(self):
        super().__init__()
        # On torch versions below 1.9.0 the Python fallback is used instead of `nn.functional.mish`.
        if version.parse(torch.__version__) < version.parse("1.9.0"):
            self.act = self._mish_python
        else:
            self.act = nn.functional.mish

    def _mish_python(self, input: Tensor) -> Tensor:
        """Evaluate x * tanh(softplus(x)) elementwise."""
        return input * torch.tanh(nn.functional.softplus(input))

    def forward(self, input: Tensor) -> Tensor:
        """Apply the implementation selected in `__init__` to `input`."""
        return self.act(input)
171
+
172
+
173
class LinearActivation(nn.Module):
    """
    Applies the linear activation function, i.e. forwarding input directly to output.
    """

    def forward(self, input: Tensor) -> Tensor:
        """Return `input` itself, unchanged."""
        return input
180
+
181
+
182
class LaplaceActivation(nn.Module):
    """
    Applies elementwise activation based on Laplace function, introduced in MEGA as an attention activation. See
    https://arxiv.org/abs/2209.10655

    Inspired by squared relu, but with bounded range and gradient for better stability
    """

    def forward(self, input: Tensor, mu: float = 0.707107, sigma: float = 0.282095) -> Tensor:
        """
        Evaluate 0.5 * (1 + erf((input - mu) / (sigma * sqrt(2)))) elementwise.

        Args:
            input: tensor to transform.
            mu: value subtracted from `input` before scaling.
            sigma: scale; the shifted input is divided by `sigma * sqrt(2)`.
        """
        input = (input - mu).div(sigma * math.sqrt(2.0))
        return 0.5 * (1.0 + torch.erf(input))
193
+
194
+
195
class ReLUSquaredActivation(nn.Module):
    """
    Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2
    """

    def forward(self, input: Tensor) -> Tensor:
        """Apply ReLU to `input`, then square the result elementwise."""
        relu_applied = nn.functional.relu(input)
        squared = torch.square(relu_applied)
        return squared
204
+
205
+
206
class ClassInstantier(OrderedDict):
    """
    Ordered mapping whose item lookup builds a new object on every access.

    Each stored value must be a zero-argument callable. On lookup the callable is invoked; its result is either a
    class, or a `(class, kwargs)` tuple. The class is then instantiated (with `kwargs` when given) and the new
    instance is returned.
    """

    def __getitem__(self, key):
        # The stored callable is invoked here, at lookup time, so the class it returns is resolved per access.
        content = super().__getitem__(key)()
        # A bare class means "instantiate with no keyword arguments".
        cls, kwargs = content if isinstance(content, tuple) else (content, {})
        return cls(**kwargs)
211
+
212
+
213
# Maps an activation name to a zero-argument callable returning either the activation class or a
# `(class, kwargs)` tuple. Because each entry is a lambda, the class name is looked up when the lambda is
# called (i.e. on every `ACT2FN[...]` access), not when this dict is built.
ACT2CLS = {
    "gelu": lambda: GELUActivation,
    "gelu_10": lambda: (ClippedGELUActivation, {"min": -10, "max": 10}),
    "gelu_fast": lambda: FastGELUActivation,
    "gelu_new": lambda: NewGELUActivation,
    "gelu_python": lambda: (GELUActivation, {"use_gelu_python": True}),
    "gelu_pytorch_tanh": lambda: PytorchGELUTanh,
    "gelu_accurate": lambda: AccurateGELUActivation,
    "laplace": lambda: LaplaceActivation,
    "linear": lambda: LinearActivation,
    "mish": lambda: MishActivation,
    "quick_gelu": lambda: QuickGELUActivation,
    "relu": lambda: nn.ReLU,
    "relu2": lambda: ReLUSquaredActivation,
    "relu6": lambda: nn.ReLU6,
    "sigmoid": lambda: nn.Sigmoid,
    "silu": lambda: SiLUActivation,
    "swish": lambda: SiLUActivation,
    "tanh": lambda: nn.Tanh,
}
# Indexing ACT2FN returns a freshly constructed activation module for the given name.
ACT2FN = ClassInstantier(ACT2CLS)
234
+
235
+
236
def get_activation(activation_string):
    """
    Return a newly constructed activation module for `activation_string`.

    Args:
        activation_string: a key of the `ACT2FN` mapping (e.g. "gelu").

    Raises:
        KeyError: if `activation_string` is not a key of `ACT2FN`; the message lists the available keys.
    """
    if activation_string in ACT2FN:
        return ACT2FN[activation_string]
    else:
        raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}")
241
+
242
+
243
# For backwards compatibility with: from activations import gelu_python
# Each name below is bound to one activation instance built when this module is imported.
gelu_python = get_activation("gelu_python")
gelu_new = get_activation("gelu_new")
gelu = get_activation("gelu")
gelu_fast = get_activation("gelu_fast")
quick_gelu = get_activation("quick_gelu")
silu = get_activation("silu")
mish = get_activation("mish")
linear_act = get_activation("linear")