File size: 17,404 Bytes
ac8897e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
,layer_id,layer_type,param_type,shape,nparam,nnz,sparsity
0,bert.embeddings.word_embeddings,Embedding,weight,"[30522, 768]",23440896,23440896,0.0
1,bert.embeddings.position_embeddings,Embedding,weight,"[512, 768]",393216,393216,0.0
2,bert.embeddings.token_type_embeddings,Embedding,weight,"[2, 768]",1536,1536,0.0
3,bert.embeddings.LayerNorm,LayerNorm,weight,[768],768,768,0.0
4,bert.embeddings.LayerNorm,LayerNorm,bias,[768],768,768,0.0
5,bert.encoder.layer.0.attention.self.query,Linear,weight,"[320, 768]",245760,135168,0.44999998807907104
6,bert.encoder.layer.0.attention.self.query,Linear,bias,[320],320,256,0.19999998807907104
7,bert.encoder.layer.0.attention.self.key,Linear,weight,"[320, 768]",245760,149504,0.3916666507720947
8,bert.encoder.layer.0.attention.self.key,Linear,bias,[320],320,256,0.19999998807907104
9,bert.encoder.layer.0.attention.self.value,Linear,weight,"[320, 768]",245760,173056,0.2958332896232605
10,bert.encoder.layer.0.attention.self.value,Linear,bias,[320],320,256,0.19999998807907104
11,bert.encoder.layer.0.attention.output.dense,Linear,weight,"[768, 320]",245760,181248,0.26249998807907104
12,bert.encoder.layer.0.attention.output.dense,Linear,bias,[768],768,768,0.0
13,bert.encoder.layer.0.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
14,bert.encoder.layer.0.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
15,bert.encoder.layer.0.intermediate.dense,Linear,weight,"[185, 768]",142080,142080,0.0
16,bert.encoder.layer.0.intermediate.dense,Linear,bias,[185],185,185,0.0
17,bert.encoder.layer.0.output.dense,Linear,weight,"[768, 185]",142080,142080,0.0
18,bert.encoder.layer.0.output.dense,Linear,bias,[768],768,768,0.0
19,bert.encoder.layer.0.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
20,bert.encoder.layer.0.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
21,bert.encoder.layer.1.attention.self.query,Linear,weight,"[320, 768]",245760,175104,0.28749996423721313
22,bert.encoder.layer.1.attention.self.query,Linear,bias,[320],320,288,0.09999996423721313
23,bert.encoder.layer.1.attention.self.key,Linear,weight,"[320, 768]",245760,177152,0.27916663885116577
24,bert.encoder.layer.1.attention.self.key,Linear,bias,[320],320,288,0.09999996423721313
25,bert.encoder.layer.1.attention.self.value,Linear,weight,"[320, 768]",245760,166912,0.32083332538604736
26,bert.encoder.layer.1.attention.self.value,Linear,bias,[320],320,288,0.09999996423721313
27,bert.encoder.layer.1.attention.output.dense,Linear,weight,"[768, 320]",245760,167936,0.3166666030883789
28,bert.encoder.layer.1.attention.output.dense,Linear,bias,[768],768,768,0.0
29,bert.encoder.layer.1.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
30,bert.encoder.layer.1.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
31,bert.encoder.layer.1.intermediate.dense,Linear,weight,"[315, 768]",241920,241920,0.0
32,bert.encoder.layer.1.intermediate.dense,Linear,bias,[315],315,315,0.0
33,bert.encoder.layer.1.output.dense,Linear,weight,"[768, 315]",241920,241920,0.0
34,bert.encoder.layer.1.output.dense,Linear,bias,[768],768,768,0.0
35,bert.encoder.layer.1.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
36,bert.encoder.layer.1.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
37,bert.encoder.layer.2.attention.self.query,Linear,weight,"[576, 768]",442368,285696,0.3541666865348816
38,bert.encoder.layer.2.attention.self.query,Linear,bias,[576],576,480,0.1666666865348816
39,bert.encoder.layer.2.attention.self.key,Linear,weight,"[576, 768]",442368,297984,0.3263888955116272
40,bert.encoder.layer.2.attention.self.key,Linear,bias,[576],576,480,0.1666666865348816
41,bert.encoder.layer.2.attention.self.value,Linear,weight,"[576, 768]",442368,226304,0.4884259104728699
42,bert.encoder.layer.2.attention.self.value,Linear,bias,[576],576,384,0.3333333134651184
43,bert.encoder.layer.2.attention.output.dense,Linear,weight,"[768, 576]",442368,237568,0.4629629850387573
44,bert.encoder.layer.2.attention.output.dense,Linear,bias,[768],768,768,0.0
45,bert.encoder.layer.2.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
46,bert.encoder.layer.2.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
47,bert.encoder.layer.2.intermediate.dense,Linear,weight,"[339, 768]",260352,260352,0.0
48,bert.encoder.layer.2.intermediate.dense,Linear,bias,[339],339,339,0.0
49,bert.encoder.layer.2.output.dense,Linear,weight,"[768, 339]",260352,260352,0.0
50,bert.encoder.layer.2.output.dense,Linear,bias,[768],768,768,0.0
51,bert.encoder.layer.2.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
52,bert.encoder.layer.2.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
53,bert.encoder.layer.3.attention.self.query,Linear,weight,"[576, 768]",442368,277504,0.37268519401550293
54,bert.encoder.layer.3.attention.self.query,Linear,bias,[576],576,512,0.1111111044883728
55,bert.encoder.layer.3.attention.self.key,Linear,weight,"[576, 768]",442368,303104,0.31481480598449707
56,bert.encoder.layer.3.attention.self.key,Linear,bias,[576],576,512,0.1111111044883728
57,bert.encoder.layer.3.attention.self.value,Linear,weight,"[576, 768]",442368,297984,0.3263888955116272
58,bert.encoder.layer.3.attention.self.value,Linear,bias,[576],576,512,0.1111111044883728
59,bert.encoder.layer.3.attention.output.dense,Linear,weight,"[768, 576]",442368,308224,0.30324071645736694
60,bert.encoder.layer.3.attention.output.dense,Linear,bias,[768],768,768,0.0
61,bert.encoder.layer.3.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
62,bert.encoder.layer.3.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
63,bert.encoder.layer.3.intermediate.dense,Linear,weight,"[368, 768]",282624,282624,0.0
64,bert.encoder.layer.3.intermediate.dense,Linear,bias,[368],368,368,0.0
65,bert.encoder.layer.3.output.dense,Linear,weight,"[768, 368]",282624,282624,0.0
66,bert.encoder.layer.3.output.dense,Linear,bias,[768],768,768,0.0
67,bert.encoder.layer.3.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
68,bert.encoder.layer.3.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
69,bert.encoder.layer.4.attention.self.query,Linear,weight,"[576, 768]",442368,291840,0.3402777910232544
70,bert.encoder.layer.4.attention.self.query,Linear,bias,[576],576,544,0.055555522441864014
71,bert.encoder.layer.4.attention.self.key,Linear,weight,"[576, 768]",442368,310272,0.2986111044883728
72,bert.encoder.layer.4.attention.self.key,Linear,bias,[576],576,544,0.055555522441864014
73,bert.encoder.layer.4.attention.self.value,Linear,weight,"[576, 768]",442368,272384,0.38425928354263306
74,bert.encoder.layer.4.attention.self.value,Linear,bias,[576],576,480,0.1666666865348816
75,bert.encoder.layer.4.attention.output.dense,Linear,weight,"[768, 576]",442368,263168,0.40509259700775146
76,bert.encoder.layer.4.attention.output.dense,Linear,bias,[768],768,768,0.0
77,bert.encoder.layer.4.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
78,bert.encoder.layer.4.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
79,bert.encoder.layer.4.intermediate.dense,Linear,weight,"[386, 768]",296448,296448,0.0
80,bert.encoder.layer.4.intermediate.dense,Linear,bias,[386],386,386,0.0
81,bert.encoder.layer.4.output.dense,Linear,weight,"[768, 386]",296448,296448,0.0
82,bert.encoder.layer.4.output.dense,Linear,bias,[768],768,768,0.0
83,bert.encoder.layer.4.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
84,bert.encoder.layer.4.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
85,bert.encoder.layer.5.attention.self.query,Linear,weight,"[384, 768]",294912,171008,0.4201388955116272
86,bert.encoder.layer.5.attention.self.query,Linear,bias,[384],384,352,0.08333331346511841
87,bert.encoder.layer.5.attention.self.key,Linear,weight,"[384, 768]",294912,205824,0.3020833134651184
88,bert.encoder.layer.5.attention.self.key,Linear,bias,[384],384,352,0.08333331346511841
89,bert.encoder.layer.5.attention.self.value,Linear,weight,"[384, 768]",294912,217088,0.2638888955116272
90,bert.encoder.layer.5.attention.self.value,Linear,bias,[384],384,384,0.0
91,bert.encoder.layer.5.attention.output.dense,Linear,weight,"[768, 384]",294912,223232,0.243055522441864
92,bert.encoder.layer.5.attention.output.dense,Linear,bias,[768],768,768,0.0
93,bert.encoder.layer.5.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
94,bert.encoder.layer.5.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
95,bert.encoder.layer.5.intermediate.dense,Linear,weight,"[336, 768]",258048,258048,0.0
96,bert.encoder.layer.5.intermediate.dense,Linear,bias,[336],336,336,0.0
97,bert.encoder.layer.5.output.dense,Linear,weight,"[768, 336]",258048,258048,0.0
98,bert.encoder.layer.5.output.dense,Linear,bias,[768],768,768,0.0
99,bert.encoder.layer.5.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
100,bert.encoder.layer.5.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
101,bert.encoder.layer.6.attention.self.query,Linear,weight,"[448, 768]",344064,192512,0.4404761791229248
102,bert.encoder.layer.6.attention.self.query,Linear,bias,[448],448,416,0.07142853736877441
103,bert.encoder.layer.6.attention.self.key,Linear,weight,"[448, 768]",344064,224256,0.3482142686843872
104,bert.encoder.layer.6.attention.self.key,Linear,bias,[448],448,416,0.07142853736877441
105,bert.encoder.layer.6.attention.self.value,Linear,weight,"[448, 768]",344064,209920,0.3898809552192688
106,bert.encoder.layer.6.attention.self.value,Linear,bias,[448],448,352,0.21428567171096802
107,bert.encoder.layer.6.attention.output.dense,Linear,weight,"[768, 448]",344064,199680,0.4196428656578064
108,bert.encoder.layer.6.attention.output.dense,Linear,bias,[768],768,768,0.0
109,bert.encoder.layer.6.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
110,bert.encoder.layer.6.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
111,bert.encoder.layer.6.intermediate.dense,Linear,weight,"[280, 768]",215040,215040,0.0
112,bert.encoder.layer.6.intermediate.dense,Linear,bias,[280],280,280,0.0
113,bert.encoder.layer.6.output.dense,Linear,weight,"[768, 280]",215040,215040,0.0
114,bert.encoder.layer.6.output.dense,Linear,bias,[768],768,768,0.0
115,bert.encoder.layer.6.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
116,bert.encoder.layer.6.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
117,bert.encoder.layer.7.attention.self.query,Linear,weight,"[448, 768]",344064,201728,0.413690447807312
118,bert.encoder.layer.7.attention.self.query,Linear,bias,[448],448,416,0.07142853736877441
119,bert.encoder.layer.7.attention.self.key,Linear,weight,"[448, 768]",344064,237568,0.3095238208770752
120,bert.encoder.layer.7.attention.self.key,Linear,bias,[448],448,416,0.07142853736877441
121,bert.encoder.layer.7.attention.self.value,Linear,weight,"[448, 768]",344064,218112,0.3660714030265808
122,bert.encoder.layer.7.attention.self.value,Linear,bias,[448],448,352,0.21428567171096802
123,bert.encoder.layer.7.attention.output.dense,Linear,weight,"[768, 448]",344064,202752,0.4107142686843872
124,bert.encoder.layer.7.attention.output.dense,Linear,bias,[768],768,768,0.0
125,bert.encoder.layer.7.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
126,bert.encoder.layer.7.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
127,bert.encoder.layer.7.intermediate.dense,Linear,weight,"[211, 768]",162048,162048,0.0
128,bert.encoder.layer.7.intermediate.dense,Linear,bias,[211],211,211,0.0
129,bert.encoder.layer.7.output.dense,Linear,weight,"[768, 211]",162048,162048,0.0
130,bert.encoder.layer.7.output.dense,Linear,bias,[768],768,768,0.0
131,bert.encoder.layer.7.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
132,bert.encoder.layer.7.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
133,bert.encoder.layer.8.attention.self.query,Linear,weight,"[448, 768]",344064,186368,0.4583333134651184
134,bert.encoder.layer.8.attention.self.query,Linear,bias,[448],448,416,0.07142853736877441
135,bert.encoder.layer.8.attention.self.key,Linear,weight,"[448, 768]",344064,197632,0.425595223903656
136,bert.encoder.layer.8.attention.self.key,Linear,bias,[448],448,416,0.07142853736877441
137,bert.encoder.layer.8.attention.self.value,Linear,weight,"[448, 768]",344064,154624,0.550595223903656
138,bert.encoder.layer.8.attention.self.value,Linear,bias,[448],448,288,0.3571428060531616
139,bert.encoder.layer.8.attention.output.dense,Linear,weight,"[768, 448]",344064,148480,0.5684523582458496
140,bert.encoder.layer.8.attention.output.dense,Linear,bias,[768],768,768,0.0
141,bert.encoder.layer.8.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
142,bert.encoder.layer.8.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
143,bert.encoder.layer.8.intermediate.dense,Linear,weight,"[108, 768]",82944,82944,0.0
144,bert.encoder.layer.8.intermediate.dense,Linear,bias,[108],108,108,0.0
145,bert.encoder.layer.8.output.dense,Linear,weight,"[768, 108]",82944,82944,0.0
146,bert.encoder.layer.8.output.dense,Linear,bias,[768],768,768,0.0
147,bert.encoder.layer.8.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
148,bert.encoder.layer.8.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
149,bert.encoder.layer.9.attention.self.query,Linear,weight,"[320, 768]",245760,144384,0.41249996423721313
150,bert.encoder.layer.9.attention.self.query,Linear,bias,[320],320,288,0.09999996423721313
151,bert.encoder.layer.9.attention.self.key,Linear,weight,"[320, 768]",245760,155648,0.36666661500930786
152,bert.encoder.layer.9.attention.self.key,Linear,bias,[320],320,288,0.09999996423721313
153,bert.encoder.layer.9.attention.self.value,Linear,weight,"[320, 768]",245760,63488,0.7416666746139526
154,bert.encoder.layer.9.attention.self.value,Linear,bias,[320],320,160,0.5
155,bert.encoder.layer.9.attention.output.dense,Linear,weight,"[768, 320]",245760,65536,0.7333333492279053
156,bert.encoder.layer.9.attention.output.dense,Linear,bias,[768],768,704,0.08333331346511841
157,bert.encoder.layer.9.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
158,bert.encoder.layer.9.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
159,bert.encoder.layer.9.intermediate.dense,Linear,weight,"[53, 768]",40704,40704,5.960464477539063e-08
160,bert.encoder.layer.9.intermediate.dense,Linear,bias,[53],53,53,0.0
161,bert.encoder.layer.9.output.dense,Linear,weight,"[768, 53]",40704,40704,5.960464477539063e-08
162,bert.encoder.layer.9.output.dense,Linear,bias,[768],768,768,0.0
163,bert.encoder.layer.9.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
164,bert.encoder.layer.9.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
165,bert.encoder.layer.10.attention.self.query,Linear,weight,"[384, 768]",294912,158720,0.461805522441864
166,bert.encoder.layer.10.attention.self.query,Linear,bias,[384],384,320,0.16666662693023682
167,bert.encoder.layer.10.attention.self.key,Linear,weight,"[384, 768]",294912,158720,0.461805522441864
168,bert.encoder.layer.10.attention.self.key,Linear,bias,[384],384,320,0.16666662693023682
169,bert.encoder.layer.10.attention.self.value,Linear,weight,"[384, 768]",294912,77824,0.7361111044883728
170,bert.encoder.layer.10.attention.self.value,Linear,bias,[384],384,192,0.5
171,bert.encoder.layer.10.attention.output.dense,Linear,weight,"[768, 384]",294912,78848,0.7326388955116272
172,bert.encoder.layer.10.attention.output.dense,Linear,bias,[768],768,736,0.041666626930236816
173,bert.encoder.layer.10.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
174,bert.encoder.layer.10.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
175,bert.encoder.layer.10.intermediate.dense,Linear,weight,"[86, 768]",66048,66048,0.0
176,bert.encoder.layer.10.intermediate.dense,Linear,bias,[86],86,86,0.0
177,bert.encoder.layer.10.output.dense,Linear,weight,"[768, 86]",66048,66048,0.0
178,bert.encoder.layer.10.output.dense,Linear,bias,[768],768,768,0.0
179,bert.encoder.layer.10.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
180,bert.encoder.layer.10.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
181,bert.encoder.layer.11.attention.self.query,Linear,weight,"[384, 768]",294912,107520,0.6354166269302368
182,bert.encoder.layer.11.attention.self.query,Linear,bias,[384],384,256,0.3333333134651184
183,bert.encoder.layer.11.attention.self.key,Linear,weight,"[384, 768]",294912,118784,0.5972222089767456
184,bert.encoder.layer.11.attention.self.key,Linear,bias,[384],384,256,0.3333333134651184
185,bert.encoder.layer.11.attention.self.value,Linear,weight,"[384, 768]",294912,62464,0.7881944179534912
186,bert.encoder.layer.11.attention.self.value,Linear,bias,[384],384,192,0.5
187,bert.encoder.layer.11.attention.output.dense,Linear,weight,"[768, 384]",294912,54272,0.8159722089767456
188,bert.encoder.layer.11.attention.output.dense,Linear,bias,[768],768,672,0.125
189,bert.encoder.layer.11.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
190,bert.encoder.layer.11.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
191,bert.encoder.layer.11.intermediate.dense,Linear,weight,"[105, 768]",80640,80640,0.0
192,bert.encoder.layer.11.intermediate.dense,Linear,bias,[105],105,105,0.0
193,bert.encoder.layer.11.output.dense,Linear,weight,"[768, 105]",80640,80640,0.0
194,bert.encoder.layer.11.output.dense,Linear,bias,[768],768,768,0.0
195,bert.encoder.layer.11.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
196,bert.encoder.layer.11.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
197,qa_outputs,Linear,weight,"[2, 768]",1536,1536,0.0
198,qa_outputs,Linear,bias,[2],2,2,0.0