# bert-base-squadv1-block-pruning-hybrid / XP_layer_wise_sparsity_global_rate_15.41
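Each row below describes one parameter tensor of the pruned model: `layer_id` is the module path, `layer_type` the module class, `param_type` the parameter name, followed by the tensor shape, total element count (`nparam`), non-zero element count (`nnz`), and sparsity. The reported sparsity is consistent with `sparsity = 1 - nnz / nparam`; for example, the layer-0 query weight gives `1 - 135168 / 245760 = 0.45`. Values such as `5.96046e-08` on rows where `nnz` equals `nparam` are floating-point rounding noise for fully dense tensors. Note also that the feed-forward `intermediate.dense` / `output.dense` layers report sparsity 0 yet have far fewer rows than the original 3072: their pruning appears to be folded directly into the tensor shapes, while the attention projections retain explicit zeros on top of their reduced head counts. A sketch of how such a census can be regenerated follows the table.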
|     | layer_id | layer_type | param_type | shape | nparam | nnz | sparsity |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | bert.embeddings.word_embeddings | Embedding | weight | [30522, 768] | 23440896 | 23440896 | 0 |
| 1 | bert.embeddings.position_embeddings | Embedding | weight | [512, 768] | 393216 | 393216 | 0 |
| 2 | bert.embeddings.token_type_embeddings | Embedding | weight | [2, 768] | 1536 | 1536 | 0 |
| 3 | bert.embeddings.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 4 | bert.embeddings.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 5 | bert.encoder.layer.0.attention.self.query | Linear | weight | [320, 768] | 245760 | 135168 | 0.45 |
| 6 | bert.encoder.layer.0.attention.self.query | Linear | bias | [320] | 320 | 256 | 0.2 |
| 7 | bert.encoder.layer.0.attention.self.key | Linear | weight | [320, 768] | 245760 | 149504 | 0.391667 |
| 8 | bert.encoder.layer.0.attention.self.key | Linear | bias | [320] | 320 | 256 | 0.2 |
| 9 | bert.encoder.layer.0.attention.self.value | Linear | weight | [320, 768] | 245760 | 173056 | 0.295833 |
| 10 | bert.encoder.layer.0.attention.self.value | Linear | bias | [320] | 320 | 256 | 0.2 |
| 11 | bert.encoder.layer.0.attention.output.dense | Linear | weight | [768, 320] | 245760 | 181248 | 0.2625 |
| 12 | bert.encoder.layer.0.attention.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
| 13 | bert.encoder.layer.0.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 14 | bert.encoder.layer.0.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 15 | bert.encoder.layer.0.intermediate.dense | Linear | weight | [185, 768] | 142080 | 142080 | 0 |
| 16 | bert.encoder.layer.0.intermediate.dense | Linear | bias | [185] | 185 | 185 | 0 |
| 17 | bert.encoder.layer.0.output.dense | Linear | weight | [768, 185] | 142080 | 142080 | 0 |
| 18 | bert.encoder.layer.0.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
| 19 | bert.encoder.layer.0.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 20 | bert.encoder.layer.0.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 21 | bert.encoder.layer.1.attention.self.query | Linear | weight | [320, 768] | 245760 | 175104 | 0.2875 |
| 22 | bert.encoder.layer.1.attention.self.query | Linear | bias | [320] | 320 | 288 | 0.1 |
| 23 | bert.encoder.layer.1.attention.self.key | Linear | weight | [320, 768] | 245760 | 177152 | 0.279167 |
| 24 | bert.encoder.layer.1.attention.self.key | Linear | bias | [320] | 320 | 288 | 0.1 |
| 25 | bert.encoder.layer.1.attention.self.value | Linear | weight | [320, 768] | 245760 | 166912 | 0.320833 |
| 26 | bert.encoder.layer.1.attention.self.value | Linear | bias | [320] | 320 | 288 | 0.1 |
| 27 | bert.encoder.layer.1.attention.output.dense | Linear | weight | [768, 320] | 245760 | 167936 | 0.316667 |
| 28 | bert.encoder.layer.1.attention.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
| 29 | bert.encoder.layer.1.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 30 | bert.encoder.layer.1.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 31 | bert.encoder.layer.1.intermediate.dense | Linear | weight | [315, 768] | 241920 | 241920 | 0 |
| 32 | bert.encoder.layer.1.intermediate.dense | Linear | bias | [315] | 315 | 315 | 0 |
| 33 | bert.encoder.layer.1.output.dense | Linear | weight | [768, 315] | 241920 | 241920 | 0 |
| 34 | bert.encoder.layer.1.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
| 35 | bert.encoder.layer.1.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 36 | bert.encoder.layer.1.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 37 | bert.encoder.layer.2.attention.self.query | Linear | weight | [576, 768] | 442368 | 285696 | 0.354167 |
| 38 | bert.encoder.layer.2.attention.self.query | Linear | bias | [576] | 576 | 480 | 0.166667 |
| 39 | bert.encoder.layer.2.attention.self.key | Linear | weight | [576, 768] | 442368 | 297984 | 0.326389 |
| 40 | bert.encoder.layer.2.attention.self.key | Linear | bias | [576] | 576 | 480 | 0.166667 |
| 41 | bert.encoder.layer.2.attention.self.value | Linear | weight | [576, 768] | 442368 | 226304 | 0.488426 |
| 42 | bert.encoder.layer.2.attention.self.value | Linear | bias | [576] | 576 | 384 | 0.333333 |
| 43 | bert.encoder.layer.2.attention.output.dense | Linear | weight | [768, 576] | 442368 | 237568 | 0.462963 |
| 44 | bert.encoder.layer.2.attention.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
| 45 | bert.encoder.layer.2.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 46 | bert.encoder.layer.2.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 47 | bert.encoder.layer.2.intermediate.dense | Linear | weight | [339, 768] | 260352 | 260352 | 0 |
| 48 | bert.encoder.layer.2.intermediate.dense | Linear | bias | [339] | 339 | 339 | 0 |
| 49 | bert.encoder.layer.2.output.dense | Linear | weight | [768, 339] | 260352 | 260352 | 0 |
| 50 | bert.encoder.layer.2.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
| 51 | bert.encoder.layer.2.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 52 | bert.encoder.layer.2.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 53 | bert.encoder.layer.3.attention.self.query | Linear | weight | [576, 768] | 442368 | 277504 | 0.372685 |
| 54 | bert.encoder.layer.3.attention.self.query | Linear | bias | [576] | 576 | 512 | 0.111111 |
| 55 | bert.encoder.layer.3.attention.self.key | Linear | weight | [576, 768] | 442368 | 303104 | 0.314815 |
| 56 | bert.encoder.layer.3.attention.self.key | Linear | bias | [576] | 576 | 512 | 0.111111 |
| 57 | bert.encoder.layer.3.attention.self.value | Linear | weight | [576, 768] | 442368 | 297984 | 0.326389 |
| 58 | bert.encoder.layer.3.attention.self.value | Linear | bias | [576] | 576 | 512 | 0.111111 |
| 59 | bert.encoder.layer.3.attention.output.dense | Linear | weight | [768, 576] | 442368 | 308224 | 0.303241 |
| 60 | bert.encoder.layer.3.attention.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
| 61 | bert.encoder.layer.3.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 62 | bert.encoder.layer.3.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 63 | bert.encoder.layer.3.intermediate.dense | Linear | weight | [368, 768] | 282624 | 282624 | 0 |
| 64 | bert.encoder.layer.3.intermediate.dense | Linear | bias | [368] | 368 | 368 | 0 |
| 65 | bert.encoder.layer.3.output.dense | Linear | weight | [768, 368] | 282624 | 282624 | 0 |
| 66 | bert.encoder.layer.3.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
| 67 | bert.encoder.layer.3.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 68 | bert.encoder.layer.3.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 69 | bert.encoder.layer.4.attention.self.query | Linear | weight | [576, 768] | 442368 | 291840 | 0.340278 |
| 70 | bert.encoder.layer.4.attention.self.query | Linear | bias | [576] | 576 | 544 | 0.0555555 |
| 71 | bert.encoder.layer.4.attention.self.key | Linear | weight | [576, 768] | 442368 | 310272 | 0.298611 |
| 72 | bert.encoder.layer.4.attention.self.key | Linear | bias | [576] | 576 | 544 | 0.0555555 |
| 73 | bert.encoder.layer.4.attention.self.value | Linear | weight | [576, 768] | 442368 | 272384 | 0.384259 |
| 74 | bert.encoder.layer.4.attention.self.value | Linear | bias | [576] | 576 | 480 | 0.166667 |
| 75 | bert.encoder.layer.4.attention.output.dense | Linear | weight | [768, 576] | 442368 | 263168 | 0.405093 |
| 76 | bert.encoder.layer.4.attention.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
| 77 | bert.encoder.layer.4.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 78 | bert.encoder.layer.4.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 79 | bert.encoder.layer.4.intermediate.dense | Linear | weight | [386, 768] | 296448 | 296448 | 0 |
| 80 | bert.encoder.layer.4.intermediate.dense | Linear | bias | [386] | 386 | 386 | 0 |
| 81 | bert.encoder.layer.4.output.dense | Linear | weight | [768, 386] | 296448 | 296448 | 0 |
| 82 | bert.encoder.layer.4.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
| 83 | bert.encoder.layer.4.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 84 | bert.encoder.layer.4.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 85 | bert.encoder.layer.5.attention.self.query | Linear | weight | [384, 768] | 294912 | 171008 | 0.420139 |
| 86 | bert.encoder.layer.5.attention.self.query | Linear | bias | [384] | 384 | 352 | 0.0833333 |
| 87 | bert.encoder.layer.5.attention.self.key | Linear | weight | [384, 768] | 294912 | 205824 | 0.302083 |
| 88 | bert.encoder.layer.5.attention.self.key | Linear | bias | [384] | 384 | 352 | 0.0833333 |
| 89 | bert.encoder.layer.5.attention.self.value | Linear | weight | [384, 768] | 294912 | 217088 | 0.263889 |
| 90 | bert.encoder.layer.5.attention.self.value | Linear | bias | [384] | 384 | 384 | 0 |
| 91 | bert.encoder.layer.5.attention.output.dense | Linear | weight | [768, 384] | 294912 | 223232 | 0.243056 |
| 92 | bert.encoder.layer.5.attention.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
| 93 | bert.encoder.layer.5.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 94 | bert.encoder.layer.5.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 95 | bert.encoder.layer.5.intermediate.dense | Linear | weight | [336, 768] | 258048 | 258048 | 0 |
| 96 | bert.encoder.layer.5.intermediate.dense | Linear | bias | [336] | 336 | 336 | 0 |
| 97 | bert.encoder.layer.5.output.dense | Linear | weight | [768, 336] | 258048 | 258048 | 0 |
| 98 | bert.encoder.layer.5.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
| 99 | bert.encoder.layer.5.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 100 | bert.encoder.layer.5.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 101 | bert.encoder.layer.6.attention.self.query | Linear | weight | [448, 768] | 344064 | 192512 | 0.440476 |
| 102 | bert.encoder.layer.6.attention.self.query | Linear | bias | [448] | 448 | 416 | 0.0714285 |
| 103 | bert.encoder.layer.6.attention.self.key | Linear | weight | [448, 768] | 344064 | 224256 | 0.348214 |
| 104 | bert.encoder.layer.6.attention.self.key | Linear | bias | [448] | 448 | 416 | 0.0714285 |
| 105 | bert.encoder.layer.6.attention.self.value | Linear | weight | [448, 768] | 344064 | 209920 | 0.389881 |
| 106 | bert.encoder.layer.6.attention.self.value | Linear | bias | [448] | 448 | 352 | 0.214286 |
| 107 | bert.encoder.layer.6.attention.output.dense | Linear | weight | [768, 448] | 344064 | 199680 | 0.419643 |
| 108 | bert.encoder.layer.6.attention.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
| 109 | bert.encoder.layer.6.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 110 | bert.encoder.layer.6.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 111 | bert.encoder.layer.6.intermediate.dense | Linear | weight | [280, 768] | 215040 | 215040 | 0 |
| 112 | bert.encoder.layer.6.intermediate.dense | Linear | bias | [280] | 280 | 280 | 0 |
| 113 | bert.encoder.layer.6.output.dense | Linear | weight | [768, 280] | 215040 | 215040 | 0 |
| 114 | bert.encoder.layer.6.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
| 115 | bert.encoder.layer.6.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 116 | bert.encoder.layer.6.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 117 | bert.encoder.layer.7.attention.self.query | Linear | weight | [448, 768] | 344064 | 201728 | 0.41369 |
| 118 | bert.encoder.layer.7.attention.self.query | Linear | bias | [448] | 448 | 416 | 0.0714285 |
| 119 | bert.encoder.layer.7.attention.self.key | Linear | weight | [448, 768] | 344064 | 237568 | 0.309524 |
| 120 | bert.encoder.layer.7.attention.self.key | Linear | bias | [448] | 448 | 416 | 0.0714285 |
| 121 | bert.encoder.layer.7.attention.self.value | Linear | weight | [448, 768] | 344064 | 218112 | 0.366071 |
| 122 | bert.encoder.layer.7.attention.self.value | Linear | bias | [448] | 448 | 352 | 0.214286 |
| 123 | bert.encoder.layer.7.attention.output.dense | Linear | weight | [768, 448] | 344064 | 202752 | 0.410714 |
| 124 | bert.encoder.layer.7.attention.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
| 125 | bert.encoder.layer.7.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 126 | bert.encoder.layer.7.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 127 | bert.encoder.layer.7.intermediate.dense | Linear | weight | [211, 768] | 162048 | 162048 | 0 |
| 128 | bert.encoder.layer.7.intermediate.dense | Linear | bias | [211] | 211 | 211 | 0 |
| 129 | bert.encoder.layer.7.output.dense | Linear | weight | [768, 211] | 162048 | 162048 | 0 |
| 130 | bert.encoder.layer.7.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
| 131 | bert.encoder.layer.7.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 132 | bert.encoder.layer.7.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 133 | bert.encoder.layer.8.attention.self.query | Linear | weight | [448, 768] | 344064 | 186368 | 0.458333 |
| 134 | bert.encoder.layer.8.attention.self.query | Linear | bias | [448] | 448 | 416 | 0.0714285 |
| 135 | bert.encoder.layer.8.attention.self.key | Linear | weight | [448, 768] | 344064 | 197632 | 0.425595 |
| 136 | bert.encoder.layer.8.attention.self.key | Linear | bias | [448] | 448 | 416 | 0.0714285 |
| 137 | bert.encoder.layer.8.attention.self.value | Linear | weight | [448, 768] | 344064 | 154624 | 0.550595 |
| 138 | bert.encoder.layer.8.attention.self.value | Linear | bias | [448] | 448 | 288 | 0.357143 |
| 139 | bert.encoder.layer.8.attention.output.dense | Linear | weight | [768, 448] | 344064 | 148480 | 0.568452 |
| 140 | bert.encoder.layer.8.attention.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
| 141 | bert.encoder.layer.8.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 142 | bert.encoder.layer.8.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 143 | bert.encoder.layer.8.intermediate.dense | Linear | weight | [108, 768] | 82944 | 82944 | 0 |
| 144 | bert.encoder.layer.8.intermediate.dense | Linear | bias | [108] | 108 | 108 | 0 |
| 145 | bert.encoder.layer.8.output.dense | Linear | weight | [768, 108] | 82944 | 82944 | 0 |
| 146 | bert.encoder.layer.8.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
| 147 | bert.encoder.layer.8.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 148 | bert.encoder.layer.8.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 149 | bert.encoder.layer.9.attention.self.query | Linear | weight | [320, 768] | 245760 | 144384 | 0.4125 |
| 150 | bert.encoder.layer.9.attention.self.query | Linear | bias | [320] | 320 | 288 | 0.1 |
| 151 | bert.encoder.layer.9.attention.self.key | Linear | weight | [320, 768] | 245760 | 155648 | 0.366667 |
| 152 | bert.encoder.layer.9.attention.self.key | Linear | bias | [320] | 320 | 288 | 0.1 |
| 153 | bert.encoder.layer.9.attention.self.value | Linear | weight | [320, 768] | 245760 | 63488 | 0.741667 |
| 154 | bert.encoder.layer.9.attention.self.value | Linear | bias | [320] | 320 | 160 | 0.5 |
| 155 | bert.encoder.layer.9.attention.output.dense | Linear | weight | [768, 320] | 245760 | 65536 | 0.733333 |
| 156 | bert.encoder.layer.9.attention.output.dense | Linear | bias | [768] | 768 | 704 | 0.0833333 |
| 157 | bert.encoder.layer.9.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 158 | bert.encoder.layer.9.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 159 | bert.encoder.layer.9.intermediate.dense | Linear | weight | [53, 768] | 40704 | 40704 | 5.96046e-08 |
| 160 | bert.encoder.layer.9.intermediate.dense | Linear | bias | [53] | 53 | 53 | 0 |
| 161 | bert.encoder.layer.9.output.dense | Linear | weight | [768, 53] | 40704 | 40704 | 5.96046e-08 |
| 162 | bert.encoder.layer.9.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
| 163 | bert.encoder.layer.9.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 164 | bert.encoder.layer.9.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 165 | bert.encoder.layer.10.attention.self.query | Linear | weight | [384, 768] | 294912 | 158720 | 0.461806 |
| 166 | bert.encoder.layer.10.attention.self.query | Linear | bias | [384] | 384 | 320 | 0.166667 |
| 167 | bert.encoder.layer.10.attention.self.key | Linear | weight | [384, 768] | 294912 | 158720 | 0.461806 |
| 168 | bert.encoder.layer.10.attention.self.key | Linear | bias | [384] | 384 | 320 | 0.166667 |
| 169 | bert.encoder.layer.10.attention.self.value | Linear | weight | [384, 768] | 294912 | 77824 | 0.736111 |
| 170 | bert.encoder.layer.10.attention.self.value | Linear | bias | [384] | 384 | 192 | 0.5 |
| 171 | bert.encoder.layer.10.attention.output.dense | Linear | weight | [768, 384] | 294912 | 78848 | 0.732639 |
| 172 | bert.encoder.layer.10.attention.output.dense | Linear | bias | [768] | 768 | 736 | 0.0416666 |
| 173 | bert.encoder.layer.10.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 174 | bert.encoder.layer.10.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 175 | bert.encoder.layer.10.intermediate.dense | Linear | weight | [86, 768] | 66048 | 66048 | 0 |
| 176 | bert.encoder.layer.10.intermediate.dense | Linear | bias | [86] | 86 | 86 | 0 |
| 177 | bert.encoder.layer.10.output.dense | Linear | weight | [768, 86] | 66048 | 66048 | 0 |
| 178 | bert.encoder.layer.10.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
| 179 | bert.encoder.layer.10.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 180 | bert.encoder.layer.10.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 181 | bert.encoder.layer.11.attention.self.query | Linear | weight | [384, 768] | 294912 | 107520 | 0.635417 |
| 182 | bert.encoder.layer.11.attention.self.query | Linear | bias | [384] | 384 | 256 | 0.333333 |
| 183 | bert.encoder.layer.11.attention.self.key | Linear | weight | [384, 768] | 294912 | 118784 | 0.597222 |
| 184 | bert.encoder.layer.11.attention.self.key | Linear | bias | [384] | 384 | 256 | 0.333333 |
| 185 | bert.encoder.layer.11.attention.self.value | Linear | weight | [384, 768] | 294912 | 62464 | 0.788194 |
| 186 | bert.encoder.layer.11.attention.self.value | Linear | bias | [384] | 384 | 192 | 0.5 |
| 187 | bert.encoder.layer.11.attention.output.dense | Linear | weight | [768, 384] | 294912 | 54272 | 0.815972 |
| 188 | bert.encoder.layer.11.attention.output.dense | Linear | bias | [768] | 768 | 672 | 0.125 |
| 189 | bert.encoder.layer.11.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 190 | bert.encoder.layer.11.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 191 | bert.encoder.layer.11.intermediate.dense | Linear | weight | [105, 768] | 80640 | 80640 | 0 |
| 192 | bert.encoder.layer.11.intermediate.dense | Linear | bias | [105] | 105 | 105 | 0 |
| 193 | bert.encoder.layer.11.output.dense | Linear | weight | [768, 105] | 80640 | 80640 | 0 |
| 194 | bert.encoder.layer.11.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
| 195 | bert.encoder.layer.11.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
| 196 | bert.encoder.layer.11.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
| 197 | qa_outputs | Linear | weight | [2, 768] | 1536 | 1536 | 0 |
| 198 | qa_outputs | Linear | bias | [2] | 2 | 2 | 0 |
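
Below is a minimal sketch, not the author's original script, of how a per-parameter sparsity census like the table above can be produced with PyTorch and pandas. It assumes the pruned checkpoint loads with stock `transformers`; the checkpoint id is a guess based on this repo's name.

```python
import pandas as pd
import torch
from transformers import AutoModelForQuestionAnswering

# Assumed checkpoint id, inferred from the repo name.
model = AutoModelForQuestionAnswering.from_pretrained(
    "vuiseng9/bert-base-squadv1-block-pruning-hybrid"
)

rows = []
for name, module in model.named_modules():
    # recurse=False attributes each parameter to its owning module
    # exactly once, mirroring the layer_id/param_type split above.
    for param_name, param in module.named_parameters(recurse=False):
        nparam = param.numel()
        nnz = int(torch.count_nonzero(param))
        rows.append({
            "layer_id": name,
            "layer_type": type(module).__name__,
            "param_type": param_name,
            "shape": list(param.shape),
            "nparam": nparam,
            "nnz": nnz,
            "sparsity": 1.0 - nnz / nparam,  # fraction of zeroed entries
        })

df = pd.DataFrame(rows)
print(df.to_markdown())  # requires the tabulate package
```

Computed in float32 rather than exact integer arithmetic, `1.0 - nnz / nparam` can leave tiny residues like the `5.96046e-08` entries above for fully dense tensors.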