Vui Seng Chua
commited on
Commit
•
cd6bbb4
1
Parent(s):
3b67ed8
Add ckpt@35K
Browse files- .gitattributes +1 -0
- checkpoint-35000/NNCFNetwork.onnx +3 -0
- checkpoint-35000/config.json +25 -0
- checkpoint-35000/onnx_sparsity.csv +147 -0
- checkpoint-35000/onnx_sparsity.md +148 -0
- checkpoint-35000/optimizer.pt +3 -0
- checkpoint-35000/pytorch_model.bin +3 -0
- checkpoint-35000/rng_state.pth +3 -0
- checkpoint-35000/scheduler.pt +3 -0
- checkpoint-35000/special_tokens_map.json +1 -0
- checkpoint-35000/tokenizer.json +0 -0
- checkpoint-35000/tokenizer_config.json +1 -0
- checkpoint-35000/torch_mask_structures.csv +73 -0
- checkpoint-35000/torch_mask_structures.md +74 -0
- checkpoint-35000/trainer_state.json +3 -0
- checkpoint-35000/training_args.bin +3 -0
- checkpoint-35000/vocab.txt +0 -0
.gitattributes
CHANGED
@@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
25 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
26 |
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
27 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
25 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
26 |
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
27 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
trainer_state.json filter=lfs diff=lfs merge=lfs -text
|
checkpoint-35000/NNCFNetwork.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7cafb8d66bd60b7c0660a3dbcb033936b27c7cf47f4c7ba854405f8682d039fe
|
3 |
+
size 435667833
|
checkpoint-35000/config.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "bert-base-uncased",
|
3 |
+
"architectures": [
|
4 |
+
"NNCFNetwork"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"gradient_checkpointing": false,
|
8 |
+
"hidden_act": "gelu",
|
9 |
+
"hidden_dropout_prob": 0.1,
|
10 |
+
"hidden_size": 768,
|
11 |
+
"initializer_range": 0.02,
|
12 |
+
"intermediate_size": 3072,
|
13 |
+
"layer_norm_eps": 1e-12,
|
14 |
+
"max_position_embeddings": 512,
|
15 |
+
"model_type": "bert",
|
16 |
+
"num_attention_heads": 12,
|
17 |
+
"num_hidden_layers": 12,
|
18 |
+
"pad_token_id": 0,
|
19 |
+
"position_embedding_type": "absolute",
|
20 |
+
"torch_dtype": "float32",
|
21 |
+
"transformers_version": "4.9.1",
|
22 |
+
"type_vocab_size": 2,
|
23 |
+
"use_cache": true,
|
24 |
+
"vocab_size": 30522
|
25 |
+
}
|
checkpoint-35000/onnx_sparsity.csv
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
,layer_id,shape,nparam,nnz,sparsity
|
2 |
+
0,nncf_module.bert.encoder.layer.0.attention.self.query.bias,[768],768,192,0.75
|
3 |
+
1,nncf_module.bert.encoder.layer.0.attention.self.query.weight,"[768, 768]",589824,147456,0.75
|
4 |
+
2,nncf_module.bert.encoder.layer.0.attention.self.key.bias,[768],768,192,0.75
|
5 |
+
3,nncf_module.bert.encoder.layer.0.attention.self.key.weight,"[768, 768]",589824,147456,0.75
|
6 |
+
4,nncf_module.bert.encoder.layer.0.attention.self.value.bias,[768],768,192,0.75
|
7 |
+
5,nncf_module.bert.encoder.layer.0.attention.self.value.weight,"[768, 768]",589824,147456,0.75
|
8 |
+
6,nncf_module.bert.encoder.layer.0.attention.output.dense.bias,[768],768,768,0.0
|
9 |
+
7,nncf_module.bert.encoder.layer.0.attention.output.dense.weight,"[768, 768]",589824,147456,0.75
|
10 |
+
8,nncf_module.bert.encoder.layer.0.intermediate.dense.bias,[3072],3072,2940,0.04296875
|
11 |
+
9,nncf_module.bert.encoder.layer.0.intermediate.dense.weight,"[3072, 768]",2359296,2257920,0.04296875
|
12 |
+
10,nncf_module.bert.encoder.layer.0.output.dense.bias,[768],768,768,0.0
|
13 |
+
11,nncf_module.bert.encoder.layer.0.output.dense.weight,"[768, 3072]",2359296,2257920,0.04296875
|
14 |
+
12,nncf_module.bert.encoder.layer.1.attention.self.query.bias,[768],768,256,0.6666666666666667
|
15 |
+
13,nncf_module.bert.encoder.layer.1.attention.self.query.weight,"[768, 768]",589824,196608,0.6666666666666667
|
16 |
+
14,nncf_module.bert.encoder.layer.1.attention.self.key.bias,[768],768,256,0.6666666666666667
|
17 |
+
15,nncf_module.bert.encoder.layer.1.attention.self.key.weight,"[768, 768]",589824,196608,0.6666666666666667
|
18 |
+
16,nncf_module.bert.encoder.layer.1.attention.self.value.bias,[768],768,256,0.6666666666666667
|
19 |
+
17,nncf_module.bert.encoder.layer.1.attention.self.value.weight,"[768, 768]",589824,196608,0.6666666666666667
|
20 |
+
18,nncf_module.bert.encoder.layer.1.attention.output.dense.bias,[768],768,768,0.0
|
21 |
+
19,nncf_module.bert.encoder.layer.1.attention.output.dense.weight,"[768, 768]",589824,196608,0.6666666666666667
|
22 |
+
20,nncf_module.bert.encoder.layer.1.intermediate.dense.bias,[3072],3072,2923,0.04850260416666663
|
23 |
+
21,nncf_module.bert.encoder.layer.1.intermediate.dense.weight,"[3072, 768]",2359296,2244864,0.04850260416666663
|
24 |
+
22,nncf_module.bert.encoder.layer.1.output.dense.bias,[768],768,768,0.0
|
25 |
+
23,nncf_module.bert.encoder.layer.1.output.dense.weight,"[768, 3072]",2359296,2244864,0.04850260416666663
|
26 |
+
24,nncf_module.bert.encoder.layer.2.attention.self.query.bias,[768],768,768,0.0
|
27 |
+
25,nncf_module.bert.encoder.layer.2.attention.self.query.weight,"[768, 768]",589824,589824,0.0
|
28 |
+
26,nncf_module.bert.encoder.layer.2.attention.self.key.bias,[768],768,768,0.0
|
29 |
+
27,nncf_module.bert.encoder.layer.2.attention.self.key.weight,"[768, 768]",589824,589824,0.0
|
30 |
+
28,nncf_module.bert.encoder.layer.2.attention.self.value.bias,[768],768,768,0.0
|
31 |
+
29,nncf_module.bert.encoder.layer.2.attention.self.value.weight,"[768, 768]",589824,589824,0.0
|
32 |
+
30,nncf_module.bert.encoder.layer.2.attention.output.dense.bias,[768],768,768,0.0
|
33 |
+
31,nncf_module.bert.encoder.layer.2.attention.output.dense.weight,"[768, 768]",589824,589824,0.0
|
34 |
+
32,nncf_module.bert.encoder.layer.2.intermediate.dense.bias,[3072],3072,2980,0.02994791666666663
|
35 |
+
33,nncf_module.bert.encoder.layer.2.intermediate.dense.weight,"[3072, 768]",2359296,2288640,0.02994791666666663
|
36 |
+
34,nncf_module.bert.encoder.layer.2.output.dense.bias,[768],768,768,0.0
|
37 |
+
35,nncf_module.bert.encoder.layer.2.output.dense.weight,"[768, 3072]",2359296,2288640,0.02994791666666663
|
38 |
+
36,nncf_module.bert.encoder.layer.3.attention.self.query.bias,[768],768,768,0.0
|
39 |
+
37,nncf_module.bert.encoder.layer.3.attention.self.query.weight,"[768, 768]",589824,589824,0.0
|
40 |
+
38,nncf_module.bert.encoder.layer.3.attention.self.key.bias,[768],768,768,0.0
|
41 |
+
39,nncf_module.bert.encoder.layer.3.attention.self.key.weight,"[768, 768]",589824,589824,0.0
|
42 |
+
40,nncf_module.bert.encoder.layer.3.attention.self.value.bias,[768],768,768,0.0
|
43 |
+
41,nncf_module.bert.encoder.layer.3.attention.self.value.weight,"[768, 768]",589824,589824,0.0
|
44 |
+
42,nncf_module.bert.encoder.layer.3.attention.output.dense.bias,[768],768,768,0.0
|
45 |
+
43,nncf_module.bert.encoder.layer.3.attention.output.dense.weight,"[768, 768]",589824,589824,0.0
|
46 |
+
44,nncf_module.bert.encoder.layer.3.intermediate.dense.bias,[3072],3072,2957,0.03743489583333337
|
47 |
+
45,nncf_module.bert.encoder.layer.3.intermediate.dense.weight,"[3072, 768]",2359296,2270976,0.03743489583333337
|
48 |
+
46,nncf_module.bert.encoder.layer.3.output.dense.bias,[768],768,768,0.0
|
49 |
+
47,nncf_module.bert.encoder.layer.3.output.dense.weight,"[768, 3072]",2359296,2270976,0.03743489583333337
|
50 |
+
48,nncf_module.bert.encoder.layer.4.attention.self.query.bias,[768],768,768,0.0
|
51 |
+
49,nncf_module.bert.encoder.layer.4.attention.self.query.weight,"[768, 768]",589824,589824,0.0
|
52 |
+
50,nncf_module.bert.encoder.layer.4.attention.self.key.bias,[768],768,768,0.0
|
53 |
+
51,nncf_module.bert.encoder.layer.4.attention.self.key.weight,"[768, 768]",589824,589824,0.0
|
54 |
+
52,nncf_module.bert.encoder.layer.4.attention.self.value.bias,[768],768,768,0.0
|
55 |
+
53,nncf_module.bert.encoder.layer.4.attention.self.value.weight,"[768, 768]",589824,589824,0.0
|
56 |
+
54,nncf_module.bert.encoder.layer.4.attention.output.dense.bias,[768],768,768,0.0
|
57 |
+
55,nncf_module.bert.encoder.layer.4.attention.output.dense.weight,"[768, 768]",589824,589824,0.0
|
58 |
+
56,nncf_module.bert.encoder.layer.4.intermediate.dense.bias,[3072],3072,2906,0.05403645833333337
|
59 |
+
57,nncf_module.bert.encoder.layer.4.intermediate.dense.weight,"[3072, 768]",2359296,2231808,0.05403645833333337
|
60 |
+
58,nncf_module.bert.encoder.layer.4.output.dense.bias,[768],768,768,0.0
|
61 |
+
59,nncf_module.bert.encoder.layer.4.output.dense.weight,"[768, 3072]",2359296,2231808,0.05403645833333337
|
62 |
+
60,nncf_module.bert.encoder.layer.5.attention.self.query.bias,[768],768,768,0.0
|
63 |
+
61,nncf_module.bert.encoder.layer.5.attention.self.query.weight,"[768, 768]",589824,589824,0.0
|
64 |
+
62,nncf_module.bert.encoder.layer.5.attention.self.key.bias,[768],768,768,0.0
|
65 |
+
63,nncf_module.bert.encoder.layer.5.attention.self.key.weight,"[768, 768]",589824,589824,0.0
|
66 |
+
64,nncf_module.bert.encoder.layer.5.attention.self.value.bias,[768],768,768,0.0
|
67 |
+
65,nncf_module.bert.encoder.layer.5.attention.self.value.weight,"[768, 768]",589824,589824,0.0
|
68 |
+
66,nncf_module.bert.encoder.layer.5.attention.output.dense.bias,[768],768,768,0.0
|
69 |
+
67,nncf_module.bert.encoder.layer.5.attention.output.dense.weight,"[768, 768]",589824,589824,0.0
|
70 |
+
68,nncf_module.bert.encoder.layer.5.intermediate.dense.bias,[3072],3072,2865,0.0673828125
|
71 |
+
69,nncf_module.bert.encoder.layer.5.intermediate.dense.weight,"[3072, 768]",2359296,2200320,0.0673828125
|
72 |
+
70,nncf_module.bert.encoder.layer.5.output.dense.bias,[768],768,768,0.0
|
73 |
+
71,nncf_module.bert.encoder.layer.5.output.dense.weight,"[768, 3072]",2359296,2200320,0.0673828125
|
74 |
+
72,nncf_module.bert.encoder.layer.6.attention.self.query.bias,[768],768,768,0.0
|
75 |
+
73,nncf_module.bert.encoder.layer.6.attention.self.query.weight,"[768, 768]",589824,589824,0.0
|
76 |
+
74,nncf_module.bert.encoder.layer.6.attention.self.key.bias,[768],768,768,0.0
|
77 |
+
75,nncf_module.bert.encoder.layer.6.attention.self.key.weight,"[768, 768]",589824,589824,0.0
|
78 |
+
76,nncf_module.bert.encoder.layer.6.attention.self.value.bias,[768],768,768,0.0
|
79 |
+
77,nncf_module.bert.encoder.layer.6.attention.self.value.weight,"[768, 768]",589824,589824,0.0
|
80 |
+
78,nncf_module.bert.encoder.layer.6.attention.output.dense.bias,[768],768,768,0.0
|
81 |
+
79,nncf_module.bert.encoder.layer.6.attention.output.dense.weight,"[768, 768]",589824,589824,0.0
|
82 |
+
80,nncf_module.bert.encoder.layer.6.intermediate.dense.bias,[3072],3072,2759,0.10188802083333337
|
83 |
+
81,nncf_module.bert.encoder.layer.6.intermediate.dense.weight,"[3072, 768]",2359296,2118912,0.10188802083333337
|
84 |
+
82,nncf_module.bert.encoder.layer.6.output.dense.bias,[768],768,768,0.0
|
85 |
+
83,nncf_module.bert.encoder.layer.6.output.dense.weight,"[768, 3072]",2359296,2118912,0.10188802083333337
|
86 |
+
84,nncf_module.bert.encoder.layer.7.attention.self.query.bias,[768],768,768,0.0
|
87 |
+
85,nncf_module.bert.encoder.layer.7.attention.self.query.weight,"[768, 768]",589824,589824,0.0
|
88 |
+
86,nncf_module.bert.encoder.layer.7.attention.self.key.bias,[768],768,768,0.0
|
89 |
+
87,nncf_module.bert.encoder.layer.7.attention.self.key.weight,"[768, 768]",589824,589824,0.0
|
90 |
+
88,nncf_module.bert.encoder.layer.7.attention.self.value.bias,[768],768,768,0.0
|
91 |
+
89,nncf_module.bert.encoder.layer.7.attention.self.value.weight,"[768, 768]",589824,589824,0.0
|
92 |
+
90,nncf_module.bert.encoder.layer.7.attention.output.dense.bias,[768],768,768,0.0
|
93 |
+
91,nncf_module.bert.encoder.layer.7.attention.output.dense.weight,"[768, 768]",589824,589824,0.0
|
94 |
+
92,nncf_module.bert.encoder.layer.7.intermediate.dense.bias,[3072],3072,2569,0.16373697916666663
|
95 |
+
93,nncf_module.bert.encoder.layer.7.intermediate.dense.weight,"[3072, 768]",2359296,1972992,0.16373697916666663
|
96 |
+
94,nncf_module.bert.encoder.layer.7.output.dense.bias,[768],768,768,0.0
|
97 |
+
95,nncf_module.bert.encoder.layer.7.output.dense.weight,"[768, 3072]",2359296,1972992,0.16373697916666663
|
98 |
+
96,nncf_module.bert.encoder.layer.8.attention.self.query.bias,[768],768,256,0.6666666666666667
|
99 |
+
97,nncf_module.bert.encoder.layer.8.attention.self.query.weight,"[768, 768]",589824,196608,0.6666666666666667
|
100 |
+
98,nncf_module.bert.encoder.layer.8.attention.self.key.bias,[768],768,256,0.6666666666666667
|
101 |
+
99,nncf_module.bert.encoder.layer.8.attention.self.key.weight,"[768, 768]",589824,196608,0.6666666666666667
|
102 |
+
100,nncf_module.bert.encoder.layer.8.attention.self.value.bias,[768],768,256,0.6666666666666667
|
103 |
+
101,nncf_module.bert.encoder.layer.8.attention.self.value.weight,"[768, 768]",589824,196608,0.6666666666666667
|
104 |
+
102,nncf_module.bert.encoder.layer.8.attention.output.dense.bias,[768],768,768,0.0
|
105 |
+
103,nncf_module.bert.encoder.layer.8.attention.output.dense.weight,"[768, 768]",589824,196608,0.6666666666666667
|
106 |
+
104,nncf_module.bert.encoder.layer.8.intermediate.dense.bias,[3072],3072,2094,0.318359375
|
107 |
+
105,nncf_module.bert.encoder.layer.8.intermediate.dense.weight,"[3072, 768]",2359296,1608192,0.318359375
|
108 |
+
106,nncf_module.bert.encoder.layer.8.output.dense.bias,[768],768,768,0.0
|
109 |
+
107,nncf_module.bert.encoder.layer.8.output.dense.weight,"[768, 3072]",2359296,1608192,0.318359375
|
110 |
+
108,nncf_module.bert.encoder.layer.9.attention.self.query.bias,[768],768,768,0.0
|
111 |
+
109,nncf_module.bert.encoder.layer.9.attention.self.query.weight,"[768, 768]",589824,589824,0.0
|
112 |
+
110,nncf_module.bert.encoder.layer.9.attention.self.key.bias,[768],768,768,0.0
|
113 |
+
111,nncf_module.bert.encoder.layer.9.attention.self.key.weight,"[768, 768]",589824,589824,0.0
|
114 |
+
112,nncf_module.bert.encoder.layer.9.attention.self.value.bias,[768],768,768,0.0
|
115 |
+
113,nncf_module.bert.encoder.layer.9.attention.self.value.weight,"[768, 768]",589824,589824,0.0
|
116 |
+
114,nncf_module.bert.encoder.layer.9.attention.output.dense.bias,[768],768,768,0.0
|
117 |
+
115,nncf_module.bert.encoder.layer.9.attention.output.dense.weight,"[768, 768]",589824,589824,0.0
|
118 |
+
116,nncf_module.bert.encoder.layer.9.intermediate.dense.bias,[3072],3072,1009,0.6715494791666667
|
119 |
+
117,nncf_module.bert.encoder.layer.9.intermediate.dense.weight,"[3072, 768]",2359296,774912,0.6715494791666667
|
120 |
+
118,nncf_module.bert.encoder.layer.9.output.dense.bias,[768],768,768,0.0
|
121 |
+
119,nncf_module.bert.encoder.layer.9.output.dense.weight,"[768, 3072]",2359296,774912,0.6715494791666667
|
122 |
+
120,nncf_module.bert.encoder.layer.10.attention.self.query.bias,[768],768,320,0.5833333333333333
|
123 |
+
121,nncf_module.bert.encoder.layer.10.attention.self.query.weight,"[768, 768]",589824,245760,0.5833333333333333
|
124 |
+
122,nncf_module.bert.encoder.layer.10.attention.self.key.bias,[768],768,320,0.5833333333333333
|
125 |
+
123,nncf_module.bert.encoder.layer.10.attention.self.key.weight,"[768, 768]",589824,245760,0.5833333333333333
|
126 |
+
124,nncf_module.bert.encoder.layer.10.attention.self.value.bias,[768],768,320,0.5833333333333333
|
127 |
+
125,nncf_module.bert.encoder.layer.10.attention.self.value.weight,"[768, 768]",589824,245760,0.5833333333333333
|
128 |
+
126,nncf_module.bert.encoder.layer.10.attention.output.dense.bias,[768],768,768,0.0
|
129 |
+
127,nncf_module.bert.encoder.layer.10.attention.output.dense.weight,"[768, 768]",589824,245760,0.5833333333333333
|
130 |
+
128,nncf_module.bert.encoder.layer.10.intermediate.dense.bias,[3072],3072,743,0.7581380208333334
|
131 |
+
129,nncf_module.bert.encoder.layer.10.intermediate.dense.weight,"[3072, 768]",2359296,570624,0.7581380208333334
|
132 |
+
130,nncf_module.bert.encoder.layer.10.output.dense.bias,[768],768,768,0.0
|
133 |
+
131,nncf_module.bert.encoder.layer.10.output.dense.weight,"[768, 3072]",2359296,570624,0.7581380208333334
|
134 |
+
132,nncf_module.bert.encoder.layer.11.attention.self.query.bias,[768],768,192,0.75
|
135 |
+
133,nncf_module.bert.encoder.layer.11.attention.self.query.weight,"[768, 768]",589824,147456,0.75
|
136 |
+
134,nncf_module.bert.encoder.layer.11.attention.self.key.bias,[768],768,192,0.75
|
137 |
+
135,nncf_module.bert.encoder.layer.11.attention.self.key.weight,"[768, 768]",589824,147456,0.75
|
138 |
+
136,nncf_module.bert.encoder.layer.11.attention.self.value.bias,[768],768,192,0.75
|
139 |
+
137,nncf_module.bert.encoder.layer.11.attention.self.value.weight,"[768, 768]",589824,147456,0.75
|
140 |
+
138,nncf_module.bert.encoder.layer.11.attention.output.dense.bias,[768],768,768,0.0
|
141 |
+
139,nncf_module.bert.encoder.layer.11.attention.output.dense.weight,"[768, 768]",589824,147456,0.75
|
142 |
+
140,nncf_module.bert.encoder.layer.11.intermediate.dense.bias,[3072],3072,605,0.8030598958333334
|
143 |
+
141,nncf_module.bert.encoder.layer.11.intermediate.dense.weight,"[3072, 768]",2359296,464640,0.8030598958333334
|
144 |
+
142,nncf_module.bert.encoder.layer.11.output.dense.bias,[768],768,768,0.0
|
145 |
+
143,nncf_module.bert.encoder.layer.11.output.dense.weight,"[768, 3072]",2359296,464640,0.8030598958333334
|
146 |
+
144,nncf_module.qa_outputs.bias,[2],2,2,0.0
|
147 |
+
145,nncf_module.qa_outputs.weight,"[2, 768]",1536,1536,0.0
|
checkpoint-35000/onnx_sparsity.md
ADDED
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
| | layer_id | shape | nparam | nnz | sparsity |
|
2 |
+
|----:|:----------------------------------------------------------------|:------------|---------:|--------:|-----------:|
|
3 |
+
| 0 | nncf_module.bert.encoder.layer.0.attention.self.query.bias | [768] | 768 | 192 | 0.75 |
|
4 |
+
| 1 | nncf_module.bert.encoder.layer.0.attention.self.query.weight | [768, 768] | 589824 | 147456 | 0.75 |
|
5 |
+
| 2 | nncf_module.bert.encoder.layer.0.attention.self.key.bias | [768] | 768 | 192 | 0.75 |
|
6 |
+
| 3 | nncf_module.bert.encoder.layer.0.attention.self.key.weight | [768, 768] | 589824 | 147456 | 0.75 |
|
7 |
+
| 4 | nncf_module.bert.encoder.layer.0.attention.self.value.bias | [768] | 768 | 192 | 0.75 |
|
8 |
+
| 5 | nncf_module.bert.encoder.layer.0.attention.self.value.weight | [768, 768] | 589824 | 147456 | 0.75 |
|
9 |
+
| 6 | nncf_module.bert.encoder.layer.0.attention.output.dense.bias | [768] | 768 | 768 | 0 |
|
10 |
+
| 7 | nncf_module.bert.encoder.layer.0.attention.output.dense.weight | [768, 768] | 589824 | 147456 | 0.75 |
|
11 |
+
| 8 | nncf_module.bert.encoder.layer.0.intermediate.dense.bias | [3072] | 3072 | 2940 | 0.0429688 |
|
12 |
+
| 9 | nncf_module.bert.encoder.layer.0.intermediate.dense.weight | [3072, 768] | 2359296 | 2257920 | 0.0429688 |
|
13 |
+
| 10 | nncf_module.bert.encoder.layer.0.output.dense.bias | [768] | 768 | 768 | 0 |
|
14 |
+
| 11 | nncf_module.bert.encoder.layer.0.output.dense.weight | [768, 3072] | 2359296 | 2257920 | 0.0429688 |
|
15 |
+
| 12 | nncf_module.bert.encoder.layer.1.attention.self.query.bias | [768] | 768 | 256 | 0.666667 |
|
16 |
+
| 13 | nncf_module.bert.encoder.layer.1.attention.self.query.weight | [768, 768] | 589824 | 196608 | 0.666667 |
|
17 |
+
| 14 | nncf_module.bert.encoder.layer.1.attention.self.key.bias | [768] | 768 | 256 | 0.666667 |
|
18 |
+
| 15 | nncf_module.bert.encoder.layer.1.attention.self.key.weight | [768, 768] | 589824 | 196608 | 0.666667 |
|
19 |
+
| 16 | nncf_module.bert.encoder.layer.1.attention.self.value.bias | [768] | 768 | 256 | 0.666667 |
|
20 |
+
| 17 | nncf_module.bert.encoder.layer.1.attention.self.value.weight | [768, 768] | 589824 | 196608 | 0.666667 |
|
21 |
+
| 18 | nncf_module.bert.encoder.layer.1.attention.output.dense.bias | [768] | 768 | 768 | 0 |
|
22 |
+
| 19 | nncf_module.bert.encoder.layer.1.attention.output.dense.weight | [768, 768] | 589824 | 196608 | 0.666667 |
|
23 |
+
| 20 | nncf_module.bert.encoder.layer.1.intermediate.dense.bias | [3072] | 3072 | 2923 | 0.0485026 |
|
24 |
+
| 21 | nncf_module.bert.encoder.layer.1.intermediate.dense.weight | [3072, 768] | 2359296 | 2244864 | 0.0485026 |
|
25 |
+
| 22 | nncf_module.bert.encoder.layer.1.output.dense.bias | [768] | 768 | 768 | 0 |
|
26 |
+
| 23 | nncf_module.bert.encoder.layer.1.output.dense.weight | [768, 3072] | 2359296 | 2244864 | 0.0485026 |
|
27 |
+
| 24 | nncf_module.bert.encoder.layer.2.attention.self.query.bias | [768] | 768 | 768 | 0 |
|
28 |
+
| 25 | nncf_module.bert.encoder.layer.2.attention.self.query.weight | [768, 768] | 589824 | 589824 | 0 |
|
29 |
+
| 26 | nncf_module.bert.encoder.layer.2.attention.self.key.bias | [768] | 768 | 768 | 0 |
|
30 |
+
| 27 | nncf_module.bert.encoder.layer.2.attention.self.key.weight | [768, 768] | 589824 | 589824 | 0 |
|
31 |
+
| 28 | nncf_module.bert.encoder.layer.2.attention.self.value.bias | [768] | 768 | 768 | 0 |
|
32 |
+
| 29 | nncf_module.bert.encoder.layer.2.attention.self.value.weight | [768, 768] | 589824 | 589824 | 0 |
|
33 |
+
| 30 | nncf_module.bert.encoder.layer.2.attention.output.dense.bias | [768] | 768 | 768 | 0 |
|
34 |
+
| 31 | nncf_module.bert.encoder.layer.2.attention.output.dense.weight | [768, 768] | 589824 | 589824 | 0 |
|
35 |
+
| 32 | nncf_module.bert.encoder.layer.2.intermediate.dense.bias | [3072] | 3072 | 2980 | 0.0299479 |
|
36 |
+
| 33 | nncf_module.bert.encoder.layer.2.intermediate.dense.weight | [3072, 768] | 2359296 | 2288640 | 0.0299479 |
|
37 |
+
| 34 | nncf_module.bert.encoder.layer.2.output.dense.bias | [768] | 768 | 768 | 0 |
|
38 |
+
| 35 | nncf_module.bert.encoder.layer.2.output.dense.weight | [768, 3072] | 2359296 | 2288640 | 0.0299479 |
|
39 |
+
| 36 | nncf_module.bert.encoder.layer.3.attention.self.query.bias | [768] | 768 | 768 | 0 |
|
40 |
+
| 37 | nncf_module.bert.encoder.layer.3.attention.self.query.weight | [768, 768] | 589824 | 589824 | 0 |
|
41 |
+
| 38 | nncf_module.bert.encoder.layer.3.attention.self.key.bias | [768] | 768 | 768 | 0 |
|
42 |
+
| 39 | nncf_module.bert.encoder.layer.3.attention.self.key.weight | [768, 768] | 589824 | 589824 | 0 |
|
43 |
+
| 40 | nncf_module.bert.encoder.layer.3.attention.self.value.bias | [768] | 768 | 768 | 0 |
|
44 |
+
| 41 | nncf_module.bert.encoder.layer.3.attention.self.value.weight | [768, 768] | 589824 | 589824 | 0 |
|
45 |
+
| 42 | nncf_module.bert.encoder.layer.3.attention.output.dense.bias | [768] | 768 | 768 | 0 |
|
46 |
+
| 43 | nncf_module.bert.encoder.layer.3.attention.output.dense.weight | [768, 768] | 589824 | 589824 | 0 |
|
47 |
+
| 44 | nncf_module.bert.encoder.layer.3.intermediate.dense.bias | [3072] | 3072 | 2957 | 0.0374349 |
|
48 |
+
| 45 | nncf_module.bert.encoder.layer.3.intermediate.dense.weight | [3072, 768] | 2359296 | 2270976 | 0.0374349 |
|
49 |
+
| 46 | nncf_module.bert.encoder.layer.3.output.dense.bias | [768] | 768 | 768 | 0 |
|
50 |
+
| 47 | nncf_module.bert.encoder.layer.3.output.dense.weight | [768, 3072] | 2359296 | 2270976 | 0.0374349 |
|
51 |
+
| 48 | nncf_module.bert.encoder.layer.4.attention.self.query.bias | [768] | 768 | 768 | 0 |
|
52 |
+
| 49 | nncf_module.bert.encoder.layer.4.attention.self.query.weight | [768, 768] | 589824 | 589824 | 0 |
|
53 |
+
| 50 | nncf_module.bert.encoder.layer.4.attention.self.key.bias | [768] | 768 | 768 | 0 |
|
54 |
+
| 51 | nncf_module.bert.encoder.layer.4.attention.self.key.weight | [768, 768] | 589824 | 589824 | 0 |
|
55 |
+
| 52 | nncf_module.bert.encoder.layer.4.attention.self.value.bias | [768] | 768 | 768 | 0 |
|
56 |
+
| 53 | nncf_module.bert.encoder.layer.4.attention.self.value.weight | [768, 768] | 589824 | 589824 | 0 |
|
57 |
+
| 54 | nncf_module.bert.encoder.layer.4.attention.output.dense.bias | [768] | 768 | 768 | 0 |
|
58 |
+
| 55 | nncf_module.bert.encoder.layer.4.attention.output.dense.weight | [768, 768] | 589824 | 589824 | 0 |
|
59 |
+
| 56 | nncf_module.bert.encoder.layer.4.intermediate.dense.bias | [3072] | 3072 | 2906 | 0.0540365 |
|
60 |
+
| 57 | nncf_module.bert.encoder.layer.4.intermediate.dense.weight | [3072, 768] | 2359296 | 2231808 | 0.0540365 |
|
61 |
+
| 58 | nncf_module.bert.encoder.layer.4.output.dense.bias | [768] | 768 | 768 | 0 |
|
62 |
+
| 59 | nncf_module.bert.encoder.layer.4.output.dense.weight | [768, 3072] | 2359296 | 2231808 | 0.0540365 |
|
63 |
+
| 60 | nncf_module.bert.encoder.layer.5.attention.self.query.bias | [768] | 768 | 768 | 0 |
|
64 |
+
| 61 | nncf_module.bert.encoder.layer.5.attention.self.query.weight | [768, 768] | 589824 | 589824 | 0 |
|
65 |
+
| 62 | nncf_module.bert.encoder.layer.5.attention.self.key.bias | [768] | 768 | 768 | 0 |
|
66 |
+
| 63 | nncf_module.bert.encoder.layer.5.attention.self.key.weight | [768, 768] | 589824 | 589824 | 0 |
|
67 |
+
| 64 | nncf_module.bert.encoder.layer.5.attention.self.value.bias | [768] | 768 | 768 | 0 |
|
68 |
+
| 65 | nncf_module.bert.encoder.layer.5.attention.self.value.weight | [768, 768] | 589824 | 589824 | 0 |
|
69 |
+
| 66 | nncf_module.bert.encoder.layer.5.attention.output.dense.bias | [768] | 768 | 768 | 0 |
|
70 |
+
| 67 | nncf_module.bert.encoder.layer.5.attention.output.dense.weight | [768, 768] | 589824 | 589824 | 0 |
|
71 |
+
| 68 | nncf_module.bert.encoder.layer.5.intermediate.dense.bias | [3072] | 3072 | 2865 | 0.0673828 |
|
72 |
+
| 69 | nncf_module.bert.encoder.layer.5.intermediate.dense.weight | [3072, 768] | 2359296 | 2200320 | 0.0673828 |
|
73 |
+
| 70 | nncf_module.bert.encoder.layer.5.output.dense.bias | [768] | 768 | 768 | 0 |
|
74 |
+
| 71 | nncf_module.bert.encoder.layer.5.output.dense.weight | [768, 3072] | 2359296 | 2200320 | 0.0673828 |
|
75 |
+
| 72 | nncf_module.bert.encoder.layer.6.attention.self.query.bias | [768] | 768 | 768 | 0 |
|
76 |
+
| 73 | nncf_module.bert.encoder.layer.6.attention.self.query.weight | [768, 768] | 589824 | 589824 | 0 |
|
77 |
+
| 74 | nncf_module.bert.encoder.layer.6.attention.self.key.bias | [768] | 768 | 768 | 0 |
|
78 |
+
| 75 | nncf_module.bert.encoder.layer.6.attention.self.key.weight | [768, 768] | 589824 | 589824 | 0 |
|
79 |
+
| 76 | nncf_module.bert.encoder.layer.6.attention.self.value.bias | [768] | 768 | 768 | 0 |
|
80 |
+
| 77 | nncf_module.bert.encoder.layer.6.attention.self.value.weight | [768, 768] | 589824 | 589824 | 0 |
|
81 |
+
| 78 | nncf_module.bert.encoder.layer.6.attention.output.dense.bias | [768] | 768 | 768 | 0 |
|
82 |
+
| 79 | nncf_module.bert.encoder.layer.6.attention.output.dense.weight | [768, 768] | 589824 | 589824 | 0 |
|
83 |
+
| 80 | nncf_module.bert.encoder.layer.6.intermediate.dense.bias | [3072] | 3072 | 2759 | 0.101888 |
|
84 |
+
| 81 | nncf_module.bert.encoder.layer.6.intermediate.dense.weight | [3072, 768] | 2359296 | 2118912 | 0.101888 |
|
85 |
+
| 82 | nncf_module.bert.encoder.layer.6.output.dense.bias | [768] | 768 | 768 | 0 |
|
86 |
+
| 83 | nncf_module.bert.encoder.layer.6.output.dense.weight | [768, 3072] | 2359296 | 2118912 | 0.101888 |
|
87 |
+
| 84 | nncf_module.bert.encoder.layer.7.attention.self.query.bias | [768] | 768 | 768 | 0 |
|
88 |
+
| 85 | nncf_module.bert.encoder.layer.7.attention.self.query.weight | [768, 768] | 589824 | 589824 | 0 |
|
89 |
+
| 86 | nncf_module.bert.encoder.layer.7.attention.self.key.bias | [768] | 768 | 768 | 0 |
|
90 |
+
| 87 | nncf_module.bert.encoder.layer.7.attention.self.key.weight | [768, 768] | 589824 | 589824 | 0 |
|
91 |
+
| 88 | nncf_module.bert.encoder.layer.7.attention.self.value.bias | [768] | 768 | 768 | 0 |
|
92 |
+
| 89 | nncf_module.bert.encoder.layer.7.attention.self.value.weight | [768, 768] | 589824 | 589824 | 0 |
|
93 |
+
| 90 | nncf_module.bert.encoder.layer.7.attention.output.dense.bias | [768] | 768 | 768 | 0 |
|
94 |
+
| 91 | nncf_module.bert.encoder.layer.7.attention.output.dense.weight | [768, 768] | 589824 | 589824 | 0 |
|
95 |
+
| 92 | nncf_module.bert.encoder.layer.7.intermediate.dense.bias | [3072] | 3072 | 2569 | 0.163737 |
|
96 |
+
| 93 | nncf_module.bert.encoder.layer.7.intermediate.dense.weight | [3072, 768] | 2359296 | 1972992 | 0.163737 |
|
97 |
+
| 94 | nncf_module.bert.encoder.layer.7.output.dense.bias | [768] | 768 | 768 | 0 |
|
98 |
+
| 95 | nncf_module.bert.encoder.layer.7.output.dense.weight | [768, 3072] | 2359296 | 1972992 | 0.163737 |
|
99 |
+
| 96 | nncf_module.bert.encoder.layer.8.attention.self.query.bias | [768] | 768 | 256 | 0.666667 |
|
100 |
+
| 97 | nncf_module.bert.encoder.layer.8.attention.self.query.weight | [768, 768] | 589824 | 196608 | 0.666667 |
|
101 |
+
| 98 | nncf_module.bert.encoder.layer.8.attention.self.key.bias | [768] | 768 | 256 | 0.666667 |
|
102 |
+
| 99 | nncf_module.bert.encoder.layer.8.attention.self.key.weight | [768, 768] | 589824 | 196608 | 0.666667 |
|
103 |
+
| 100 | nncf_module.bert.encoder.layer.8.attention.self.value.bias | [768] | 768 | 256 | 0.666667 |
|
104 |
+
| 101 | nncf_module.bert.encoder.layer.8.attention.self.value.weight | [768, 768] | 589824 | 196608 | 0.666667 |
|
105 |
+
| 102 | nncf_module.bert.encoder.layer.8.attention.output.dense.bias | [768] | 768 | 768 | 0 |
|
106 |
+
| 103 | nncf_module.bert.encoder.layer.8.attention.output.dense.weight | [768, 768] | 589824 | 196608 | 0.666667 |
|
107 |
+
| 104 | nncf_module.bert.encoder.layer.8.intermediate.dense.bias | [3072] | 3072 | 2094 | 0.318359 |
|
108 |
+
| 105 | nncf_module.bert.encoder.layer.8.intermediate.dense.weight | [3072, 768] | 2359296 | 1608192 | 0.318359 |
|
109 |
+
| 106 | nncf_module.bert.encoder.layer.8.output.dense.bias | [768] | 768 | 768 | 0 |
|
110 |
+
| 107 | nncf_module.bert.encoder.layer.8.output.dense.weight | [768, 3072] | 2359296 | 1608192 | 0.318359 |
|
111 |
+
| 108 | nncf_module.bert.encoder.layer.9.attention.self.query.bias | [768] | 768 | 768 | 0 |
|
112 |
+
| 109 | nncf_module.bert.encoder.layer.9.attention.self.query.weight | [768, 768] | 589824 | 589824 | 0 |
|
113 |
+
| 110 | nncf_module.bert.encoder.layer.9.attention.self.key.bias | [768] | 768 | 768 | 0 |
|
114 |
+
| 111 | nncf_module.bert.encoder.layer.9.attention.self.key.weight | [768, 768] | 589824 | 589824 | 0 |
|
115 |
+
| 112 | nncf_module.bert.encoder.layer.9.attention.self.value.bias | [768] | 768 | 768 | 0 |
|
116 |
+
| 113 | nncf_module.bert.encoder.layer.9.attention.self.value.weight | [768, 768] | 589824 | 589824 | 0 |
|
117 |
+
| 114 | nncf_module.bert.encoder.layer.9.attention.output.dense.bias | [768] | 768 | 768 | 0 |
|
118 |
+
| 115 | nncf_module.bert.encoder.layer.9.attention.output.dense.weight | [768, 768] | 589824 | 589824 | 0 |
|
119 |
+
| 116 | nncf_module.bert.encoder.layer.9.intermediate.dense.bias | [3072] | 3072 | 1009 | 0.671549 |
|
120 |
+
| 117 | nncf_module.bert.encoder.layer.9.intermediate.dense.weight | [3072, 768] | 2359296 | 774912 | 0.671549 |
|
121 |
+
| 118 | nncf_module.bert.encoder.layer.9.output.dense.bias | [768] | 768 | 768 | 0 |
|
122 |
+
| 119 | nncf_module.bert.encoder.layer.9.output.dense.weight | [768, 3072] | 2359296 | 774912 | 0.671549 |
|
123 |
+
| 120 | nncf_module.bert.encoder.layer.10.attention.self.query.bias | [768] | 768 | 320 | 0.583333 |
|
124 |
+
| 121 | nncf_module.bert.encoder.layer.10.attention.self.query.weight | [768, 768] | 589824 | 245760 | 0.583333 |
|
125 |
+
| 122 | nncf_module.bert.encoder.layer.10.attention.self.key.bias | [768] | 768 | 320 | 0.583333 |
|
126 |
+
| 123 | nncf_module.bert.encoder.layer.10.attention.self.key.weight | [768, 768] | 589824 | 245760 | 0.583333 |
|
127 |
+
| 124 | nncf_module.bert.encoder.layer.10.attention.self.value.bias | [768] | 768 | 320 | 0.583333 |
|
128 |
+
| 125 | nncf_module.bert.encoder.layer.10.attention.self.value.weight | [768, 768] | 589824 | 245760 | 0.583333 |
|
129 |
+
| 126 | nncf_module.bert.encoder.layer.10.attention.output.dense.bias | [768] | 768 | 768 | 0 |
|
130 |
+
| 127 | nncf_module.bert.encoder.layer.10.attention.output.dense.weight | [768, 768] | 589824 | 245760 | 0.583333 |
|
131 |
+
| 128 | nncf_module.bert.encoder.layer.10.intermediate.dense.bias | [3072] | 3072 | 743 | 0.758138 |
|
132 |
+
| 129 | nncf_module.bert.encoder.layer.10.intermediate.dense.weight | [3072, 768] | 2359296 | 570624 | 0.758138 |
|
133 |
+
| 130 | nncf_module.bert.encoder.layer.10.output.dense.bias | [768] | 768 | 768 | 0 |
|
134 |
+
| 131 | nncf_module.bert.encoder.layer.10.output.dense.weight | [768, 3072] | 2359296 | 570624 | 0.758138 |
|
135 |
+
| 132 | nncf_module.bert.encoder.layer.11.attention.self.query.bias | [768] | 768 | 192 | 0.75 |
|
136 |
+
| 133 | nncf_module.bert.encoder.layer.11.attention.self.query.weight | [768, 768] | 589824 | 147456 | 0.75 |
|
137 |
+
| 134 | nncf_module.bert.encoder.layer.11.attention.self.key.bias | [768] | 768 | 192 | 0.75 |
|
138 |
+
| 135 | nncf_module.bert.encoder.layer.11.attention.self.key.weight | [768, 768] | 589824 | 147456 | 0.75 |
|
139 |
+
| 136 | nncf_module.bert.encoder.layer.11.attention.self.value.bias | [768] | 768 | 192 | 0.75 |
|
140 |
+
| 137 | nncf_module.bert.encoder.layer.11.attention.self.value.weight | [768, 768] | 589824 | 147456 | 0.75 |
|
141 |
+
| 138 | nncf_module.bert.encoder.layer.11.attention.output.dense.bias | [768] | 768 | 768 | 0 |
|
142 |
+
| 139 | nncf_module.bert.encoder.layer.11.attention.output.dense.weight | [768, 768] | 589824 | 147456 | 0.75 |
|
143 |
+
| 140 | nncf_module.bert.encoder.layer.11.intermediate.dense.bias | [3072] | 3072 | 605 | 0.80306 |
|
144 |
+
| 141 | nncf_module.bert.encoder.layer.11.intermediate.dense.weight | [3072, 768] | 2359296 | 464640 | 0.80306 |
|
145 |
+
| 142 | nncf_module.bert.encoder.layer.11.output.dense.bias | [768] | 768 | 768 | 0 |
|
146 |
+
| 143 | nncf_module.bert.encoder.layer.11.output.dense.weight | [768, 3072] | 2359296 | 464640 | 0.80306 |
|
147 |
+
| 144 | nncf_module.qa_outputs.bias | [2] | 2 | 2 | 0 |
|
148 |
+
| 145 | nncf_module.qa_outputs.weight | [2, 768] | 1536 | 1536 | 0 |
|
checkpoint-35000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0c69179b142350c0900215f2171560e60ce8071203a2a11acc717b62b70da947
|
3 |
+
size 872456613
|
checkpoint-35000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c65c8c3531e309f7ee0524ab8b7e4e35d401ee8b24f40c0fe91f08b4d377ef68
|
3 |
+
size 776435185
|
checkpoint-35000/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:53f9d1332dd8b0177c10a70c94f642ef694da36859eccb739ae846f1b2fc39d8
|
3 |
+
size 14503
|
checkpoint-35000/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:883abecad344373d12706b2d3326e6572c4bb5d25ea9ce27fa2f2c6f3496b303
|
3 |
+
size 623
|
checkpoint-35000/special_tokens_map.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
|
checkpoint-35000/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
checkpoint-35000/tokenizer_config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-uncased", "tokenizer_class": "BertTokenizer"}
|
checkpoint-35000/torch_mask_structures.csv
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
,pt_module_name,block_id,weight_shape,prune_w_shape,bias_shape,prune_b_shape,head_id_to_keep,nncf_graph_node
|
2 |
+
0,nncf_module.bert.encoder.layer.0.attention.output.dense,0,"(768, 768)","(768, 192)","(768,)","(768,)","[3, 8, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
|
3 |
+
1,nncf_module.bert.encoder.layer.0.attention.self.value,0,"(768, 768)","(192, 768)","(768,)","(192,)","[3, 8, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
|
4 |
+
2,nncf_module.bert.encoder.layer.0.attention.self.key,0,"(768, 768)","(192, 768)","(768,)","(192,)","[3, 8, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
|
5 |
+
3,nncf_module.bert.encoder.layer.0.attention.self.query,0,"(768, 768)","(192, 768)","(768,)","(192,)","[3, 8, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
|
6 |
+
4,nncf_module.bert.encoder.layer.0.output.dense,1,"(768, 3072)","(768, 2940)","(768,)","(768,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertOutput[output]/NNCFLinear[dense]/linear_0
|
7 |
+
5,nncf_module.bert.encoder.layer.0.intermediate.dense,1,"(3072, 768)","(2940, 768)","(3072,)","(2940,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
|
8 |
+
6,nncf_module.bert.encoder.layer.1.attention.self.key,2,"(768, 768)","(256, 768)","(768,)","(256,)","[4, 7, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
|
9 |
+
7,nncf_module.bert.encoder.layer.1.attention.self.query,2,"(768, 768)","(256, 768)","(768,)","(256,)","[4, 7, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
|
10 |
+
8,nncf_module.bert.encoder.layer.1.attention.output.dense,2,"(768, 768)","(768, 256)","(768,)","(768,)","[4, 7, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
|
11 |
+
9,nncf_module.bert.encoder.layer.1.attention.self.value,2,"(768, 768)","(256, 768)","(768,)","(256,)","[4, 7, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
|
12 |
+
10,nncf_module.bert.encoder.layer.1.intermediate.dense,3,"(3072, 768)","(2923, 768)","(3072,)","(2923,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
|
13 |
+
11,nncf_module.bert.encoder.layer.1.output.dense,3,"(768, 3072)","(768, 2923)","(768,)","(768,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertOutput[output]/NNCFLinear[dense]/linear_0
|
14 |
+
12,nncf_module.bert.encoder.layer.2.attention.self.value,4,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
|
15 |
+
13,nncf_module.bert.encoder.layer.2.attention.output.dense,4,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
|
16 |
+
14,nncf_module.bert.encoder.layer.2.attention.self.key,4,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
|
17 |
+
15,nncf_module.bert.encoder.layer.2.attention.self.query,4,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
|
18 |
+
16,nncf_module.bert.encoder.layer.2.output.dense,5,"(768, 3072)","(768, 2980)","(768,)","(768,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertOutput[output]/NNCFLinear[dense]/linear_0
|
19 |
+
17,nncf_module.bert.encoder.layer.2.intermediate.dense,5,"(3072, 768)","(2980, 768)","(3072,)","(2980,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
|
20 |
+
18,nncf_module.bert.encoder.layer.3.attention.output.dense,6,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
|
21 |
+
19,nncf_module.bert.encoder.layer.3.attention.self.value,6,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
|
22 |
+
20,nncf_module.bert.encoder.layer.3.attention.self.key,6,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
|
23 |
+
21,nncf_module.bert.encoder.layer.3.attention.self.query,6,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
|
24 |
+
22,nncf_module.bert.encoder.layer.3.intermediate.dense,7,"(3072, 768)","(2957, 768)","(3072,)","(2957,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
|
25 |
+
23,nncf_module.bert.encoder.layer.3.output.dense,7,"(768, 3072)","(768, 2957)","(768,)","(768,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertOutput[output]/NNCFLinear[dense]/linear_0
|
26 |
+
24,nncf_module.bert.encoder.layer.4.attention.self.key,8,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
|
27 |
+
25,nncf_module.bert.encoder.layer.4.attention.self.value,8,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
|
28 |
+
26,nncf_module.bert.encoder.layer.4.attention.output.dense,8,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
|
29 |
+
27,nncf_module.bert.encoder.layer.4.attention.self.query,8,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
|
30 |
+
28,nncf_module.bert.encoder.layer.4.intermediate.dense,9,"(3072, 768)","(2906, 768)","(3072,)","(2906,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
|
31 |
+
29,nncf_module.bert.encoder.layer.4.output.dense,9,"(768, 3072)","(768, 2906)","(768,)","(768,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertOutput[output]/NNCFLinear[dense]/linear_0
|
32 |
+
30,nncf_module.bert.encoder.layer.5.attention.self.value,10,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
|
33 |
+
31,nncf_module.bert.encoder.layer.5.attention.output.dense,10,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
|
34 |
+
32,nncf_module.bert.encoder.layer.5.attention.self.query,10,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
|
35 |
+
33,nncf_module.bert.encoder.layer.5.attention.self.key,10,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
|
36 |
+
34,nncf_module.bert.encoder.layer.5.intermediate.dense,11,"(3072, 768)","(2865, 768)","(3072,)","(2865,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
|
37 |
+
35,nncf_module.bert.encoder.layer.5.output.dense,11,"(768, 3072)","(768, 2865)","(768,)","(768,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertOutput[output]/NNCFLinear[dense]/linear_0
|
38 |
+
36,nncf_module.bert.encoder.layer.6.attention.self.value,12,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
|
39 |
+
37,nncf_module.bert.encoder.layer.6.attention.self.query,12,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
|
40 |
+
38,nncf_module.bert.encoder.layer.6.attention.self.key,12,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
|
41 |
+
39,nncf_module.bert.encoder.layer.6.attention.output.dense,12,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
|
42 |
+
40,nncf_module.bert.encoder.layer.6.output.dense,13,"(768, 3072)","(768, 2759)","(768,)","(768,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertOutput[output]/NNCFLinear[dense]/linear_0
|
43 |
+
41,nncf_module.bert.encoder.layer.6.intermediate.dense,13,"(3072, 768)","(2759, 768)","(3072,)","(2759,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
|
44 |
+
42,nncf_module.bert.encoder.layer.7.attention.self.query,14,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
|
45 |
+
43,nncf_module.bert.encoder.layer.7.attention.self.key,14,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
|
46 |
+
44,nncf_module.bert.encoder.layer.7.attention.self.value,14,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
|
47 |
+
45,nncf_module.bert.encoder.layer.7.attention.output.dense,14,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
|
48 |
+
46,nncf_module.bert.encoder.layer.7.intermediate.dense,15,"(3072, 768)","(2569, 768)","(3072,)","(2569,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
|
49 |
+
47,nncf_module.bert.encoder.layer.7.output.dense,15,"(768, 3072)","(768, 2569)","(768,)","(768,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertOutput[output]/NNCFLinear[dense]/linear_0
|
50 |
+
48,nncf_module.bert.encoder.layer.8.attention.self.key,16,"(768, 768)","(256, 768)","(768,)","(256,)","[1, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
|
51 |
+
49,nncf_module.bert.encoder.layer.8.attention.self.value,16,"(768, 768)","(256, 768)","(768,)","(256,)","[1, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
|
52 |
+
50,nncf_module.bert.encoder.layer.8.attention.self.query,16,"(768, 768)","(256, 768)","(768,)","(256,)","[1, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
|
53 |
+
51,nncf_module.bert.encoder.layer.8.attention.output.dense,16,"(768, 768)","(768, 256)","(768,)","(768,)","[1, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
|
54 |
+
52,nncf_module.bert.encoder.layer.8.output.dense,17,"(768, 3072)","(768, 2094)","(768,)","(768,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertOutput[output]/NNCFLinear[dense]/linear_0
|
55 |
+
53,nncf_module.bert.encoder.layer.8.intermediate.dense,17,"(3072, 768)","(2094, 768)","(3072,)","(2094,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
|
56 |
+
54,nncf_module.bert.encoder.layer.9.attention.self.query,18,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
|
57 |
+
55,nncf_module.bert.encoder.layer.9.attention.output.dense,18,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
|
58 |
+
56,nncf_module.bert.encoder.layer.9.attention.self.value,18,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
|
59 |
+
57,nncf_module.bert.encoder.layer.9.attention.self.key,18,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
|
60 |
+
58,nncf_module.bert.encoder.layer.9.intermediate.dense,19,"(3072, 768)","(1009, 768)","(3072,)","(1009,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
|
61 |
+
59,nncf_module.bert.encoder.layer.9.output.dense,19,"(768, 3072)","(768, 1009)","(768,)","(768,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertOutput[output]/NNCFLinear[dense]/linear_0
|
62 |
+
60,nncf_module.bert.encoder.layer.10.attention.self.key,20,"(768, 768)","(320, 768)","(768,)","(320,)","[0, 3, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
|
63 |
+
61,nncf_module.bert.encoder.layer.10.attention.self.value,20,"(768, 768)","(320, 768)","(768,)","(320,)","[0, 3, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
|
64 |
+
62,nncf_module.bert.encoder.layer.10.attention.self.query,20,"(768, 768)","(320, 768)","(768,)","(320,)","[0, 3, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
|
65 |
+
63,nncf_module.bert.encoder.layer.10.attention.output.dense,20,"(768, 768)","(768, 320)","(768,)","(768,)","[0, 3, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
|
66 |
+
64,nncf_module.bert.encoder.layer.10.output.dense,21,"(768, 3072)","(768, 743)","(768,)","(768,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertOutput[output]/NNCFLinear[dense]/linear_0
|
67 |
+
65,nncf_module.bert.encoder.layer.10.intermediate.dense,21,"(3072, 768)","(743, 768)","(3072,)","(743,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
|
68 |
+
66,nncf_module.bert.encoder.layer.11.attention.output.dense,22,"(768, 768)","(768, 192)","(768,)","(768,)","[1, 2, 9]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
|
69 |
+
67,nncf_module.bert.encoder.layer.11.attention.self.key,22,"(768, 768)","(192, 768)","(768,)","(192,)","[1, 2, 9]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
|
70 |
+
68,nncf_module.bert.encoder.layer.11.attention.self.value,22,"(768, 768)","(192, 768)","(768,)","(192,)","[1, 2, 9]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
|
71 |
+
69,nncf_module.bert.encoder.layer.11.attention.self.query,22,"(768, 768)","(192, 768)","(768,)","(192,)","[1, 2, 9]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
|
72 |
+
70,nncf_module.bert.encoder.layer.11.intermediate.dense,23,"(3072, 768)","(605, 768)","(3072,)","(605,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
|
73 |
+
71,nncf_module.bert.encoder.layer.11.output.dense,23,"(768, 3072)","(768, 605)","(768,)","(768,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertOutput[output]/NNCFLinear[dense]/linear_0
|
checkpoint-35000/torch_mask_structures.md
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
| | pt_module_name | block_id | weight_shape | prune_w_shape | bias_shape | prune_b_shape | head_id_to_keep | nncf_graph_node |
|
2 |
+
|---:|:---------------------------------------------------------|-----------:|:---------------|:----------------|:-------------|:----------------|:---------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
3 |
+
| 0 | nncf_module.bert.encoder.layer.0.attention.output.dense | 0 | (768, 768) | (768, 192) | (768,) | (768,) | [3, 8, 10] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0 |
|
4 |
+
| 1 | nncf_module.bert.encoder.layer.0.attention.self.value | 0 | (768, 768) | (192, 768) | (768,) | (192,) | [3, 8, 10] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0 |
|
5 |
+
| 2 | nncf_module.bert.encoder.layer.0.attention.self.key | 0 | (768, 768) | (192, 768) | (768,) | (192,) | [3, 8, 10] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0 |
|
6 |
+
| 3 | nncf_module.bert.encoder.layer.0.attention.self.query | 0 | (768, 768) | (192, 768) | (768,) | (192,) | [3, 8, 10] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0 |
|
7 |
+
| 4 | nncf_module.bert.encoder.layer.0.output.dense | 1 | (768, 3072) | (768, 2940) | (768,) | (768,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertOutput[output]/NNCFLinear[dense]/linear_0 |
|
8 |
+
| 5 | nncf_module.bert.encoder.layer.0.intermediate.dense | 1 | (3072, 768) | (2940, 768) | (3072,) | (2940,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0 |
|
9 |
+
| 6 | nncf_module.bert.encoder.layer.1.attention.self.key | 2 | (768, 768) | (256, 768) | (768,) | (256,) | [4, 7, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0 |
|
10 |
+
| 7 | nncf_module.bert.encoder.layer.1.attention.self.query | 2 | (768, 768) | (256, 768) | (768,) | (256,) | [4, 7, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0 |
|
11 |
+
| 8 | nncf_module.bert.encoder.layer.1.attention.output.dense | 2 | (768, 768) | (768, 256) | (768,) | (768,) | [4, 7, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0 |
|
12 |
+
| 9 | nncf_module.bert.encoder.layer.1.attention.self.value | 2 | (768, 768) | (256, 768) | (768,) | (256,) | [4, 7, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0 |
|
13 |
+
| 10 | nncf_module.bert.encoder.layer.1.intermediate.dense | 3 | (3072, 768) | (2923, 768) | (3072,) | (2923,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0 |
|
14 |
+
| 11 | nncf_module.bert.encoder.layer.1.output.dense | 3 | (768, 3072) | (768, 2923) | (768,) | (768,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertOutput[output]/NNCFLinear[dense]/linear_0 |
|
15 |
+
| 12 | nncf_module.bert.encoder.layer.2.attention.self.value | 4 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0 |
|
16 |
+
| 13 | nncf_module.bert.encoder.layer.2.attention.output.dense | 4 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0 |
|
17 |
+
| 14 | nncf_module.bert.encoder.layer.2.attention.self.key | 4 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0 |
|
18 |
+
| 15 | nncf_module.bert.encoder.layer.2.attention.self.query | 4 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0 |
|
19 |
+
| 16 | nncf_module.bert.encoder.layer.2.output.dense | 5 | (768, 3072) | (768, 2980) | (768,) | (768,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertOutput[output]/NNCFLinear[dense]/linear_0 |
|
20 |
+
| 17 | nncf_module.bert.encoder.layer.2.intermediate.dense | 5 | (3072, 768) | (2980, 768) | (3072,) | (2980,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0 |
|
21 |
+
| 18 | nncf_module.bert.encoder.layer.3.attention.output.dense | 6 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0 |
|
22 |
+
| 19 | nncf_module.bert.encoder.layer.3.attention.self.value | 6 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0 |
|
23 |
+
| 20 | nncf_module.bert.encoder.layer.3.attention.self.key | 6 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0 |
|
24 |
+
| 21 | nncf_module.bert.encoder.layer.3.attention.self.query | 6 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0 |
|
25 |
+
| 22 | nncf_module.bert.encoder.layer.3.intermediate.dense | 7 | (3072, 768) | (2957, 768) | (3072,) | (2957,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0 |
|
26 |
+
| 23 | nncf_module.bert.encoder.layer.3.output.dense | 7 | (768, 3072) | (768, 2957) | (768,) | (768,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertOutput[output]/NNCFLinear[dense]/linear_0 |
|
27 |
+
| 24 | nncf_module.bert.encoder.layer.4.attention.self.key | 8 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0 |
|
28 |
+
| 25 | nncf_module.bert.encoder.layer.4.attention.self.value | 8 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0 |
|
29 |
+
| 26 | nncf_module.bert.encoder.layer.4.attention.output.dense | 8 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0 |
|
30 |
+
| 27 | nncf_module.bert.encoder.layer.4.attention.self.query | 8 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0 |
|
31 |
+
| 28 | nncf_module.bert.encoder.layer.4.intermediate.dense | 9 | (3072, 768) | (2906, 768) | (3072,) | (2906,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0 |
|
32 |
+
| 29 | nncf_module.bert.encoder.layer.4.output.dense | 9 | (768, 3072) | (768, 2906) | (768,) | (768,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertOutput[output]/NNCFLinear[dense]/linear_0 |
|
33 |
+
| 30 | nncf_module.bert.encoder.layer.5.attention.self.value | 10 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0 |
|
34 |
+
| 31 | nncf_module.bert.encoder.layer.5.attention.output.dense | 10 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0 |
|
35 |
+
| 32 | nncf_module.bert.encoder.layer.5.attention.self.query | 10 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0 |
|
36 |
+
| 33 | nncf_module.bert.encoder.layer.5.attention.self.key | 10 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0 |
|
37 |
+
| 34 | nncf_module.bert.encoder.layer.5.intermediate.dense | 11 | (3072, 768) | (2865, 768) | (3072,) | (2865,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0 |
|
38 |
+
| 35 | nncf_module.bert.encoder.layer.5.output.dense | 11 | (768, 3072) | (768, 2865) | (768,) | (768,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertOutput[output]/NNCFLinear[dense]/linear_0 |
|
39 |
+
| 36 | nncf_module.bert.encoder.layer.6.attention.self.value | 12 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0 |
|
40 |
+
| 37 | nncf_module.bert.encoder.layer.6.attention.self.query | 12 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0 |
|
41 |
+
| 38 | nncf_module.bert.encoder.layer.6.attention.self.key | 12 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0 |
|
42 |
+
| 39 | nncf_module.bert.encoder.layer.6.attention.output.dense | 12 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0 |
|
43 |
+
| 40 | nncf_module.bert.encoder.layer.6.output.dense | 13 | (768, 3072) | (768, 2759) | (768,) | (768,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertOutput[output]/NNCFLinear[dense]/linear_0 |
|
44 |
+
| 41 | nncf_module.bert.encoder.layer.6.intermediate.dense | 13 | (3072, 768) | (2759, 768) | (3072,) | (2759,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0 |
|
45 |
+
| 42 | nncf_module.bert.encoder.layer.7.attention.self.query | 14 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0 |
|
46 |
+
| 43 | nncf_module.bert.encoder.layer.7.attention.self.key | 14 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0 |
|
47 |
+
| 44 | nncf_module.bert.encoder.layer.7.attention.self.value | 14 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0 |
|
48 |
+
| 45 | nncf_module.bert.encoder.layer.7.attention.output.dense | 14 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0 |
|
49 |
+
| 46 | nncf_module.bert.encoder.layer.7.intermediate.dense | 15 | (3072, 768) | (2569, 768) | (3072,) | (2569,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0 |
|
50 |
+
| 47 | nncf_module.bert.encoder.layer.7.output.dense | 15 | (768, 3072) | (768, 2569) | (768,) | (768,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertOutput[output]/NNCFLinear[dense]/linear_0 |
|
51 |
+
| 48 | nncf_module.bert.encoder.layer.8.attention.self.key | 16 | (768, 768) | (256, 768) | (768,) | (256,) | [1, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0 |
|
52 |
+
| 49 | nncf_module.bert.encoder.layer.8.attention.self.value | 16 | (768, 768) | (256, 768) | (768,) | (256,) | [1, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0 |
|
53 |
+
| 50 | nncf_module.bert.encoder.layer.8.attention.self.query | 16 | (768, 768) | (256, 768) | (768,) | (256,) | [1, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0 |
|
54 |
+
| 51 | nncf_module.bert.encoder.layer.8.attention.output.dense | 16 | (768, 768) | (768, 256) | (768,) | (768,) | [1, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0 |
|
55 |
+
| 52 | nncf_module.bert.encoder.layer.8.output.dense | 17 | (768, 3072) | (768, 2094) | (768,) | (768,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertOutput[output]/NNCFLinear[dense]/linear_0 |
|
56 |
+
| 53 | nncf_module.bert.encoder.layer.8.intermediate.dense | 17 | (3072, 768) | (2094, 768) | (3072,) | (2094,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0 |
|
57 |
+
| 54 | nncf_module.bert.encoder.layer.9.attention.self.query | 18 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0 |
|
58 |
+
| 55 | nncf_module.bert.encoder.layer.9.attention.output.dense | 18 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0 |
|
59 |
+
| 56 | nncf_module.bert.encoder.layer.9.attention.self.value | 18 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0 |
|
60 |
+
| 57 | nncf_module.bert.encoder.layer.9.attention.self.key | 18 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0 |
|
61 |
+
| 58 | nncf_module.bert.encoder.layer.9.intermediate.dense | 19 | (3072, 768) | (1009, 768) | (3072,) | (1009,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0 |
|
62 |
+
| 59 | nncf_module.bert.encoder.layer.9.output.dense | 19 | (768, 3072) | (768, 1009) | (768,) | (768,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertOutput[output]/NNCFLinear[dense]/linear_0 |
|
63 |
+
| 60 | nncf_module.bert.encoder.layer.10.attention.self.key | 20 | (768, 768) | (320, 768) | (768,) | (320,) | [0, 3, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0 |
|
64 |
+
| 61 | nncf_module.bert.encoder.layer.10.attention.self.value | 20 | (768, 768) | (320, 768) | (768,) | (320,) | [0, 3, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0 |
|
65 |
+
| 62 | nncf_module.bert.encoder.layer.10.attention.self.query | 20 | (768, 768) | (320, 768) | (768,) | (320,) | [0, 3, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0 |
|
66 |
+
| 63 | nncf_module.bert.encoder.layer.10.attention.output.dense | 20 | (768, 768) | (768, 320) | (768,) | (768,) | [0, 3, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0 |
|
67 |
+
| 64 | nncf_module.bert.encoder.layer.10.output.dense | 21 | (768, 3072) | (768, 743) | (768,) | (768,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertOutput[output]/NNCFLinear[dense]/linear_0 |
|
68 |
+
| 65 | nncf_module.bert.encoder.layer.10.intermediate.dense | 21 | (3072, 768) | (743, 768) | (3072,) | (743,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0 |
|
69 |
+
| 66 | nncf_module.bert.encoder.layer.11.attention.output.dense | 22 | (768, 768) | (768, 192) | (768,) | (768,) | [1, 2, 9] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0 |
|
70 |
+
| 67 | nncf_module.bert.encoder.layer.11.attention.self.key | 22 | (768, 768) | (192, 768) | (768,) | (192,) | [1, 2, 9] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0 |
|
71 |
+
| 68 | nncf_module.bert.encoder.layer.11.attention.self.value | 22 | (768, 768) | (192, 768) | (768,) | (192,) | [1, 2, 9] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0 |
|
72 |
+
| 69 | nncf_module.bert.encoder.layer.11.attention.self.query | 22 | (768, 768) | (192, 768) | (768,) | (192,) | [1, 2, 9] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0 |
|
73 |
+
| 70 | nncf_module.bert.encoder.layer.11.intermediate.dense | 23 | (3072, 768) | (605, 768) | (3072,) | (605,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0 |
|
74 |
+
| 71 | nncf_module.bert.encoder.layer.11.output.dense | 23 | (768, 3072) | (768, 605) | (768,) | (768,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertOutput[output]/NNCFLinear[dense]/linear_0 |
|
checkpoint-35000/trainer_state.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:963740bdff2cf72416e92704c8f12f612711498e80bcb86943396aa8686d3525
|
3 |
+
size 18704532
|
checkpoint-35000/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3b71055db74256e103f6b235fc05792460b6f443079012f8566a63d151abf413
|
3 |
+
size 3183
|
checkpoint-35000/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|