Vui Seng Chua committed on
Commit
cd6bbb4
1 Parent(s): 3b67ed8

Add ckpt@35K

Browse files
.gitattributes CHANGED
@@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ trainer_state.json filter=lfs diff=lfs merge=lfs -text
checkpoint-35000/NNCFNetwork.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cafb8d66bd60b7c0660a3dbcb033936b27c7cf47f4c7ba854405f8682d039fe
3
+ size 435667833
checkpoint-35000/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "bert-base-uncased",
3
+ "architectures": [
4
+ "NNCFNetwork"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "gradient_checkpointing": false,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 3072,
13
+ "layer_norm_eps": 1e-12,
14
+ "max_position_embeddings": 512,
15
+ "model_type": "bert",
16
+ "num_attention_heads": 12,
17
+ "num_hidden_layers": 12,
18
+ "pad_token_id": 0,
19
+ "position_embedding_type": "absolute",
20
+ "torch_dtype": "float32",
21
+ "transformers_version": "4.9.1",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 30522
25
+ }
checkpoint-35000/onnx_sparsity.csv ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,layer_id,shape,nparam,nnz,sparsity
2
+ 0,nncf_module.bert.encoder.layer.0.attention.self.query.bias,[768],768,192,0.75
3
+ 1,nncf_module.bert.encoder.layer.0.attention.self.query.weight,"[768, 768]",589824,147456,0.75
4
+ 2,nncf_module.bert.encoder.layer.0.attention.self.key.bias,[768],768,192,0.75
5
+ 3,nncf_module.bert.encoder.layer.0.attention.self.key.weight,"[768, 768]",589824,147456,0.75
6
+ 4,nncf_module.bert.encoder.layer.0.attention.self.value.bias,[768],768,192,0.75
7
+ 5,nncf_module.bert.encoder.layer.0.attention.self.value.weight,"[768, 768]",589824,147456,0.75
8
+ 6,nncf_module.bert.encoder.layer.0.attention.output.dense.bias,[768],768,768,0.0
9
+ 7,nncf_module.bert.encoder.layer.0.attention.output.dense.weight,"[768, 768]",589824,147456,0.75
10
+ 8,nncf_module.bert.encoder.layer.0.intermediate.dense.bias,[3072],3072,2940,0.04296875
11
+ 9,nncf_module.bert.encoder.layer.0.intermediate.dense.weight,"[3072, 768]",2359296,2257920,0.04296875
12
+ 10,nncf_module.bert.encoder.layer.0.output.dense.bias,[768],768,768,0.0
13
+ 11,nncf_module.bert.encoder.layer.0.output.dense.weight,"[768, 3072]",2359296,2257920,0.04296875
14
+ 12,nncf_module.bert.encoder.layer.1.attention.self.query.bias,[768],768,256,0.6666666666666667
15
+ 13,nncf_module.bert.encoder.layer.1.attention.self.query.weight,"[768, 768]",589824,196608,0.6666666666666667
16
+ 14,nncf_module.bert.encoder.layer.1.attention.self.key.bias,[768],768,256,0.6666666666666667
17
+ 15,nncf_module.bert.encoder.layer.1.attention.self.key.weight,"[768, 768]",589824,196608,0.6666666666666667
18
+ 16,nncf_module.bert.encoder.layer.1.attention.self.value.bias,[768],768,256,0.6666666666666667
19
+ 17,nncf_module.bert.encoder.layer.1.attention.self.value.weight,"[768, 768]",589824,196608,0.6666666666666667
20
+ 18,nncf_module.bert.encoder.layer.1.attention.output.dense.bias,[768],768,768,0.0
21
+ 19,nncf_module.bert.encoder.layer.1.attention.output.dense.weight,"[768, 768]",589824,196608,0.6666666666666667
22
+ 20,nncf_module.bert.encoder.layer.1.intermediate.dense.bias,[3072],3072,2923,0.04850260416666663
23
+ 21,nncf_module.bert.encoder.layer.1.intermediate.dense.weight,"[3072, 768]",2359296,2244864,0.04850260416666663
24
+ 22,nncf_module.bert.encoder.layer.1.output.dense.bias,[768],768,768,0.0
25
+ 23,nncf_module.bert.encoder.layer.1.output.dense.weight,"[768, 3072]",2359296,2244864,0.04850260416666663
26
+ 24,nncf_module.bert.encoder.layer.2.attention.self.query.bias,[768],768,768,0.0
27
+ 25,nncf_module.bert.encoder.layer.2.attention.self.query.weight,"[768, 768]",589824,589824,0.0
28
+ 26,nncf_module.bert.encoder.layer.2.attention.self.key.bias,[768],768,768,0.0
29
+ 27,nncf_module.bert.encoder.layer.2.attention.self.key.weight,"[768, 768]",589824,589824,0.0
30
+ 28,nncf_module.bert.encoder.layer.2.attention.self.value.bias,[768],768,768,0.0
31
+ 29,nncf_module.bert.encoder.layer.2.attention.self.value.weight,"[768, 768]",589824,589824,0.0
32
+ 30,nncf_module.bert.encoder.layer.2.attention.output.dense.bias,[768],768,768,0.0
33
+ 31,nncf_module.bert.encoder.layer.2.attention.output.dense.weight,"[768, 768]",589824,589824,0.0
34
+ 32,nncf_module.bert.encoder.layer.2.intermediate.dense.bias,[3072],3072,2980,0.02994791666666663
35
+ 33,nncf_module.bert.encoder.layer.2.intermediate.dense.weight,"[3072, 768]",2359296,2288640,0.02994791666666663
36
+ 34,nncf_module.bert.encoder.layer.2.output.dense.bias,[768],768,768,0.0
37
+ 35,nncf_module.bert.encoder.layer.2.output.dense.weight,"[768, 3072]",2359296,2288640,0.02994791666666663
38
+ 36,nncf_module.bert.encoder.layer.3.attention.self.query.bias,[768],768,768,0.0
39
+ 37,nncf_module.bert.encoder.layer.3.attention.self.query.weight,"[768, 768]",589824,589824,0.0
40
+ 38,nncf_module.bert.encoder.layer.3.attention.self.key.bias,[768],768,768,0.0
41
+ 39,nncf_module.bert.encoder.layer.3.attention.self.key.weight,"[768, 768]",589824,589824,0.0
42
+ 40,nncf_module.bert.encoder.layer.3.attention.self.value.bias,[768],768,768,0.0
43
+ 41,nncf_module.bert.encoder.layer.3.attention.self.value.weight,"[768, 768]",589824,589824,0.0
44
+ 42,nncf_module.bert.encoder.layer.3.attention.output.dense.bias,[768],768,768,0.0
45
+ 43,nncf_module.bert.encoder.layer.3.attention.output.dense.weight,"[768, 768]",589824,589824,0.0
46
+ 44,nncf_module.bert.encoder.layer.3.intermediate.dense.bias,[3072],3072,2957,0.03743489583333337
47
+ 45,nncf_module.bert.encoder.layer.3.intermediate.dense.weight,"[3072, 768]",2359296,2270976,0.03743489583333337
48
+ 46,nncf_module.bert.encoder.layer.3.output.dense.bias,[768],768,768,0.0
49
+ 47,nncf_module.bert.encoder.layer.3.output.dense.weight,"[768, 3072]",2359296,2270976,0.03743489583333337
50
+ 48,nncf_module.bert.encoder.layer.4.attention.self.query.bias,[768],768,768,0.0
51
+ 49,nncf_module.bert.encoder.layer.4.attention.self.query.weight,"[768, 768]",589824,589824,0.0
52
+ 50,nncf_module.bert.encoder.layer.4.attention.self.key.bias,[768],768,768,0.0
53
+ 51,nncf_module.bert.encoder.layer.4.attention.self.key.weight,"[768, 768]",589824,589824,0.0
54
+ 52,nncf_module.bert.encoder.layer.4.attention.self.value.bias,[768],768,768,0.0
55
+ 53,nncf_module.bert.encoder.layer.4.attention.self.value.weight,"[768, 768]",589824,589824,0.0
56
+ 54,nncf_module.bert.encoder.layer.4.attention.output.dense.bias,[768],768,768,0.0
57
+ 55,nncf_module.bert.encoder.layer.4.attention.output.dense.weight,"[768, 768]",589824,589824,0.0
58
+ 56,nncf_module.bert.encoder.layer.4.intermediate.dense.bias,[3072],3072,2906,0.05403645833333337
59
+ 57,nncf_module.bert.encoder.layer.4.intermediate.dense.weight,"[3072, 768]",2359296,2231808,0.05403645833333337
60
+ 58,nncf_module.bert.encoder.layer.4.output.dense.bias,[768],768,768,0.0
61
+ 59,nncf_module.bert.encoder.layer.4.output.dense.weight,"[768, 3072]",2359296,2231808,0.05403645833333337
62
+ 60,nncf_module.bert.encoder.layer.5.attention.self.query.bias,[768],768,768,0.0
63
+ 61,nncf_module.bert.encoder.layer.5.attention.self.query.weight,"[768, 768]",589824,589824,0.0
64
+ 62,nncf_module.bert.encoder.layer.5.attention.self.key.bias,[768],768,768,0.0
65
+ 63,nncf_module.bert.encoder.layer.5.attention.self.key.weight,"[768, 768]",589824,589824,0.0
66
+ 64,nncf_module.bert.encoder.layer.5.attention.self.value.bias,[768],768,768,0.0
67
+ 65,nncf_module.bert.encoder.layer.5.attention.self.value.weight,"[768, 768]",589824,589824,0.0
68
+ 66,nncf_module.bert.encoder.layer.5.attention.output.dense.bias,[768],768,768,0.0
69
+ 67,nncf_module.bert.encoder.layer.5.attention.output.dense.weight,"[768, 768]",589824,589824,0.0
70
+ 68,nncf_module.bert.encoder.layer.5.intermediate.dense.bias,[3072],3072,2865,0.0673828125
71
+ 69,nncf_module.bert.encoder.layer.5.intermediate.dense.weight,"[3072, 768]",2359296,2200320,0.0673828125
72
+ 70,nncf_module.bert.encoder.layer.5.output.dense.bias,[768],768,768,0.0
73
+ 71,nncf_module.bert.encoder.layer.5.output.dense.weight,"[768, 3072]",2359296,2200320,0.0673828125
74
+ 72,nncf_module.bert.encoder.layer.6.attention.self.query.bias,[768],768,768,0.0
75
+ 73,nncf_module.bert.encoder.layer.6.attention.self.query.weight,"[768, 768]",589824,589824,0.0
76
+ 74,nncf_module.bert.encoder.layer.6.attention.self.key.bias,[768],768,768,0.0
77
+ 75,nncf_module.bert.encoder.layer.6.attention.self.key.weight,"[768, 768]",589824,589824,0.0
78
+ 76,nncf_module.bert.encoder.layer.6.attention.self.value.bias,[768],768,768,0.0
79
+ 77,nncf_module.bert.encoder.layer.6.attention.self.value.weight,"[768, 768]",589824,589824,0.0
80
+ 78,nncf_module.bert.encoder.layer.6.attention.output.dense.bias,[768],768,768,0.0
81
+ 79,nncf_module.bert.encoder.layer.6.attention.output.dense.weight,"[768, 768]",589824,589824,0.0
82
+ 80,nncf_module.bert.encoder.layer.6.intermediate.dense.bias,[3072],3072,2759,0.10188802083333337
83
+ 81,nncf_module.bert.encoder.layer.6.intermediate.dense.weight,"[3072, 768]",2359296,2118912,0.10188802083333337
84
+ 82,nncf_module.bert.encoder.layer.6.output.dense.bias,[768],768,768,0.0
85
+ 83,nncf_module.bert.encoder.layer.6.output.dense.weight,"[768, 3072]",2359296,2118912,0.10188802083333337
86
+ 84,nncf_module.bert.encoder.layer.7.attention.self.query.bias,[768],768,768,0.0
87
+ 85,nncf_module.bert.encoder.layer.7.attention.self.query.weight,"[768, 768]",589824,589824,0.0
88
+ 86,nncf_module.bert.encoder.layer.7.attention.self.key.bias,[768],768,768,0.0
89
+ 87,nncf_module.bert.encoder.layer.7.attention.self.key.weight,"[768, 768]",589824,589824,0.0
90
+ 88,nncf_module.bert.encoder.layer.7.attention.self.value.bias,[768],768,768,0.0
91
+ 89,nncf_module.bert.encoder.layer.7.attention.self.value.weight,"[768, 768]",589824,589824,0.0
92
+ 90,nncf_module.bert.encoder.layer.7.attention.output.dense.bias,[768],768,768,0.0
93
+ 91,nncf_module.bert.encoder.layer.7.attention.output.dense.weight,"[768, 768]",589824,589824,0.0
94
+ 92,nncf_module.bert.encoder.layer.7.intermediate.dense.bias,[3072],3072,2569,0.16373697916666663
95
+ 93,nncf_module.bert.encoder.layer.7.intermediate.dense.weight,"[3072, 768]",2359296,1972992,0.16373697916666663
96
+ 94,nncf_module.bert.encoder.layer.7.output.dense.bias,[768],768,768,0.0
97
+ 95,nncf_module.bert.encoder.layer.7.output.dense.weight,"[768, 3072]",2359296,1972992,0.16373697916666663
98
+ 96,nncf_module.bert.encoder.layer.8.attention.self.query.bias,[768],768,256,0.6666666666666667
99
+ 97,nncf_module.bert.encoder.layer.8.attention.self.query.weight,"[768, 768]",589824,196608,0.6666666666666667
100
+ 98,nncf_module.bert.encoder.layer.8.attention.self.key.bias,[768],768,256,0.6666666666666667
101
+ 99,nncf_module.bert.encoder.layer.8.attention.self.key.weight,"[768, 768]",589824,196608,0.6666666666666667
102
+ 100,nncf_module.bert.encoder.layer.8.attention.self.value.bias,[768],768,256,0.6666666666666667
103
+ 101,nncf_module.bert.encoder.layer.8.attention.self.value.weight,"[768, 768]",589824,196608,0.6666666666666667
104
+ 102,nncf_module.bert.encoder.layer.8.attention.output.dense.bias,[768],768,768,0.0
105
+ 103,nncf_module.bert.encoder.layer.8.attention.output.dense.weight,"[768, 768]",589824,196608,0.6666666666666667
106
+ 104,nncf_module.bert.encoder.layer.8.intermediate.dense.bias,[3072],3072,2094,0.318359375
107
+ 105,nncf_module.bert.encoder.layer.8.intermediate.dense.weight,"[3072, 768]",2359296,1608192,0.318359375
108
+ 106,nncf_module.bert.encoder.layer.8.output.dense.bias,[768],768,768,0.0
109
+ 107,nncf_module.bert.encoder.layer.8.output.dense.weight,"[768, 3072]",2359296,1608192,0.318359375
110
+ 108,nncf_module.bert.encoder.layer.9.attention.self.query.bias,[768],768,768,0.0
111
+ 109,nncf_module.bert.encoder.layer.9.attention.self.query.weight,"[768, 768]",589824,589824,0.0
112
+ 110,nncf_module.bert.encoder.layer.9.attention.self.key.bias,[768],768,768,0.0
113
+ 111,nncf_module.bert.encoder.layer.9.attention.self.key.weight,"[768, 768]",589824,589824,0.0
114
+ 112,nncf_module.bert.encoder.layer.9.attention.self.value.bias,[768],768,768,0.0
115
+ 113,nncf_module.bert.encoder.layer.9.attention.self.value.weight,"[768, 768]",589824,589824,0.0
116
+ 114,nncf_module.bert.encoder.layer.9.attention.output.dense.bias,[768],768,768,0.0
117
+ 115,nncf_module.bert.encoder.layer.9.attention.output.dense.weight,"[768, 768]",589824,589824,0.0
118
+ 116,nncf_module.bert.encoder.layer.9.intermediate.dense.bias,[3072],3072,1009,0.6715494791666667
119
+ 117,nncf_module.bert.encoder.layer.9.intermediate.dense.weight,"[3072, 768]",2359296,774912,0.6715494791666667
120
+ 118,nncf_module.bert.encoder.layer.9.output.dense.bias,[768],768,768,0.0
121
+ 119,nncf_module.bert.encoder.layer.9.output.dense.weight,"[768, 3072]",2359296,774912,0.6715494791666667
122
+ 120,nncf_module.bert.encoder.layer.10.attention.self.query.bias,[768],768,320,0.5833333333333333
123
+ 121,nncf_module.bert.encoder.layer.10.attention.self.query.weight,"[768, 768]",589824,245760,0.5833333333333333
124
+ 122,nncf_module.bert.encoder.layer.10.attention.self.key.bias,[768],768,320,0.5833333333333333
125
+ 123,nncf_module.bert.encoder.layer.10.attention.self.key.weight,"[768, 768]",589824,245760,0.5833333333333333
126
+ 124,nncf_module.bert.encoder.layer.10.attention.self.value.bias,[768],768,320,0.5833333333333333
127
+ 125,nncf_module.bert.encoder.layer.10.attention.self.value.weight,"[768, 768]",589824,245760,0.5833333333333333
128
+ 126,nncf_module.bert.encoder.layer.10.attention.output.dense.bias,[768],768,768,0.0
129
+ 127,nncf_module.bert.encoder.layer.10.attention.output.dense.weight,"[768, 768]",589824,245760,0.5833333333333333
130
+ 128,nncf_module.bert.encoder.layer.10.intermediate.dense.bias,[3072],3072,743,0.7581380208333334
131
+ 129,nncf_module.bert.encoder.layer.10.intermediate.dense.weight,"[3072, 768]",2359296,570624,0.7581380208333334
132
+ 130,nncf_module.bert.encoder.layer.10.output.dense.bias,[768],768,768,0.0
133
+ 131,nncf_module.bert.encoder.layer.10.output.dense.weight,"[768, 3072]",2359296,570624,0.7581380208333334
134
+ 132,nncf_module.bert.encoder.layer.11.attention.self.query.bias,[768],768,192,0.75
135
+ 133,nncf_module.bert.encoder.layer.11.attention.self.query.weight,"[768, 768]",589824,147456,0.75
136
+ 134,nncf_module.bert.encoder.layer.11.attention.self.key.bias,[768],768,192,0.75
137
+ 135,nncf_module.bert.encoder.layer.11.attention.self.key.weight,"[768, 768]",589824,147456,0.75
138
+ 136,nncf_module.bert.encoder.layer.11.attention.self.value.bias,[768],768,192,0.75
139
+ 137,nncf_module.bert.encoder.layer.11.attention.self.value.weight,"[768, 768]",589824,147456,0.75
140
+ 138,nncf_module.bert.encoder.layer.11.attention.output.dense.bias,[768],768,768,0.0
141
+ 139,nncf_module.bert.encoder.layer.11.attention.output.dense.weight,"[768, 768]",589824,147456,0.75
142
+ 140,nncf_module.bert.encoder.layer.11.intermediate.dense.bias,[3072],3072,605,0.8030598958333334
143
+ 141,nncf_module.bert.encoder.layer.11.intermediate.dense.weight,"[3072, 768]",2359296,464640,0.8030598958333334
144
+ 142,nncf_module.bert.encoder.layer.11.output.dense.bias,[768],768,768,0.0
145
+ 143,nncf_module.bert.encoder.layer.11.output.dense.weight,"[768, 3072]",2359296,464640,0.8030598958333334
146
+ 144,nncf_module.qa_outputs.bias,[2],2,2,0.0
147
+ 145,nncf_module.qa_outputs.weight,"[2, 768]",1536,1536,0.0
checkpoint-35000/onnx_sparsity.md ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ | | layer_id | shape | nparam | nnz | sparsity |
2
+ |----:|:----------------------------------------------------------------|:------------|---------:|--------:|-----------:|
3
+ | 0 | nncf_module.bert.encoder.layer.0.attention.self.query.bias | [768] | 768 | 192 | 0.75 |
4
+ | 1 | nncf_module.bert.encoder.layer.0.attention.self.query.weight | [768, 768] | 589824 | 147456 | 0.75 |
5
+ | 2 | nncf_module.bert.encoder.layer.0.attention.self.key.bias | [768] | 768 | 192 | 0.75 |
6
+ | 3 | nncf_module.bert.encoder.layer.0.attention.self.key.weight | [768, 768] | 589824 | 147456 | 0.75 |
7
+ | 4 | nncf_module.bert.encoder.layer.0.attention.self.value.bias | [768] | 768 | 192 | 0.75 |
8
+ | 5 | nncf_module.bert.encoder.layer.0.attention.self.value.weight | [768, 768] | 589824 | 147456 | 0.75 |
9
+ | 6 | nncf_module.bert.encoder.layer.0.attention.output.dense.bias | [768] | 768 | 768 | 0 |
10
+ | 7 | nncf_module.bert.encoder.layer.0.attention.output.dense.weight | [768, 768] | 589824 | 147456 | 0.75 |
11
+ | 8 | nncf_module.bert.encoder.layer.0.intermediate.dense.bias | [3072] | 3072 | 2940 | 0.0429688 |
12
+ | 9 | nncf_module.bert.encoder.layer.0.intermediate.dense.weight | [3072, 768] | 2359296 | 2257920 | 0.0429688 |
13
+ | 10 | nncf_module.bert.encoder.layer.0.output.dense.bias | [768] | 768 | 768 | 0 |
14
+ | 11 | nncf_module.bert.encoder.layer.0.output.dense.weight | [768, 3072] | 2359296 | 2257920 | 0.0429688 |
15
+ | 12 | nncf_module.bert.encoder.layer.1.attention.self.query.bias | [768] | 768 | 256 | 0.666667 |
16
+ | 13 | nncf_module.bert.encoder.layer.1.attention.self.query.weight | [768, 768] | 589824 | 196608 | 0.666667 |
17
+ | 14 | nncf_module.bert.encoder.layer.1.attention.self.key.bias | [768] | 768 | 256 | 0.666667 |
18
+ | 15 | nncf_module.bert.encoder.layer.1.attention.self.key.weight | [768, 768] | 589824 | 196608 | 0.666667 |
19
+ | 16 | nncf_module.bert.encoder.layer.1.attention.self.value.bias | [768] | 768 | 256 | 0.666667 |
20
+ | 17 | nncf_module.bert.encoder.layer.1.attention.self.value.weight | [768, 768] | 589824 | 196608 | 0.666667 |
21
+ | 18 | nncf_module.bert.encoder.layer.1.attention.output.dense.bias | [768] | 768 | 768 | 0 |
22
+ | 19 | nncf_module.bert.encoder.layer.1.attention.output.dense.weight | [768, 768] | 589824 | 196608 | 0.666667 |
23
+ | 20 | nncf_module.bert.encoder.layer.1.intermediate.dense.bias | [3072] | 3072 | 2923 | 0.0485026 |
24
+ | 21 | nncf_module.bert.encoder.layer.1.intermediate.dense.weight | [3072, 768] | 2359296 | 2244864 | 0.0485026 |
25
+ | 22 | nncf_module.bert.encoder.layer.1.output.dense.bias | [768] | 768 | 768 | 0 |
26
+ | 23 | nncf_module.bert.encoder.layer.1.output.dense.weight | [768, 3072] | 2359296 | 2244864 | 0.0485026 |
27
+ | 24 | nncf_module.bert.encoder.layer.2.attention.self.query.bias | [768] | 768 | 768 | 0 |
28
+ | 25 | nncf_module.bert.encoder.layer.2.attention.self.query.weight | [768, 768] | 589824 | 589824 | 0 |
29
+ | 26 | nncf_module.bert.encoder.layer.2.attention.self.key.bias | [768] | 768 | 768 | 0 |
30
+ | 27 | nncf_module.bert.encoder.layer.2.attention.self.key.weight | [768, 768] | 589824 | 589824 | 0 |
31
+ | 28 | nncf_module.bert.encoder.layer.2.attention.self.value.bias | [768] | 768 | 768 | 0 |
32
+ | 29 | nncf_module.bert.encoder.layer.2.attention.self.value.weight | [768, 768] | 589824 | 589824 | 0 |
33
+ | 30 | nncf_module.bert.encoder.layer.2.attention.output.dense.bias | [768] | 768 | 768 | 0 |
34
+ | 31 | nncf_module.bert.encoder.layer.2.attention.output.dense.weight | [768, 768] | 589824 | 589824 | 0 |
35
+ | 32 | nncf_module.bert.encoder.layer.2.intermediate.dense.bias | [3072] | 3072 | 2980 | 0.0299479 |
36
+ | 33 | nncf_module.bert.encoder.layer.2.intermediate.dense.weight | [3072, 768] | 2359296 | 2288640 | 0.0299479 |
37
+ | 34 | nncf_module.bert.encoder.layer.2.output.dense.bias | [768] | 768 | 768 | 0 |
38
+ | 35 | nncf_module.bert.encoder.layer.2.output.dense.weight | [768, 3072] | 2359296 | 2288640 | 0.0299479 |
39
+ | 36 | nncf_module.bert.encoder.layer.3.attention.self.query.bias | [768] | 768 | 768 | 0 |
40
+ | 37 | nncf_module.bert.encoder.layer.3.attention.self.query.weight | [768, 768] | 589824 | 589824 | 0 |
41
+ | 38 | nncf_module.bert.encoder.layer.3.attention.self.key.bias | [768] | 768 | 768 | 0 |
42
+ | 39 | nncf_module.bert.encoder.layer.3.attention.self.key.weight | [768, 768] | 589824 | 589824 | 0 |
43
+ | 40 | nncf_module.bert.encoder.layer.3.attention.self.value.bias | [768] | 768 | 768 | 0 |
44
+ | 41 | nncf_module.bert.encoder.layer.3.attention.self.value.weight | [768, 768] | 589824 | 589824 | 0 |
45
+ | 42 | nncf_module.bert.encoder.layer.3.attention.output.dense.bias | [768] | 768 | 768 | 0 |
46
+ | 43 | nncf_module.bert.encoder.layer.3.attention.output.dense.weight | [768, 768] | 589824 | 589824 | 0 |
47
+ | 44 | nncf_module.bert.encoder.layer.3.intermediate.dense.bias | [3072] | 3072 | 2957 | 0.0374349 |
48
+ | 45 | nncf_module.bert.encoder.layer.3.intermediate.dense.weight | [3072, 768] | 2359296 | 2270976 | 0.0374349 |
49
+ | 46 | nncf_module.bert.encoder.layer.3.output.dense.bias | [768] | 768 | 768 | 0 |
50
+ | 47 | nncf_module.bert.encoder.layer.3.output.dense.weight | [768, 3072] | 2359296 | 2270976 | 0.0374349 |
51
+ | 48 | nncf_module.bert.encoder.layer.4.attention.self.query.bias | [768] | 768 | 768 | 0 |
52
+ | 49 | nncf_module.bert.encoder.layer.4.attention.self.query.weight | [768, 768] | 589824 | 589824 | 0 |
53
+ | 50 | nncf_module.bert.encoder.layer.4.attention.self.key.bias | [768] | 768 | 768 | 0 |
54
+ | 51 | nncf_module.bert.encoder.layer.4.attention.self.key.weight | [768, 768] | 589824 | 589824 | 0 |
55
+ | 52 | nncf_module.bert.encoder.layer.4.attention.self.value.bias | [768] | 768 | 768 | 0 |
56
+ | 53 | nncf_module.bert.encoder.layer.4.attention.self.value.weight | [768, 768] | 589824 | 589824 | 0 |
57
+ | 54 | nncf_module.bert.encoder.layer.4.attention.output.dense.bias | [768] | 768 | 768 | 0 |
58
+ | 55 | nncf_module.bert.encoder.layer.4.attention.output.dense.weight | [768, 768] | 589824 | 589824 | 0 |
59
+ | 56 | nncf_module.bert.encoder.layer.4.intermediate.dense.bias | [3072] | 3072 | 2906 | 0.0540365 |
60
+ | 57 | nncf_module.bert.encoder.layer.4.intermediate.dense.weight | [3072, 768] | 2359296 | 2231808 | 0.0540365 |
61
+ | 58 | nncf_module.bert.encoder.layer.4.output.dense.bias | [768] | 768 | 768 | 0 |
62
+ | 59 | nncf_module.bert.encoder.layer.4.output.dense.weight | [768, 3072] | 2359296 | 2231808 | 0.0540365 |
63
+ | 60 | nncf_module.bert.encoder.layer.5.attention.self.query.bias | [768] | 768 | 768 | 0 |
64
+ | 61 | nncf_module.bert.encoder.layer.5.attention.self.query.weight | [768, 768] | 589824 | 589824 | 0 |
65
+ | 62 | nncf_module.bert.encoder.layer.5.attention.self.key.bias | [768] | 768 | 768 | 0 |
66
+ | 63 | nncf_module.bert.encoder.layer.5.attention.self.key.weight | [768, 768] | 589824 | 589824 | 0 |
67
+ | 64 | nncf_module.bert.encoder.layer.5.attention.self.value.bias | [768] | 768 | 768 | 0 |
68
+ | 65 | nncf_module.bert.encoder.layer.5.attention.self.value.weight | [768, 768] | 589824 | 589824 | 0 |
69
+ | 66 | nncf_module.bert.encoder.layer.5.attention.output.dense.bias | [768] | 768 | 768 | 0 |
70
+ | 67 | nncf_module.bert.encoder.layer.5.attention.output.dense.weight | [768, 768] | 589824 | 589824 | 0 |
71
+ | 68 | nncf_module.bert.encoder.layer.5.intermediate.dense.bias | [3072] | 3072 | 2865 | 0.0673828 |
72
+ | 69 | nncf_module.bert.encoder.layer.5.intermediate.dense.weight | [3072, 768] | 2359296 | 2200320 | 0.0673828 |
73
+ | 70 | nncf_module.bert.encoder.layer.5.output.dense.bias | [768] | 768 | 768 | 0 |
74
+ | 71 | nncf_module.bert.encoder.layer.5.output.dense.weight | [768, 3072] | 2359296 | 2200320 | 0.0673828 |
75
+ | 72 | nncf_module.bert.encoder.layer.6.attention.self.query.bias | [768] | 768 | 768 | 0 |
76
+ | 73 | nncf_module.bert.encoder.layer.6.attention.self.query.weight | [768, 768] | 589824 | 589824 | 0 |
77
+ | 74 | nncf_module.bert.encoder.layer.6.attention.self.key.bias | [768] | 768 | 768 | 0 |
78
+ | 75 | nncf_module.bert.encoder.layer.6.attention.self.key.weight | [768, 768] | 589824 | 589824 | 0 |
79
+ | 76 | nncf_module.bert.encoder.layer.6.attention.self.value.bias | [768] | 768 | 768 | 0 |
80
+ | 77 | nncf_module.bert.encoder.layer.6.attention.self.value.weight | [768, 768] | 589824 | 589824 | 0 |
81
+ | 78 | nncf_module.bert.encoder.layer.6.attention.output.dense.bias | [768] | 768 | 768 | 0 |
82
+ | 79 | nncf_module.bert.encoder.layer.6.attention.output.dense.weight | [768, 768] | 589824 | 589824 | 0 |
83
+ | 80 | nncf_module.bert.encoder.layer.6.intermediate.dense.bias | [3072] | 3072 | 2759 | 0.101888 |
84
+ | 81 | nncf_module.bert.encoder.layer.6.intermediate.dense.weight | [3072, 768] | 2359296 | 2118912 | 0.101888 |
85
+ | 82 | nncf_module.bert.encoder.layer.6.output.dense.bias | [768] | 768 | 768 | 0 |
86
+ | 83 | nncf_module.bert.encoder.layer.6.output.dense.weight | [768, 3072] | 2359296 | 2118912 | 0.101888 |
87
+ | 84 | nncf_module.bert.encoder.layer.7.attention.self.query.bias | [768] | 768 | 768 | 0 |
88
+ | 85 | nncf_module.bert.encoder.layer.7.attention.self.query.weight | [768, 768] | 589824 | 589824 | 0 |
89
+ | 86 | nncf_module.bert.encoder.layer.7.attention.self.key.bias | [768] | 768 | 768 | 0 |
90
+ | 87 | nncf_module.bert.encoder.layer.7.attention.self.key.weight | [768, 768] | 589824 | 589824 | 0 |
91
+ | 88 | nncf_module.bert.encoder.layer.7.attention.self.value.bias | [768] | 768 | 768 | 0 |
92
+ | 89 | nncf_module.bert.encoder.layer.7.attention.self.value.weight | [768, 768] | 589824 | 589824 | 0 |
93
+ | 90 | nncf_module.bert.encoder.layer.7.attention.output.dense.bias | [768] | 768 | 768 | 0 |
94
+ | 91 | nncf_module.bert.encoder.layer.7.attention.output.dense.weight | [768, 768] | 589824 | 589824 | 0 |
95
+ | 92 | nncf_module.bert.encoder.layer.7.intermediate.dense.bias | [3072] | 3072 | 2569 | 0.163737 |
96
+ | 93 | nncf_module.bert.encoder.layer.7.intermediate.dense.weight | [3072, 768] | 2359296 | 1972992 | 0.163737 |
97
+ | 94 | nncf_module.bert.encoder.layer.7.output.dense.bias | [768] | 768 | 768 | 0 |
98
+ | 95 | nncf_module.bert.encoder.layer.7.output.dense.weight | [768, 3072] | 2359296 | 1972992 | 0.163737 |
99
+ | 96 | nncf_module.bert.encoder.layer.8.attention.self.query.bias | [768] | 768 | 256 | 0.666667 |
100
+ | 97 | nncf_module.bert.encoder.layer.8.attention.self.query.weight | [768, 768] | 589824 | 196608 | 0.666667 |
101
+ | 98 | nncf_module.bert.encoder.layer.8.attention.self.key.bias | [768] | 768 | 256 | 0.666667 |
102
+ | 99 | nncf_module.bert.encoder.layer.8.attention.self.key.weight | [768, 768] | 589824 | 196608 | 0.666667 |
103
+ | 100 | nncf_module.bert.encoder.layer.8.attention.self.value.bias | [768] | 768 | 256 | 0.666667 |
104
+ | 101 | nncf_module.bert.encoder.layer.8.attention.self.value.weight | [768, 768] | 589824 | 196608 | 0.666667 |
105
+ | 102 | nncf_module.bert.encoder.layer.8.attention.output.dense.bias | [768] | 768 | 768 | 0 |
106
+ | 103 | nncf_module.bert.encoder.layer.8.attention.output.dense.weight | [768, 768] | 589824 | 196608 | 0.666667 |
107
+ | 104 | nncf_module.bert.encoder.layer.8.intermediate.dense.bias | [3072] | 3072 | 2094 | 0.318359 |
108
+ | 105 | nncf_module.bert.encoder.layer.8.intermediate.dense.weight | [3072, 768] | 2359296 | 1608192 | 0.318359 |
109
+ | 106 | nncf_module.bert.encoder.layer.8.output.dense.bias | [768] | 768 | 768 | 0 |
110
+ | 107 | nncf_module.bert.encoder.layer.8.output.dense.weight | [768, 3072] | 2359296 | 1608192 | 0.318359 |
111
+ | 108 | nncf_module.bert.encoder.layer.9.attention.self.query.bias | [768] | 768 | 768 | 0 |
112
+ | 109 | nncf_module.bert.encoder.layer.9.attention.self.query.weight | [768, 768] | 589824 | 589824 | 0 |
113
+ | 110 | nncf_module.bert.encoder.layer.9.attention.self.key.bias | [768] | 768 | 768 | 0 |
114
+ | 111 | nncf_module.bert.encoder.layer.9.attention.self.key.weight | [768, 768] | 589824 | 589824 | 0 |
115
+ | 112 | nncf_module.bert.encoder.layer.9.attention.self.value.bias | [768] | 768 | 768 | 0 |
116
+ | 113 | nncf_module.bert.encoder.layer.9.attention.self.value.weight | [768, 768] | 589824 | 589824 | 0 |
117
+ | 114 | nncf_module.bert.encoder.layer.9.attention.output.dense.bias | [768] | 768 | 768 | 0 |
118
+ | 115 | nncf_module.bert.encoder.layer.9.attention.output.dense.weight | [768, 768] | 589824 | 589824 | 0 |
119
+ | 116 | nncf_module.bert.encoder.layer.9.intermediate.dense.bias | [3072] | 3072 | 1009 | 0.671549 |
120
+ | 117 | nncf_module.bert.encoder.layer.9.intermediate.dense.weight | [3072, 768] | 2359296 | 774912 | 0.671549 |
121
+ | 118 | nncf_module.bert.encoder.layer.9.output.dense.bias | [768] | 768 | 768 | 0 |
122
+ | 119 | nncf_module.bert.encoder.layer.9.output.dense.weight | [768, 3072] | 2359296 | 774912 | 0.671549 |
123
+ | 120 | nncf_module.bert.encoder.layer.10.attention.self.query.bias | [768] | 768 | 320 | 0.583333 |
124
+ | 121 | nncf_module.bert.encoder.layer.10.attention.self.query.weight | [768, 768] | 589824 | 245760 | 0.583333 |
125
+ | 122 | nncf_module.bert.encoder.layer.10.attention.self.key.bias | [768] | 768 | 320 | 0.583333 |
126
+ | 123 | nncf_module.bert.encoder.layer.10.attention.self.key.weight | [768, 768] | 589824 | 245760 | 0.583333 |
127
+ | 124 | nncf_module.bert.encoder.layer.10.attention.self.value.bias | [768] | 768 | 320 | 0.583333 |
128
+ | 125 | nncf_module.bert.encoder.layer.10.attention.self.value.weight | [768, 768] | 589824 | 245760 | 0.583333 |
129
+ | 126 | nncf_module.bert.encoder.layer.10.attention.output.dense.bias | [768] | 768 | 768 | 0 |
130
+ | 127 | nncf_module.bert.encoder.layer.10.attention.output.dense.weight | [768, 768] | 589824 | 245760 | 0.583333 |
131
+ | 128 | nncf_module.bert.encoder.layer.10.intermediate.dense.bias | [3072] | 3072 | 743 | 0.758138 |
132
+ | 129 | nncf_module.bert.encoder.layer.10.intermediate.dense.weight | [3072, 768] | 2359296 | 570624 | 0.758138 |
133
+ | 130 | nncf_module.bert.encoder.layer.10.output.dense.bias | [768] | 768 | 768 | 0 |
134
+ | 131 | nncf_module.bert.encoder.layer.10.output.dense.weight | [768, 3072] | 2359296 | 570624 | 0.758138 |
135
+ | 132 | nncf_module.bert.encoder.layer.11.attention.self.query.bias | [768] | 768 | 192 | 0.75 |
136
+ | 133 | nncf_module.bert.encoder.layer.11.attention.self.query.weight | [768, 768] | 589824 | 147456 | 0.75 |
137
+ | 134 | nncf_module.bert.encoder.layer.11.attention.self.key.bias | [768] | 768 | 192 | 0.75 |
138
+ | 135 | nncf_module.bert.encoder.layer.11.attention.self.key.weight | [768, 768] | 589824 | 147456 | 0.75 |
139
+ | 136 | nncf_module.bert.encoder.layer.11.attention.self.value.bias | [768] | 768 | 192 | 0.75 |
140
+ | 137 | nncf_module.bert.encoder.layer.11.attention.self.value.weight | [768, 768] | 589824 | 147456 | 0.75 |
141
+ | 138 | nncf_module.bert.encoder.layer.11.attention.output.dense.bias | [768] | 768 | 768 | 0 |
142
+ | 139 | nncf_module.bert.encoder.layer.11.attention.output.dense.weight | [768, 768] | 589824 | 147456 | 0.75 |
143
+ | 140 | nncf_module.bert.encoder.layer.11.intermediate.dense.bias | [3072] | 3072 | 605 | 0.80306 |
144
+ | 141 | nncf_module.bert.encoder.layer.11.intermediate.dense.weight | [3072, 768] | 2359296 | 464640 | 0.80306 |
145
+ | 142 | nncf_module.bert.encoder.layer.11.output.dense.bias | [768] | 768 | 768 | 0 |
146
+ | 143 | nncf_module.bert.encoder.layer.11.output.dense.weight | [768, 3072] | 2359296 | 464640 | 0.80306 |
147
+ | 144 | nncf_module.qa_outputs.bias | [2] | 2 | 2 | 0 |
148
+ | 145 | nncf_module.qa_outputs.weight | [2, 768] | 1536 | 1536 | 0 |
checkpoint-35000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c69179b142350c0900215f2171560e60ce8071203a2a11acc717b62b70da947
3
+ size 872456613
checkpoint-35000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c65c8c3531e309f7ee0524ab8b7e4e35d401ee8b24f40c0fe91f08b4d377ef68
3
+ size 776435185
checkpoint-35000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53f9d1332dd8b0177c10a70c94f642ef694da36859eccb739ae846f1b2fc39d8
3
+ size 14503
checkpoint-35000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:883abecad344373d12706b2d3326e6572c4bb5d25ea9ce27fa2f2c6f3496b303
3
+ size 623
checkpoint-35000/special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
checkpoint-35000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-35000/tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-uncased", "tokenizer_class": "BertTokenizer"}
checkpoint-35000/torch_mask_structures.csv ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,pt_module_name,block_id,weight_shape,prune_w_shape,bias_shape,prune_b_shape,head_id_to_keep,nncf_graph_node
2
+ 0,nncf_module.bert.encoder.layer.0.attention.output.dense,0,"(768, 768)","(768, 192)","(768,)","(768,)","[3, 8, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
3
+ 1,nncf_module.bert.encoder.layer.0.attention.self.value,0,"(768, 768)","(192, 768)","(768,)","(192,)","[3, 8, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
4
+ 2,nncf_module.bert.encoder.layer.0.attention.self.key,0,"(768, 768)","(192, 768)","(768,)","(192,)","[3, 8, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
5
+ 3,nncf_module.bert.encoder.layer.0.attention.self.query,0,"(768, 768)","(192, 768)","(768,)","(192,)","[3, 8, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
6
+ 4,nncf_module.bert.encoder.layer.0.output.dense,1,"(768, 3072)","(768, 2940)","(768,)","(768,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertOutput[output]/NNCFLinear[dense]/linear_0
7
+ 5,nncf_module.bert.encoder.layer.0.intermediate.dense,1,"(3072, 768)","(2940, 768)","(3072,)","(2940,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
8
+ 6,nncf_module.bert.encoder.layer.1.attention.self.key,2,"(768, 768)","(256, 768)","(768,)","(256,)","[4, 7, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
9
+ 7,nncf_module.bert.encoder.layer.1.attention.self.query,2,"(768, 768)","(256, 768)","(768,)","(256,)","[4, 7, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
10
+ 8,nncf_module.bert.encoder.layer.1.attention.output.dense,2,"(768, 768)","(768, 256)","(768,)","(768,)","[4, 7, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
11
+ 9,nncf_module.bert.encoder.layer.1.attention.self.value,2,"(768, 768)","(256, 768)","(768,)","(256,)","[4, 7, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
12
+ 10,nncf_module.bert.encoder.layer.1.intermediate.dense,3,"(3072, 768)","(2923, 768)","(3072,)","(2923,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
13
+ 11,nncf_module.bert.encoder.layer.1.output.dense,3,"(768, 3072)","(768, 2923)","(768,)","(768,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertOutput[output]/NNCFLinear[dense]/linear_0
14
+ 12,nncf_module.bert.encoder.layer.2.attention.self.value,4,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
15
+ 13,nncf_module.bert.encoder.layer.2.attention.output.dense,4,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
16
+ 14,nncf_module.bert.encoder.layer.2.attention.self.key,4,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
17
+ 15,nncf_module.bert.encoder.layer.2.attention.self.query,4,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
18
+ 16,nncf_module.bert.encoder.layer.2.output.dense,5,"(768, 3072)","(768, 2980)","(768,)","(768,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertOutput[output]/NNCFLinear[dense]/linear_0
19
+ 17,nncf_module.bert.encoder.layer.2.intermediate.dense,5,"(3072, 768)","(2980, 768)","(3072,)","(2980,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
20
+ 18,nncf_module.bert.encoder.layer.3.attention.output.dense,6,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
21
+ 19,nncf_module.bert.encoder.layer.3.attention.self.value,6,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
22
+ 20,nncf_module.bert.encoder.layer.3.attention.self.key,6,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
23
+ 21,nncf_module.bert.encoder.layer.3.attention.self.query,6,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
24
+ 22,nncf_module.bert.encoder.layer.3.intermediate.dense,7,"(3072, 768)","(2957, 768)","(3072,)","(2957,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
25
+ 23,nncf_module.bert.encoder.layer.3.output.dense,7,"(768, 3072)","(768, 2957)","(768,)","(768,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertOutput[output]/NNCFLinear[dense]/linear_0
26
+ 24,nncf_module.bert.encoder.layer.4.attention.self.key,8,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
27
+ 25,nncf_module.bert.encoder.layer.4.attention.self.value,8,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
28
+ 26,nncf_module.bert.encoder.layer.4.attention.output.dense,8,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
29
+ 27,nncf_module.bert.encoder.layer.4.attention.self.query,8,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
30
+ 28,nncf_module.bert.encoder.layer.4.intermediate.dense,9,"(3072, 768)","(2906, 768)","(3072,)","(2906,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
31
+ 29,nncf_module.bert.encoder.layer.4.output.dense,9,"(768, 3072)","(768, 2906)","(768,)","(768,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertOutput[output]/NNCFLinear[dense]/linear_0
32
+ 30,nncf_module.bert.encoder.layer.5.attention.self.value,10,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
33
+ 31,nncf_module.bert.encoder.layer.5.attention.output.dense,10,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
34
+ 32,nncf_module.bert.encoder.layer.5.attention.self.query,10,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
35
+ 33,nncf_module.bert.encoder.layer.5.attention.self.key,10,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
36
+ 34,nncf_module.bert.encoder.layer.5.intermediate.dense,11,"(3072, 768)","(2865, 768)","(3072,)","(2865,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
37
+ 35,nncf_module.bert.encoder.layer.5.output.dense,11,"(768, 3072)","(768, 2865)","(768,)","(768,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertOutput[output]/NNCFLinear[dense]/linear_0
38
+ 36,nncf_module.bert.encoder.layer.6.attention.self.value,12,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
39
+ 37,nncf_module.bert.encoder.layer.6.attention.self.query,12,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
40
+ 38,nncf_module.bert.encoder.layer.6.attention.self.key,12,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
41
+ 39,nncf_module.bert.encoder.layer.6.attention.output.dense,12,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
42
+ 40,nncf_module.bert.encoder.layer.6.output.dense,13,"(768, 3072)","(768, 2759)","(768,)","(768,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertOutput[output]/NNCFLinear[dense]/linear_0
43
+ 41,nncf_module.bert.encoder.layer.6.intermediate.dense,13,"(3072, 768)","(2759, 768)","(3072,)","(2759,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
44
+ 42,nncf_module.bert.encoder.layer.7.attention.self.query,14,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
45
+ 43,nncf_module.bert.encoder.layer.7.attention.self.key,14,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
46
+ 44,nncf_module.bert.encoder.layer.7.attention.self.value,14,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
47
+ 45,nncf_module.bert.encoder.layer.7.attention.output.dense,14,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
48
+ 46,nncf_module.bert.encoder.layer.7.intermediate.dense,15,"(3072, 768)","(2569, 768)","(3072,)","(2569,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
49
+ 47,nncf_module.bert.encoder.layer.7.output.dense,15,"(768, 3072)","(768, 2569)","(768,)","(768,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertOutput[output]/NNCFLinear[dense]/linear_0
50
+ 48,nncf_module.bert.encoder.layer.8.attention.self.key,16,"(768, 768)","(256, 768)","(768,)","(256,)","[1, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
51
+ 49,nncf_module.bert.encoder.layer.8.attention.self.value,16,"(768, 768)","(256, 768)","(768,)","(256,)","[1, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
52
+ 50,nncf_module.bert.encoder.layer.8.attention.self.query,16,"(768, 768)","(256, 768)","(768,)","(256,)","[1, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
53
+ 51,nncf_module.bert.encoder.layer.8.attention.output.dense,16,"(768, 768)","(768, 256)","(768,)","(768,)","[1, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
54
+ 52,nncf_module.bert.encoder.layer.8.output.dense,17,"(768, 3072)","(768, 2094)","(768,)","(768,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertOutput[output]/NNCFLinear[dense]/linear_0
55
+ 53,nncf_module.bert.encoder.layer.8.intermediate.dense,17,"(3072, 768)","(2094, 768)","(3072,)","(2094,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
56
+ 54,nncf_module.bert.encoder.layer.9.attention.self.query,18,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
57
+ 55,nncf_module.bert.encoder.layer.9.attention.output.dense,18,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
58
+ 56,nncf_module.bert.encoder.layer.9.attention.self.value,18,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
59
+ 57,nncf_module.bert.encoder.layer.9.attention.self.key,18,"(768, 768)","(768, 768)","(768,)","(768,)","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
60
+ 58,nncf_module.bert.encoder.layer.9.intermediate.dense,19,"(3072, 768)","(1009, 768)","(3072,)","(1009,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
61
+ 59,nncf_module.bert.encoder.layer.9.output.dense,19,"(768, 3072)","(768, 1009)","(768,)","(768,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertOutput[output]/NNCFLinear[dense]/linear_0
62
+ 60,nncf_module.bert.encoder.layer.10.attention.self.key,20,"(768, 768)","(320, 768)","(768,)","(320,)","[0, 3, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
63
+ 61,nncf_module.bert.encoder.layer.10.attention.self.value,20,"(768, 768)","(320, 768)","(768,)","(320,)","[0, 3, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
64
+ 62,nncf_module.bert.encoder.layer.10.attention.self.query,20,"(768, 768)","(320, 768)","(768,)","(320,)","[0, 3, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
65
+ 63,nncf_module.bert.encoder.layer.10.attention.output.dense,20,"(768, 768)","(768, 320)","(768,)","(768,)","[0, 3, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
66
+ 64,nncf_module.bert.encoder.layer.10.output.dense,21,"(768, 3072)","(768, 743)","(768,)","(768,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertOutput[output]/NNCFLinear[dense]/linear_0
67
+ 65,nncf_module.bert.encoder.layer.10.intermediate.dense,21,"(3072, 768)","(743, 768)","(3072,)","(743,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
68
+ 66,nncf_module.bert.encoder.layer.11.attention.output.dense,22,"(768, 768)","(768, 192)","(768,)","(768,)","[1, 2, 9]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
69
+ 67,nncf_module.bert.encoder.layer.11.attention.self.key,22,"(768, 768)","(192, 768)","(768,)","(192,)","[1, 2, 9]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
70
+ 68,nncf_module.bert.encoder.layer.11.attention.self.value,22,"(768, 768)","(192, 768)","(768,)","(192,)","[1, 2, 9]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
71
+ 69,nncf_module.bert.encoder.layer.11.attention.self.query,22,"(768, 768)","(192, 768)","(768,)","(192,)","[1, 2, 9]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
72
+ 70,nncf_module.bert.encoder.layer.11.intermediate.dense,23,"(3072, 768)","(605, 768)","(3072,)","(605,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
73
+ 71,nncf_module.bert.encoder.layer.11.output.dense,23,"(768, 3072)","(768, 605)","(768,)","(768,)",skip reporting,BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertOutput[output]/NNCFLinear[dense]/linear_0
checkpoint-35000/torch_mask_structures.md ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ | | pt_module_name | block_id | weight_shape | prune_w_shape | bias_shape | prune_b_shape | head_id_to_keep | nncf_graph_node |
2
+ |---:|:---------------------------------------------------------|-----------:|:---------------|:----------------|:-------------|:----------------|:---------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
3
+ | 0 | nncf_module.bert.encoder.layer.0.attention.output.dense | 0 | (768, 768) | (768, 192) | (768,) | (768,) | [3, 8, 10] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0 |
4
+ | 1 | nncf_module.bert.encoder.layer.0.attention.self.value | 0 | (768, 768) | (192, 768) | (768,) | (192,) | [3, 8, 10] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0 |
5
+ | 2 | nncf_module.bert.encoder.layer.0.attention.self.key | 0 | (768, 768) | (192, 768) | (768,) | (192,) | [3, 8, 10] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0 |
6
+ | 3 | nncf_module.bert.encoder.layer.0.attention.self.query | 0 | (768, 768) | (192, 768) | (768,) | (192,) | [3, 8, 10] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0 |
7
+ | 4 | nncf_module.bert.encoder.layer.0.output.dense | 1 | (768, 3072) | (768, 2940) | (768,) | (768,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertOutput[output]/NNCFLinear[dense]/linear_0 |
8
+ | 5 | nncf_module.bert.encoder.layer.0.intermediate.dense | 1 | (3072, 768) | (2940, 768) | (3072,) | (2940,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0 |
9
+ | 6 | nncf_module.bert.encoder.layer.1.attention.self.key | 2 | (768, 768) | (256, 768) | (768,) | (256,) | [4, 7, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0 |
10
+ | 7 | nncf_module.bert.encoder.layer.1.attention.self.query | 2 | (768, 768) | (256, 768) | (768,) | (256,) | [4, 7, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0 |
11
+ | 8 | nncf_module.bert.encoder.layer.1.attention.output.dense | 2 | (768, 768) | (768, 256) | (768,) | (768,) | [4, 7, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0 |
12
+ | 9 | nncf_module.bert.encoder.layer.1.attention.self.value | 2 | (768, 768) | (256, 768) | (768,) | (256,) | [4, 7, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0 |
13
+ | 10 | nncf_module.bert.encoder.layer.1.intermediate.dense | 3 | (3072, 768) | (2923, 768) | (3072,) | (2923,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0 |
14
+ | 11 | nncf_module.bert.encoder.layer.1.output.dense | 3 | (768, 3072) | (768, 2923) | (768,) | (768,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertOutput[output]/NNCFLinear[dense]/linear_0 |
15
+ | 12 | nncf_module.bert.encoder.layer.2.attention.self.value | 4 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0 |
16
+ | 13 | nncf_module.bert.encoder.layer.2.attention.output.dense | 4 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0 |
17
+ | 14 | nncf_module.bert.encoder.layer.2.attention.self.key | 4 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0 |
18
+ | 15 | nncf_module.bert.encoder.layer.2.attention.self.query | 4 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0 |
19
+ | 16 | nncf_module.bert.encoder.layer.2.output.dense | 5 | (768, 3072) | (768, 2980) | (768,) | (768,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertOutput[output]/NNCFLinear[dense]/linear_0 |
20
+ | 17 | nncf_module.bert.encoder.layer.2.intermediate.dense | 5 | (3072, 768) | (2980, 768) | (3072,) | (2980,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0 |
21
+ | 18 | nncf_module.bert.encoder.layer.3.attention.output.dense | 6 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0 |
22
+ | 19 | nncf_module.bert.encoder.layer.3.attention.self.value | 6 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0 |
23
+ | 20 | nncf_module.bert.encoder.layer.3.attention.self.key | 6 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0 |
24
+ | 21 | nncf_module.bert.encoder.layer.3.attention.self.query | 6 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0 |
25
+ | 22 | nncf_module.bert.encoder.layer.3.intermediate.dense | 7 | (3072, 768) | (2957, 768) | (3072,) | (2957,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0 |
26
+ | 23 | nncf_module.bert.encoder.layer.3.output.dense | 7 | (768, 3072) | (768, 2957) | (768,) | (768,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertOutput[output]/NNCFLinear[dense]/linear_0 |
27
+ | 24 | nncf_module.bert.encoder.layer.4.attention.self.key | 8 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0 |
28
+ | 25 | nncf_module.bert.encoder.layer.4.attention.self.value | 8 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0 |
29
+ | 26 | nncf_module.bert.encoder.layer.4.attention.output.dense | 8 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0 |
30
+ | 27 | nncf_module.bert.encoder.layer.4.attention.self.query | 8 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0 |
31
+ | 28 | nncf_module.bert.encoder.layer.4.intermediate.dense | 9 | (3072, 768) | (2906, 768) | (3072,) | (2906,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0 |
32
+ | 29 | nncf_module.bert.encoder.layer.4.output.dense | 9 | (768, 3072) | (768, 2906) | (768,) | (768,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertOutput[output]/NNCFLinear[dense]/linear_0 |
33
+ | 30 | nncf_module.bert.encoder.layer.5.attention.self.value | 10 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0 |
34
+ | 31 | nncf_module.bert.encoder.layer.5.attention.output.dense | 10 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0 |
35
+ | 32 | nncf_module.bert.encoder.layer.5.attention.self.query | 10 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0 |
36
+ | 33 | nncf_module.bert.encoder.layer.5.attention.self.key | 10 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0 |
37
+ | 34 | nncf_module.bert.encoder.layer.5.intermediate.dense | 11 | (3072, 768) | (2865, 768) | (3072,) | (2865,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0 |
38
+ | 35 | nncf_module.bert.encoder.layer.5.output.dense | 11 | (768, 3072) | (768, 2865) | (768,) | (768,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertOutput[output]/NNCFLinear[dense]/linear_0 |
39
+ | 36 | nncf_module.bert.encoder.layer.6.attention.self.value | 12 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0 |
40
+ | 37 | nncf_module.bert.encoder.layer.6.attention.self.query | 12 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0 |
41
+ | 38 | nncf_module.bert.encoder.layer.6.attention.self.key | 12 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0 |
42
+ | 39 | nncf_module.bert.encoder.layer.6.attention.output.dense | 12 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0 |
43
+ | 40 | nncf_module.bert.encoder.layer.6.output.dense | 13 | (768, 3072) | (768, 2759) | (768,) | (768,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertOutput[output]/NNCFLinear[dense]/linear_0 |
44
+ | 41 | nncf_module.bert.encoder.layer.6.intermediate.dense | 13 | (3072, 768) | (2759, 768) | (3072,) | (2759,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0 |
45
+ | 42 | nncf_module.bert.encoder.layer.7.attention.self.query | 14 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0 |
46
+ | 43 | nncf_module.bert.encoder.layer.7.attention.self.key | 14 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0 |
47
+ | 44 | nncf_module.bert.encoder.layer.7.attention.self.value | 14 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0 |
48
+ | 45 | nncf_module.bert.encoder.layer.7.attention.output.dense | 14 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0 |
49
+ | 46 | nncf_module.bert.encoder.layer.7.intermediate.dense | 15 | (3072, 768) | (2569, 768) | (3072,) | (2569,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0 |
50
+ | 47 | nncf_module.bert.encoder.layer.7.output.dense | 15 | (768, 3072) | (768, 2569) | (768,) | (768,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertOutput[output]/NNCFLinear[dense]/linear_0 |
51
+ | 48 | nncf_module.bert.encoder.layer.8.attention.self.key | 16 | (768, 768) | (256, 768) | (768,) | (256,) | [1, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0 |
52
+ | 49 | nncf_module.bert.encoder.layer.8.attention.self.value | 16 | (768, 768) | (256, 768) | (768,) | (256,) | [1, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0 |
53
+ | 50 | nncf_module.bert.encoder.layer.8.attention.self.query | 16 | (768, 768) | (256, 768) | (768,) | (256,) | [1, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0 |
54
+ | 51 | nncf_module.bert.encoder.layer.8.attention.output.dense | 16 | (768, 768) | (768, 256) | (768,) | (768,) | [1, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0 |
55
+ | 52 | nncf_module.bert.encoder.layer.8.output.dense | 17 | (768, 3072) | (768, 2094) | (768,) | (768,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertOutput[output]/NNCFLinear[dense]/linear_0 |
56
+ | 53 | nncf_module.bert.encoder.layer.8.intermediate.dense | 17 | (3072, 768) | (2094, 768) | (3072,) | (2094,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0 |
57
+ | 54 | nncf_module.bert.encoder.layer.9.attention.self.query | 18 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0 |
58
+ | 55 | nncf_module.bert.encoder.layer.9.attention.output.dense | 18 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0 |
59
+ | 56 | nncf_module.bert.encoder.layer.9.attention.self.value | 18 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0 |
60
+ | 57 | nncf_module.bert.encoder.layer.9.attention.self.key | 18 | (768, 768) | (768, 768) | (768,) | (768,) | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0 |
61
+ | 58 | nncf_module.bert.encoder.layer.9.intermediate.dense | 19 | (3072, 768) | (1009, 768) | (3072,) | (1009,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0 |
62
+ | 59 | nncf_module.bert.encoder.layer.9.output.dense | 19 | (768, 3072) | (768, 1009) | (768,) | (768,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertOutput[output]/NNCFLinear[dense]/linear_0 |
63
+ | 60 | nncf_module.bert.encoder.layer.10.attention.self.key | 20 | (768, 768) | (320, 768) | (768,) | (320,) | [0, 3, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0 |
64
+ | 61 | nncf_module.bert.encoder.layer.10.attention.self.value | 20 | (768, 768) | (320, 768) | (768,) | (320,) | [0, 3, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0 |
65
+ | 62 | nncf_module.bert.encoder.layer.10.attention.self.query | 20 | (768, 768) | (320, 768) | (768,) | (320,) | [0, 3, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0 |
66
+ | 63 | nncf_module.bert.encoder.layer.10.attention.output.dense | 20 | (768, 768) | (768, 320) | (768,) | (768,) | [0, 3, 9, 10, 11] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0 |
67
+ | 64 | nncf_module.bert.encoder.layer.10.output.dense | 21 | (768, 3072) | (768, 743) | (768,) | (768,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertOutput[output]/NNCFLinear[dense]/linear_0 |
68
+ | 65 | nncf_module.bert.encoder.layer.10.intermediate.dense | 21 | (3072, 768) | (743, 768) | (3072,) | (743,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0 |
69
+ | 66 | nncf_module.bert.encoder.layer.11.attention.output.dense | 22 | (768, 768) | (768, 192) | (768,) | (768,) | [1, 2, 9] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0 |
70
+ | 67 | nncf_module.bert.encoder.layer.11.attention.self.key | 22 | (768, 768) | (192, 768) | (768,) | (192,) | [1, 2, 9] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0 |
71
+ | 68 | nncf_module.bert.encoder.layer.11.attention.self.value | 22 | (768, 768) | (192, 768) | (768,) | (192,) | [1, 2, 9] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0 |
72
+ | 69 | nncf_module.bert.encoder.layer.11.attention.self.query | 22 | (768, 768) | (192, 768) | (768,) | (192,) | [1, 2, 9] | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0 |
73
+ | 70 | nncf_module.bert.encoder.layer.11.intermediate.dense | 23 | (3072, 768) | (605, 768) | (3072,) | (605,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0 |
74
+ | 71 | nncf_module.bert.encoder.layer.11.output.dense | 23 | (768, 3072) | (768, 605) | (768,) | (768,) | skip reporting | BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertOutput[output]/NNCFLinear[dense]/linear_0 |
checkpoint-35000/trainer_state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:963740bdff2cf72416e92704c8f12f612711498e80bcb86943396aa8686d3525
3
+ size 18704532
checkpoint-35000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b71055db74256e103f6b235fc05792460b6f443079012f8566a63d151abf413
3
+ size 3183
checkpoint-35000/vocab.txt ADDED
The diff for this file is too large to render. See raw diff