Chua, Vui Seng committed on
Commit
0e66ada
1 Parent(s): 8be7a6a

Add collaterals

Browse files
.gitattributes CHANGED
@@ -25,3 +25,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ bert-base-squadv1-pruneofa-90pc-bt-qat-lt.onnx filter=lfs diff=lfs merge=lfs -text
29
+ pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
30
+ run.log filter=lfs diff=lfs merge=lfs -text
31
+ checkpoint-22000/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
32
+ checkpoint-22000/trainer_state.json filter=lfs diff=lfs merge=lfs -text
33
+ checkpoint-22000/optimizer.pt filter=lfs diff=lfs merge=lfs -text
34
+ eval_nbest_predictions.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ TODO: Update documentation
2
+
3
+ Quantized version of the following. Scores are correct.
4
+ This model is the result of transfer learning from [bert-base pruneofa 90% sparse](https://huggingface.co/Intel/bert-base-uncased-sparse-90-unstructured-pruneofa) on the SQuAD v1 dataset.
5
+
6
+ ```
7
+ eval_exact_match = 80.6623
8
+ eval_f1 = 87.7147
9
+ eval_samples = 10784
10
+ ```
11
+
12
+
13
+ # Train
14
+ Use https://github.com/IntelLabs/Model-Compression-Research-Package.git for training.
15
+ See `pruneofa-transfer-learning.sh`.
16
+
17
+ # Eval
18
+ ```bash
19
+ export CUDA_VISIBLE_DEVICES=0
20
+
21
+ OUTDIR=eval-bert-base-squadv1-pruneofa-90pc-bt
22
+ WORKDIR=transformers/examples/pytorch/question-answering
23
+ cd $WORKDIR
24
+
25
+ nohup python run_qa.py \
26
+ --model_name_or_path vuiseng9/bert-base-squadv1-pruneofa-90pc-bt \
27
+ --dataset_name squad \
28
+ --do_eval \
29
+ --per_device_eval_batch_size 128 \
30
+ --max_seq_length 384 \
31
+ --doc_stride 128 \
32
+ --overwrite_output_dir \
33
+ --output_dir $OUTDIR 2>&1 | tee $OUTDIR/run.log &
34
+ ```
all_results.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eval_exact_match": 80.66225165562913,
3
+ "eval_f1": 87.71465786559115,
4
+ "eval_samples": 10784
5
+ }
bert-base-squadv1-pruneofa-90pc-bt-qat-lt.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0a773401371f82cdca6deadae08b8bdba09cb22adf132be66f36623bd3ca171
3
+ size 435709198
checkpoint-22000/config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/data1/vchua/tld-poc-csr-dgx1-03/pruneofa-tl/run01-bert-squad-pruneofa-90pc-8eph/checkpoint-56750",
3
+ "architectures": [
4
+ "NNCFNetwork"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "gradient_checkpointing": false,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 3072,
13
+ "keys_to_ignore_at_inference": [
14
+ "prediction_logits"
15
+ ],
16
+ "layer_norm_eps": 1e-12,
17
+ "max_position_embeddings": 512,
18
+ "model_type": "bert",
19
+ "num_attention_heads": 12,
20
+ "num_hidden_layers": 12,
21
+ "pad_token_id": 0,
22
+ "position_embedding_type": "absolute",
23
+ "torch_dtype": "float32",
24
+ "transformers_version": "4.9.1",
25
+ "type_vocab_size": 2,
26
+ "use_cache": true,
27
+ "vocab_size": 30522
28
+ }
checkpoint-22000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48ff7a46101cabd43ae9ed7b8bfa7d93ecf70bbb48c97691fdeca8548ffe48c4
3
+ size 871399469
checkpoint-22000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8bb510838ff52d98d2722ee8a3e292245d968d9d4faba216c8ff9d15449f764
3
+ size 775914961
checkpoint-22000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0b25ac2a24f8e5fd3e555e063f2caf28dce25219456e0ce2866583c6b7ed5a5
3
+ size 14503
checkpoint-22000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b72605820a6ad348cf4cff05f1884662adde36d71e5332196bac4dbc8a5ceca
3
+ size 623
checkpoint-22000/special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
checkpoint-22000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-22000/tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "/data1/vchua/tld-poc-csr-dgx1-03/pruneofa-tl/run01-bert-squad-pruneofa-90pc-8eph/checkpoint-56750", "tokenizer_class": "BertTokenizer"}
checkpoint-22000/trainer_state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c329245e2bbdadf8c22c2db3fc773037bbcba796f55927a049b2955c6fcae605
3
+ size 11383779
checkpoint-22000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ccbaeeb7792e78206b2de0346d1d5f5781ab565e246db9298301a8e03f75aea
3
+ size 3247
checkpoint-22000/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
compressed_graph.dot ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/data1/vchua/tld-poc-csr-dgx1-03/pruneofa-tl/run01-bert-squad-pruneofa-90pc-8eph/checkpoint-56750",
3
+ "architectures": [
4
+ "NNCFNetwork"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "gradient_checkpointing": false,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 3072,
13
+ "keys_to_ignore_at_inference": [
14
+ "prediction_logits"
15
+ ],
16
+ "layer_norm_eps": 1e-12,
17
+ "max_position_embeddings": 512,
18
+ "model_type": "bert",
19
+ "num_attention_heads": 12,
20
+ "num_hidden_layers": 12,
21
+ "pad_token_id": 0,
22
+ "position_embedding_type": "absolute",
23
+ "torch_dtype": "float32",
24
+ "transformers_version": "4.9.1",
25
+ "type_vocab_size": 2,
26
+ "use_cache": true,
27
+ "vocab_size": 30522
28
+ }
eval_nbest_predictions.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a78edbfc9c377ebf147e4789821ca4d74a301a1eab22153aebc24df2eb3f923e
3
+ size 49044610
eval_predictions.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_results.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eval_exact_match": 80.66225165562913,
3
+ "eval_f1": 87.71465786559115,
4
+ "eval_samples": 10784
5
+ }
layer_wise_sparsity_global_rate_70.20.csv ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,layer_id,layer_type,param_type,shape,nparam,nnz,sparsity
2
+ 0,nncf_module.bert.embeddings.word_embeddings,NNCFEmbedding,weight,"[30522, 768]",23440896,23440896,0.0
3
+ 1,nncf_module.bert.embeddings.position_embeddings,NNCFEmbedding,weight,"[512, 768]",393216,393216,0.0
4
+ 2,nncf_module.bert.embeddings.token_type_embeddings,NNCFEmbedding,weight,"[2, 768]",1536,1536,0.0
5
+ 3,nncf_module.bert.embeddings.LayerNorm,LayerNorm,weight,[768],768,768,0.0
6
+ 4,nncf_module.bert.embeddings.LayerNorm,LayerNorm,bias,[768],768,768,0.0
7
+ 5,nncf_module.bert.encoder.layer.0.attention.self.query,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
8
+ 6,nncf_module.bert.encoder.layer.0.attention.self.query,NNCFLinear,bias,[768],768,768,0.0
9
+ 7,nncf_module.bert.encoder.layer.0.attention.self.key,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
10
+ 8,nncf_module.bert.encoder.layer.0.attention.self.key,NNCFLinear,bias,[768],768,768,0.0
11
+ 9,nncf_module.bert.encoder.layer.0.attention.self.value,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
12
+ 10,nncf_module.bert.encoder.layer.0.attention.self.value,NNCFLinear,bias,[768],768,768,0.0
13
+ 11,nncf_module.bert.encoder.layer.0.attention.output.dense,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
14
+ 12,nncf_module.bert.encoder.layer.0.attention.output.dense,NNCFLinear,bias,[768],768,768,0.0
15
+ 13,nncf_module.bert.encoder.layer.0.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
16
+ 14,nncf_module.bert.encoder.layer.0.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
17
+ 15,nncf_module.bert.encoder.layer.0.intermediate.dense,NNCFLinear,weight,"[3072, 768]",2359296,235930,0.8999998569488525
18
+ 16,nncf_module.bert.encoder.layer.0.intermediate.dense,NNCFLinear,bias,[3072],3072,3072,0.0
19
+ 17,nncf_module.bert.encoder.layer.0.output.dense,NNCFLinear,weight,"[768, 3072]",2359296,235930,0.8999998569488525
20
+ 18,nncf_module.bert.encoder.layer.0.output.dense,NNCFLinear,bias,[768],768,768,0.0
21
+ 19,nncf_module.bert.encoder.layer.0.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
22
+ 20,nncf_module.bert.encoder.layer.0.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
23
+ 21,nncf_module.bert.encoder.layer.1.attention.self.query,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
24
+ 22,nncf_module.bert.encoder.layer.1.attention.self.query,NNCFLinear,bias,[768],768,768,0.0
25
+ 23,nncf_module.bert.encoder.layer.1.attention.self.key,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
26
+ 24,nncf_module.bert.encoder.layer.1.attention.self.key,NNCFLinear,bias,[768],768,768,0.0
27
+ 25,nncf_module.bert.encoder.layer.1.attention.self.value,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
28
+ 26,nncf_module.bert.encoder.layer.1.attention.self.value,NNCFLinear,bias,[768],768,768,0.0
29
+ 27,nncf_module.bert.encoder.layer.1.attention.output.dense,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
30
+ 28,nncf_module.bert.encoder.layer.1.attention.output.dense,NNCFLinear,bias,[768],768,768,0.0
31
+ 29,nncf_module.bert.encoder.layer.1.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
32
+ 30,nncf_module.bert.encoder.layer.1.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
33
+ 31,nncf_module.bert.encoder.layer.1.intermediate.dense,NNCFLinear,weight,"[3072, 768]",2359296,235930,0.8999998569488525
34
+ 32,nncf_module.bert.encoder.layer.1.intermediate.dense,NNCFLinear,bias,[3072],3072,3072,0.0
35
+ 33,nncf_module.bert.encoder.layer.1.output.dense,NNCFLinear,weight,"[768, 3072]",2359296,235930,0.8999998569488525
36
+ 34,nncf_module.bert.encoder.layer.1.output.dense,NNCFLinear,bias,[768],768,768,0.0
37
+ 35,nncf_module.bert.encoder.layer.1.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
38
+ 36,nncf_module.bert.encoder.layer.1.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
39
+ 37,nncf_module.bert.encoder.layer.2.attention.self.query,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
40
+ 38,nncf_module.bert.encoder.layer.2.attention.self.query,NNCFLinear,bias,[768],768,768,0.0
41
+ 39,nncf_module.bert.encoder.layer.2.attention.self.key,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
42
+ 40,nncf_module.bert.encoder.layer.2.attention.self.key,NNCFLinear,bias,[768],768,768,0.0
43
+ 41,nncf_module.bert.encoder.layer.2.attention.self.value,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
44
+ 42,nncf_module.bert.encoder.layer.2.attention.self.value,NNCFLinear,bias,[768],768,768,0.0
45
+ 43,nncf_module.bert.encoder.layer.2.attention.output.dense,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
46
+ 44,nncf_module.bert.encoder.layer.2.attention.output.dense,NNCFLinear,bias,[768],768,768,0.0
47
+ 45,nncf_module.bert.encoder.layer.2.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
48
+ 46,nncf_module.bert.encoder.layer.2.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
49
+ 47,nncf_module.bert.encoder.layer.2.intermediate.dense,NNCFLinear,weight,"[3072, 768]",2359296,235930,0.8999998569488525
50
+ 48,nncf_module.bert.encoder.layer.2.intermediate.dense,NNCFLinear,bias,[3072],3072,3072,0.0
51
+ 49,nncf_module.bert.encoder.layer.2.output.dense,NNCFLinear,weight,"[768, 3072]",2359296,235930,0.8999998569488525
52
+ 50,nncf_module.bert.encoder.layer.2.output.dense,NNCFLinear,bias,[768],768,768,0.0
53
+ 51,nncf_module.bert.encoder.layer.2.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
54
+ 52,nncf_module.bert.encoder.layer.2.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
55
+ 53,nncf_module.bert.encoder.layer.3.attention.self.query,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
56
+ 54,nncf_module.bert.encoder.layer.3.attention.self.query,NNCFLinear,bias,[768],768,768,0.0
57
+ 55,nncf_module.bert.encoder.layer.3.attention.self.key,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
58
+ 56,nncf_module.bert.encoder.layer.3.attention.self.key,NNCFLinear,bias,[768],768,768,0.0
59
+ 57,nncf_module.bert.encoder.layer.3.attention.self.value,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
60
+ 58,nncf_module.bert.encoder.layer.3.attention.self.value,NNCFLinear,bias,[768],768,768,0.0
61
+ 59,nncf_module.bert.encoder.layer.3.attention.output.dense,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
62
+ 60,nncf_module.bert.encoder.layer.3.attention.output.dense,NNCFLinear,bias,[768],768,768,0.0
63
+ 61,nncf_module.bert.encoder.layer.3.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
64
+ 62,nncf_module.bert.encoder.layer.3.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
65
+ 63,nncf_module.bert.encoder.layer.3.intermediate.dense,NNCFLinear,weight,"[3072, 768]",2359296,235930,0.8999998569488525
66
+ 64,nncf_module.bert.encoder.layer.3.intermediate.dense,NNCFLinear,bias,[3072],3072,3072,0.0
67
+ 65,nncf_module.bert.encoder.layer.3.output.dense,NNCFLinear,weight,"[768, 3072]",2359296,235930,0.8999998569488525
68
+ 66,nncf_module.bert.encoder.layer.3.output.dense,NNCFLinear,bias,[768],768,768,0.0
69
+ 67,nncf_module.bert.encoder.layer.3.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
70
+ 68,nncf_module.bert.encoder.layer.3.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
71
+ 69,nncf_module.bert.encoder.layer.4.attention.self.query,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
72
+ 70,nncf_module.bert.encoder.layer.4.attention.self.query,NNCFLinear,bias,[768],768,768,0.0
73
+ 71,nncf_module.bert.encoder.layer.4.attention.self.key,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
74
+ 72,nncf_module.bert.encoder.layer.4.attention.self.key,NNCFLinear,bias,[768],768,768,0.0
75
+ 73,nncf_module.bert.encoder.layer.4.attention.self.value,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
76
+ 74,nncf_module.bert.encoder.layer.4.attention.self.value,NNCFLinear,bias,[768],768,768,0.0
77
+ 75,nncf_module.bert.encoder.layer.4.attention.output.dense,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
78
+ 76,nncf_module.bert.encoder.layer.4.attention.output.dense,NNCFLinear,bias,[768],768,768,0.0
79
+ 77,nncf_module.bert.encoder.layer.4.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
80
+ 78,nncf_module.bert.encoder.layer.4.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
81
+ 79,nncf_module.bert.encoder.layer.4.intermediate.dense,NNCFLinear,weight,"[3072, 768]",2359296,235930,0.8999998569488525
82
+ 80,nncf_module.bert.encoder.layer.4.intermediate.dense,NNCFLinear,bias,[3072],3072,3072,0.0
83
+ 81,nncf_module.bert.encoder.layer.4.output.dense,NNCFLinear,weight,"[768, 3072]",2359296,235930,0.8999998569488525
84
+ 82,nncf_module.bert.encoder.layer.4.output.dense,NNCFLinear,bias,[768],768,768,0.0
85
+ 83,nncf_module.bert.encoder.layer.4.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
86
+ 84,nncf_module.bert.encoder.layer.4.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
87
+ 85,nncf_module.bert.encoder.layer.5.attention.self.query,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
88
+ 86,nncf_module.bert.encoder.layer.5.attention.self.query,NNCFLinear,bias,[768],768,768,0.0
89
+ 87,nncf_module.bert.encoder.layer.5.attention.self.key,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
90
+ 88,nncf_module.bert.encoder.layer.5.attention.self.key,NNCFLinear,bias,[768],768,768,0.0
91
+ 89,nncf_module.bert.encoder.layer.5.attention.self.value,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
92
+ 90,nncf_module.bert.encoder.layer.5.attention.self.value,NNCFLinear,bias,[768],768,768,0.0
93
+ 91,nncf_module.bert.encoder.layer.5.attention.output.dense,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
94
+ 92,nncf_module.bert.encoder.layer.5.attention.output.dense,NNCFLinear,bias,[768],768,768,0.0
95
+ 93,nncf_module.bert.encoder.layer.5.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
96
+ 94,nncf_module.bert.encoder.layer.5.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
97
+ 95,nncf_module.bert.encoder.layer.5.intermediate.dense,NNCFLinear,weight,"[3072, 768]",2359296,235930,0.8999998569488525
98
+ 96,nncf_module.bert.encoder.layer.5.intermediate.dense,NNCFLinear,bias,[3072],3072,3072,0.0
99
+ 97,nncf_module.bert.encoder.layer.5.output.dense,NNCFLinear,weight,"[768, 3072]",2359296,235930,0.8999998569488525
100
+ 98,nncf_module.bert.encoder.layer.5.output.dense,NNCFLinear,bias,[768],768,768,0.0
101
+ 99,nncf_module.bert.encoder.layer.5.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
102
+ 100,nncf_module.bert.encoder.layer.5.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
103
+ 101,nncf_module.bert.encoder.layer.6.attention.self.query,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
104
+ 102,nncf_module.bert.encoder.layer.6.attention.self.query,NNCFLinear,bias,[768],768,768,0.0
105
+ 103,nncf_module.bert.encoder.layer.6.attention.self.key,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
106
+ 104,nncf_module.bert.encoder.layer.6.attention.self.key,NNCFLinear,bias,[768],768,768,0.0
107
+ 105,nncf_module.bert.encoder.layer.6.attention.self.value,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
108
+ 106,nncf_module.bert.encoder.layer.6.attention.self.value,NNCFLinear,bias,[768],768,768,0.0
109
+ 107,nncf_module.bert.encoder.layer.6.attention.output.dense,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
110
+ 108,nncf_module.bert.encoder.layer.6.attention.output.dense,NNCFLinear,bias,[768],768,768,0.0
111
+ 109,nncf_module.bert.encoder.layer.6.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
112
+ 110,nncf_module.bert.encoder.layer.6.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
113
+ 111,nncf_module.bert.encoder.layer.6.intermediate.dense,NNCFLinear,weight,"[3072, 768]",2359296,235930,0.8999998569488525
114
+ 112,nncf_module.bert.encoder.layer.6.intermediate.dense,NNCFLinear,bias,[3072],3072,3072,0.0
115
+ 113,nncf_module.bert.encoder.layer.6.output.dense,NNCFLinear,weight,"[768, 3072]",2359296,235930,0.8999998569488525
116
+ 114,nncf_module.bert.encoder.layer.6.output.dense,NNCFLinear,bias,[768],768,768,0.0
117
+ 115,nncf_module.bert.encoder.layer.6.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
118
+ 116,nncf_module.bert.encoder.layer.6.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
119
+ 117,nncf_module.bert.encoder.layer.7.attention.self.query,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
120
+ 118,nncf_module.bert.encoder.layer.7.attention.self.query,NNCFLinear,bias,[768],768,768,0.0
121
+ 119,nncf_module.bert.encoder.layer.7.attention.self.key,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
122
+ 120,nncf_module.bert.encoder.layer.7.attention.self.key,NNCFLinear,bias,[768],768,768,0.0
123
+ 121,nncf_module.bert.encoder.layer.7.attention.self.value,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
124
+ 122,nncf_module.bert.encoder.layer.7.attention.self.value,NNCFLinear,bias,[768],768,768,0.0
125
+ 123,nncf_module.bert.encoder.layer.7.attention.output.dense,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
126
+ 124,nncf_module.bert.encoder.layer.7.attention.output.dense,NNCFLinear,bias,[768],768,768,0.0
127
+ 125,nncf_module.bert.encoder.layer.7.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
128
+ 126,nncf_module.bert.encoder.layer.7.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
129
+ 127,nncf_module.bert.encoder.layer.7.intermediate.dense,NNCFLinear,weight,"[3072, 768]",2359296,235930,0.8999998569488525
130
+ 128,nncf_module.bert.encoder.layer.7.intermediate.dense,NNCFLinear,bias,[3072],3072,3072,0.0
131
+ 129,nncf_module.bert.encoder.layer.7.output.dense,NNCFLinear,weight,"[768, 3072]",2359296,235929,0.900000274181366
132
+ 130,nncf_module.bert.encoder.layer.7.output.dense,NNCFLinear,bias,[768],768,768,0.0
133
+ 131,nncf_module.bert.encoder.layer.7.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
134
+ 132,nncf_module.bert.encoder.layer.7.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
135
+ 133,nncf_module.bert.encoder.layer.8.attention.self.query,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
136
+ 134,nncf_module.bert.encoder.layer.8.attention.self.query,NNCFLinear,bias,[768],768,768,0.0
137
+ 135,nncf_module.bert.encoder.layer.8.attention.self.key,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
138
+ 136,nncf_module.bert.encoder.layer.8.attention.self.key,NNCFLinear,bias,[768],768,768,0.0
139
+ 137,nncf_module.bert.encoder.layer.8.attention.self.value,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
140
+ 138,nncf_module.bert.encoder.layer.8.attention.self.value,NNCFLinear,bias,[768],768,768,0.0
141
+ 139,nncf_module.bert.encoder.layer.8.attention.output.dense,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
142
+ 140,nncf_module.bert.encoder.layer.8.attention.output.dense,NNCFLinear,bias,[768],768,768,0.0
143
+ 141,nncf_module.bert.encoder.layer.8.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
144
+ 142,nncf_module.bert.encoder.layer.8.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
145
+ 143,nncf_module.bert.encoder.layer.8.intermediate.dense,NNCFLinear,weight,"[3072, 768]",2359296,235930,0.8999998569488525
146
+ 144,nncf_module.bert.encoder.layer.8.intermediate.dense,NNCFLinear,bias,[3072],3072,3072,0.0
147
+ 145,nncf_module.bert.encoder.layer.8.output.dense,NNCFLinear,weight,"[768, 3072]",2359296,235930,0.8999998569488525
148
+ 146,nncf_module.bert.encoder.layer.8.output.dense,NNCFLinear,bias,[768],768,768,0.0
149
+ 147,nncf_module.bert.encoder.layer.8.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
150
+ 148,nncf_module.bert.encoder.layer.8.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
151
+ 149,nncf_module.bert.encoder.layer.9.attention.self.query,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
152
+ 150,nncf_module.bert.encoder.layer.9.attention.self.query,NNCFLinear,bias,[768],768,768,0.0
153
+ 151,nncf_module.bert.encoder.layer.9.attention.self.key,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
154
+ 152,nncf_module.bert.encoder.layer.9.attention.self.key,NNCFLinear,bias,[768],768,768,0.0
155
+ 153,nncf_module.bert.encoder.layer.9.attention.self.value,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
156
+ 154,nncf_module.bert.encoder.layer.9.attention.self.value,NNCFLinear,bias,[768],768,768,0.0
157
+ 155,nncf_module.bert.encoder.layer.9.attention.output.dense,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
158
+ 156,nncf_module.bert.encoder.layer.9.attention.output.dense,NNCFLinear,bias,[768],768,768,0.0
159
+ 157,nncf_module.bert.encoder.layer.9.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
160
+ 158,nncf_module.bert.encoder.layer.9.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
161
+ 159,nncf_module.bert.encoder.layer.9.intermediate.dense,NNCFLinear,weight,"[3072, 768]",2359296,235930,0.8999998569488525
162
+ 160,nncf_module.bert.encoder.layer.9.intermediate.dense,NNCFLinear,bias,[3072],3072,3072,0.0
163
+ 161,nncf_module.bert.encoder.layer.9.output.dense,NNCFLinear,weight,"[768, 3072]",2359296,235930,0.8999998569488525
164
+ 162,nncf_module.bert.encoder.layer.9.output.dense,NNCFLinear,bias,[768],768,768,0.0
165
+ 163,nncf_module.bert.encoder.layer.9.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
166
+ 164,nncf_module.bert.encoder.layer.9.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
167
+ 165,nncf_module.bert.encoder.layer.10.attention.self.query,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
168
+ 166,nncf_module.bert.encoder.layer.10.attention.self.query,NNCFLinear,bias,[768],768,768,0.0
169
+ 167,nncf_module.bert.encoder.layer.10.attention.self.key,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
170
+ 168,nncf_module.bert.encoder.layer.10.attention.self.key,NNCFLinear,bias,[768],768,768,0.0
171
+ 169,nncf_module.bert.encoder.layer.10.attention.self.value,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
172
+ 170,nncf_module.bert.encoder.layer.10.attention.self.value,NNCFLinear,bias,[768],768,768,0.0
173
+ 171,nncf_module.bert.encoder.layer.10.attention.output.dense,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
174
+ 172,nncf_module.bert.encoder.layer.10.attention.output.dense,NNCFLinear,bias,[768],768,768,0.0
175
+ 173,nncf_module.bert.encoder.layer.10.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
176
+ 174,nncf_module.bert.encoder.layer.10.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
177
+ 175,nncf_module.bert.encoder.layer.10.intermediate.dense,NNCFLinear,weight,"[3072, 768]",2359296,235930,0.8999998569488525
178
+ 176,nncf_module.bert.encoder.layer.10.intermediate.dense,NNCFLinear,bias,[3072],3072,3072,0.0
179
+ 177,nncf_module.bert.encoder.layer.10.output.dense,NNCFLinear,weight,"[768, 3072]",2359296,235929,0.900000274181366
180
+ 178,nncf_module.bert.encoder.layer.10.output.dense,NNCFLinear,bias,[768],768,768,0.0
181
+ 179,nncf_module.bert.encoder.layer.10.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
182
+ 180,nncf_module.bert.encoder.layer.10.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
183
+ 181,nncf_module.bert.encoder.layer.11.attention.self.query,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
184
+ 182,nncf_module.bert.encoder.layer.11.attention.self.query,NNCFLinear,bias,[768],768,768,0.0
185
+ 183,nncf_module.bert.encoder.layer.11.attention.self.key,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
186
+ 184,nncf_module.bert.encoder.layer.11.attention.self.key,NNCFLinear,bias,[768],768,768,0.0
187
+ 185,nncf_module.bert.encoder.layer.11.attention.self.value,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
188
+ 186,nncf_module.bert.encoder.layer.11.attention.self.value,NNCFLinear,bias,[768],768,768,0.0
189
+ 187,nncf_module.bert.encoder.layer.11.attention.output.dense,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
190
+ 188,nncf_module.bert.encoder.layer.11.attention.output.dense,NNCFLinear,bias,[768],768,768,0.0
191
+ 189,nncf_module.bert.encoder.layer.11.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
192
+ 190,nncf_module.bert.encoder.layer.11.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
193
+ 191,nncf_module.bert.encoder.layer.11.intermediate.dense,NNCFLinear,weight,"[3072, 768]",2359296,235930,0.8999998569488525
194
+ 192,nncf_module.bert.encoder.layer.11.intermediate.dense,NNCFLinear,bias,[3072],3072,3072,0.0
195
+ 193,nncf_module.bert.encoder.layer.11.output.dense,NNCFLinear,weight,"[768, 3072]",2359296,235930,0.8999998569488525
196
+ 194,nncf_module.bert.encoder.layer.11.output.dense,NNCFLinear,bias,[768],768,768,0.0
197
+ 195,nncf_module.bert.encoder.layer.11.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
198
+ 196,nncf_module.bert.encoder.layer.11.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
199
+ 197,nncf_module.qa_outputs,NNCFLinear,weight,"[2, 768]",1536,1536,0.0
200
+ 198,nncf_module.qa_outputs,NNCFLinear,bias,[2],2,2,0.0
layer_wise_sparsity_global_rate_70.20.md ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ | | layer_id | layer_type | param_type | shape | nparam | nnz | sparsity |
2
+ |----:|:-------------------------------------------------------------|:--------------|:-------------|:-------------|---------:|---------:|-----------:|
3
+ | 0 | nncf_module.bert.embeddings.word_embeddings | NNCFEmbedding | weight | [30522, 768] | 23440896 | 23440896 | 0 |
4
+ | 1 | nncf_module.bert.embeddings.position_embeddings | NNCFEmbedding | weight | [512, 768] | 393216 | 393216 | 0 |
5
+ | 2 | nncf_module.bert.embeddings.token_type_embeddings | NNCFEmbedding | weight | [2, 768] | 1536 | 1536 | 0 |
6
+ | 3 | nncf_module.bert.embeddings.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
7
+ | 4 | nncf_module.bert.embeddings.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
8
+ | 5 | nncf_module.bert.encoder.layer.0.attention.self.query | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
9
+ | 6 | nncf_module.bert.encoder.layer.0.attention.self.query | NNCFLinear | bias | [768] | 768 | 768 | 0 |
10
+ | 7 | nncf_module.bert.encoder.layer.0.attention.self.key | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
11
+ | 8 | nncf_module.bert.encoder.layer.0.attention.self.key | NNCFLinear | bias | [768] | 768 | 768 | 0 |
12
+ | 9 | nncf_module.bert.encoder.layer.0.attention.self.value | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
13
+ | 10 | nncf_module.bert.encoder.layer.0.attention.self.value | NNCFLinear | bias | [768] | 768 | 768 | 0 |
14
+ | 11 | nncf_module.bert.encoder.layer.0.attention.output.dense | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
15
+ | 12 | nncf_module.bert.encoder.layer.0.attention.output.dense | NNCFLinear | bias | [768] | 768 | 768 | 0 |
16
+ | 13 | nncf_module.bert.encoder.layer.0.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
17
+ | 14 | nncf_module.bert.encoder.layer.0.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
18
+ | 15 | nncf_module.bert.encoder.layer.0.intermediate.dense | NNCFLinear | weight | [3072, 768] | 2359296 | 235930 | 0.9 |
19
+ | 16 | nncf_module.bert.encoder.layer.0.intermediate.dense | NNCFLinear | bias | [3072] | 3072 | 3072 | 0 |
20
+ | 17 | nncf_module.bert.encoder.layer.0.output.dense | NNCFLinear | weight | [768, 3072] | 2359296 | 235930 | 0.9 |
21
+ | 18 | nncf_module.bert.encoder.layer.0.output.dense | NNCFLinear | bias | [768] | 768 | 768 | 0 |
22
+ | 19 | nncf_module.bert.encoder.layer.0.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
23
+ | 20 | nncf_module.bert.encoder.layer.0.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
24
+ | 21 | nncf_module.bert.encoder.layer.1.attention.self.query | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
25
+ | 22 | nncf_module.bert.encoder.layer.1.attention.self.query | NNCFLinear | bias | [768] | 768 | 768 | 0 |
26
+ | 23 | nncf_module.bert.encoder.layer.1.attention.self.key | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
27
+ | 24 | nncf_module.bert.encoder.layer.1.attention.self.key | NNCFLinear | bias | [768] | 768 | 768 | 0 |
28
+ | 25 | nncf_module.bert.encoder.layer.1.attention.self.value | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
29
+ | 26 | nncf_module.bert.encoder.layer.1.attention.self.value | NNCFLinear | bias | [768] | 768 | 768 | 0 |
30
+ | 27 | nncf_module.bert.encoder.layer.1.attention.output.dense | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
31
+ | 28 | nncf_module.bert.encoder.layer.1.attention.output.dense | NNCFLinear | bias | [768] | 768 | 768 | 0 |
32
+ | 29 | nncf_module.bert.encoder.layer.1.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
33
+ | 30 | nncf_module.bert.encoder.layer.1.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
34
+ | 31 | nncf_module.bert.encoder.layer.1.intermediate.dense | NNCFLinear | weight | [3072, 768] | 2359296 | 235930 | 0.9 |
35
+ | 32 | nncf_module.bert.encoder.layer.1.intermediate.dense | NNCFLinear | bias | [3072] | 3072 | 3072 | 0 |
36
+ | 33 | nncf_module.bert.encoder.layer.1.output.dense | NNCFLinear | weight | [768, 3072] | 2359296 | 235930 | 0.9 |
37
+ | 34 | nncf_module.bert.encoder.layer.1.output.dense | NNCFLinear | bias | [768] | 768 | 768 | 0 |
38
+ | 35 | nncf_module.bert.encoder.layer.1.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
39
+ | 36 | nncf_module.bert.encoder.layer.1.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
40
+ | 37 | nncf_module.bert.encoder.layer.2.attention.self.query | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
41
+ | 38 | nncf_module.bert.encoder.layer.2.attention.self.query | NNCFLinear | bias | [768] | 768 | 768 | 0 |
42
+ | 39 | nncf_module.bert.encoder.layer.2.attention.self.key | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
43
+ | 40 | nncf_module.bert.encoder.layer.2.attention.self.key | NNCFLinear | bias | [768] | 768 | 768 | 0 |
44
+ | 41 | nncf_module.bert.encoder.layer.2.attention.self.value | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
45
+ | 42 | nncf_module.bert.encoder.layer.2.attention.self.value | NNCFLinear | bias | [768] | 768 | 768 | 0 |
46
+ | 43 | nncf_module.bert.encoder.layer.2.attention.output.dense | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
47
+ | 44 | nncf_module.bert.encoder.layer.2.attention.output.dense | NNCFLinear | bias | [768] | 768 | 768 | 0 |
48
+ | 45 | nncf_module.bert.encoder.layer.2.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
49
+ | 46 | nncf_module.bert.encoder.layer.2.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
50
+ | 47 | nncf_module.bert.encoder.layer.2.intermediate.dense | NNCFLinear | weight | [3072, 768] | 2359296 | 235930 | 0.9 |
51
+ | 48 | nncf_module.bert.encoder.layer.2.intermediate.dense | NNCFLinear | bias | [3072] | 3072 | 3072 | 0 |
52
+ | 49 | nncf_module.bert.encoder.layer.2.output.dense | NNCFLinear | weight | [768, 3072] | 2359296 | 235930 | 0.9 |
53
+ | 50 | nncf_module.bert.encoder.layer.2.output.dense | NNCFLinear | bias | [768] | 768 | 768 | 0 |
54
+ | 51 | nncf_module.bert.encoder.layer.2.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
55
+ | 52 | nncf_module.bert.encoder.layer.2.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
56
+ | 53 | nncf_module.bert.encoder.layer.3.attention.self.query | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
57
+ | 54 | nncf_module.bert.encoder.layer.3.attention.self.query | NNCFLinear | bias | [768] | 768 | 768 | 0 |
58
+ | 55 | nncf_module.bert.encoder.layer.3.attention.self.key | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
59
+ | 56 | nncf_module.bert.encoder.layer.3.attention.self.key | NNCFLinear | bias | [768] | 768 | 768 | 0 |
60
+ | 57 | nncf_module.bert.encoder.layer.3.attention.self.value | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
61
+ | 58 | nncf_module.bert.encoder.layer.3.attention.self.value | NNCFLinear | bias | [768] | 768 | 768 | 0 |
62
+ | 59 | nncf_module.bert.encoder.layer.3.attention.output.dense | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
63
+ | 60 | nncf_module.bert.encoder.layer.3.attention.output.dense | NNCFLinear | bias | [768] | 768 | 768 | 0 |
64
+ | 61 | nncf_module.bert.encoder.layer.3.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
65
+ | 62 | nncf_module.bert.encoder.layer.3.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
66
+ | 63 | nncf_module.bert.encoder.layer.3.intermediate.dense | NNCFLinear | weight | [3072, 768] | 2359296 | 235930 | 0.9 |
67
+ | 64 | nncf_module.bert.encoder.layer.3.intermediate.dense | NNCFLinear | bias | [3072] | 3072 | 3072 | 0 |
68
+ | 65 | nncf_module.bert.encoder.layer.3.output.dense | NNCFLinear | weight | [768, 3072] | 2359296 | 235930 | 0.9 |
69
+ | 66 | nncf_module.bert.encoder.layer.3.output.dense | NNCFLinear | bias | [768] | 768 | 768 | 0 |
70
+ | 67 | nncf_module.bert.encoder.layer.3.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
71
+ | 68 | nncf_module.bert.encoder.layer.3.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
72
+ | 69 | nncf_module.bert.encoder.layer.4.attention.self.query | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
73
+ | 70 | nncf_module.bert.encoder.layer.4.attention.self.query | NNCFLinear | bias | [768] | 768 | 768 | 0 |
74
+ | 71 | nncf_module.bert.encoder.layer.4.attention.self.key | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
75
+ | 72 | nncf_module.bert.encoder.layer.4.attention.self.key | NNCFLinear | bias | [768] | 768 | 768 | 0 |
76
+ | 73 | nncf_module.bert.encoder.layer.4.attention.self.value | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
77
+ | 74 | nncf_module.bert.encoder.layer.4.attention.self.value | NNCFLinear | bias | [768] | 768 | 768 | 0 |
78
+ | 75 | nncf_module.bert.encoder.layer.4.attention.output.dense | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
79
+ | 76 | nncf_module.bert.encoder.layer.4.attention.output.dense | NNCFLinear | bias | [768] | 768 | 768 | 0 |
80
+ | 77 | nncf_module.bert.encoder.layer.4.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
81
+ | 78 | nncf_module.bert.encoder.layer.4.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
82
+ | 79 | nncf_module.bert.encoder.layer.4.intermediate.dense | NNCFLinear | weight | [3072, 768] | 2359296 | 235930 | 0.9 |
83
+ | 80 | nncf_module.bert.encoder.layer.4.intermediate.dense | NNCFLinear | bias | [3072] | 3072 | 3072 | 0 |
84
+ | 81 | nncf_module.bert.encoder.layer.4.output.dense | NNCFLinear | weight | [768, 3072] | 2359296 | 235930 | 0.9 |
85
+ | 82 | nncf_module.bert.encoder.layer.4.output.dense | NNCFLinear | bias | [768] | 768 | 768 | 0 |
86
+ | 83 | nncf_module.bert.encoder.layer.4.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
87
+ | 84 | nncf_module.bert.encoder.layer.4.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
88
+ | 85 | nncf_module.bert.encoder.layer.5.attention.self.query | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
89
+ | 86 | nncf_module.bert.encoder.layer.5.attention.self.query | NNCFLinear | bias | [768] | 768 | 768 | 0 |
90
+ | 87 | nncf_module.bert.encoder.layer.5.attention.self.key | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
91
+ | 88 | nncf_module.bert.encoder.layer.5.attention.self.key | NNCFLinear | bias | [768] | 768 | 768 | 0 |
92
+ | 89 | nncf_module.bert.encoder.layer.5.attention.self.value | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
93
+ | 90 | nncf_module.bert.encoder.layer.5.attention.self.value | NNCFLinear | bias | [768] | 768 | 768 | 0 |
94
+ | 91 | nncf_module.bert.encoder.layer.5.attention.output.dense | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
95
+ | 92 | nncf_module.bert.encoder.layer.5.attention.output.dense | NNCFLinear | bias | [768] | 768 | 768 | 0 |
96
+ | 93 | nncf_module.bert.encoder.layer.5.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
97
+ | 94 | nncf_module.bert.encoder.layer.5.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
98
+ | 95 | nncf_module.bert.encoder.layer.5.intermediate.dense | NNCFLinear | weight | [3072, 768] | 2359296 | 235930 | 0.9 |
99
+ | 96 | nncf_module.bert.encoder.layer.5.intermediate.dense | NNCFLinear | bias | [3072] | 3072 | 3072 | 0 |
100
+ | 97 | nncf_module.bert.encoder.layer.5.output.dense | NNCFLinear | weight | [768, 3072] | 2359296 | 235930 | 0.9 |
101
+ | 98 | nncf_module.bert.encoder.layer.5.output.dense | NNCFLinear | bias | [768] | 768 | 768 | 0 |
102
+ | 99 | nncf_module.bert.encoder.layer.5.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
103
+ | 100 | nncf_module.bert.encoder.layer.5.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
104
+ | 101 | nncf_module.bert.encoder.layer.6.attention.self.query | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
105
+ | 102 | nncf_module.bert.encoder.layer.6.attention.self.query | NNCFLinear | bias | [768] | 768 | 768 | 0 |
106
+ | 103 | nncf_module.bert.encoder.layer.6.attention.self.key | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
107
+ | 104 | nncf_module.bert.encoder.layer.6.attention.self.key | NNCFLinear | bias | [768] | 768 | 768 | 0 |
108
+ | 105 | nncf_module.bert.encoder.layer.6.attention.self.value | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
109
+ | 106 | nncf_module.bert.encoder.layer.6.attention.self.value | NNCFLinear | bias | [768] | 768 | 768 | 0 |
110
+ | 107 | nncf_module.bert.encoder.layer.6.attention.output.dense | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
111
+ | 108 | nncf_module.bert.encoder.layer.6.attention.output.dense | NNCFLinear | bias | [768] | 768 | 768 | 0 |
112
+ | 109 | nncf_module.bert.encoder.layer.6.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
113
+ | 110 | nncf_module.bert.encoder.layer.6.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
114
+ | 111 | nncf_module.bert.encoder.layer.6.intermediate.dense | NNCFLinear | weight | [3072, 768] | 2359296 | 235930 | 0.9 |
115
+ | 112 | nncf_module.bert.encoder.layer.6.intermediate.dense | NNCFLinear | bias | [3072] | 3072 | 3072 | 0 |
116
+ | 113 | nncf_module.bert.encoder.layer.6.output.dense | NNCFLinear | weight | [768, 3072] | 2359296 | 235930 | 0.9 |
117
+ | 114 | nncf_module.bert.encoder.layer.6.output.dense | NNCFLinear | bias | [768] | 768 | 768 | 0 |
118
+ | 115 | nncf_module.bert.encoder.layer.6.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
119
+ | 116 | nncf_module.bert.encoder.layer.6.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
120
+ | 117 | nncf_module.bert.encoder.layer.7.attention.self.query | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
121
+ | 118 | nncf_module.bert.encoder.layer.7.attention.self.query | NNCFLinear | bias | [768] | 768 | 768 | 0 |
122
+ | 119 | nncf_module.bert.encoder.layer.7.attention.self.key | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
123
+ | 120 | nncf_module.bert.encoder.layer.7.attention.self.key | NNCFLinear | bias | [768] | 768 | 768 | 0 |
124
+ | 121 | nncf_module.bert.encoder.layer.7.attention.self.value | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
125
+ | 122 | nncf_module.bert.encoder.layer.7.attention.self.value | NNCFLinear | bias | [768] | 768 | 768 | 0 |
126
+ | 123 | nncf_module.bert.encoder.layer.7.attention.output.dense | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
127
+ | 124 | nncf_module.bert.encoder.layer.7.attention.output.dense | NNCFLinear | bias | [768] | 768 | 768 | 0 |
128
+ | 125 | nncf_module.bert.encoder.layer.7.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
129
+ | 126 | nncf_module.bert.encoder.layer.7.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
130
+ | 127 | nncf_module.bert.encoder.layer.7.intermediate.dense | NNCFLinear | weight | [3072, 768] | 2359296 | 235930 | 0.9 |
131
+ | 128 | nncf_module.bert.encoder.layer.7.intermediate.dense | NNCFLinear | bias | [3072] | 3072 | 3072 | 0 |
132
+ | 129 | nncf_module.bert.encoder.layer.7.output.dense | NNCFLinear | weight | [768, 3072] | 2359296 | 235929 | 0.9 |
133
+ | 130 | nncf_module.bert.encoder.layer.7.output.dense | NNCFLinear | bias | [768] | 768 | 768 | 0 |
134
+ | 131 | nncf_module.bert.encoder.layer.7.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
135
+ | 132 | nncf_module.bert.encoder.layer.7.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
136
+ | 133 | nncf_module.bert.encoder.layer.8.attention.self.query | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
137
+ | 134 | nncf_module.bert.encoder.layer.8.attention.self.query | NNCFLinear | bias | [768] | 768 | 768 | 0 |
138
+ | 135 | nncf_module.bert.encoder.layer.8.attention.self.key | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
139
+ | 136 | nncf_module.bert.encoder.layer.8.attention.self.key | NNCFLinear | bias | [768] | 768 | 768 | 0 |
140
+ | 137 | nncf_module.bert.encoder.layer.8.attention.self.value | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
141
+ | 138 | nncf_module.bert.encoder.layer.8.attention.self.value | NNCFLinear | bias | [768] | 768 | 768 | 0 |
142
+ | 139 | nncf_module.bert.encoder.layer.8.attention.output.dense | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
143
+ | 140 | nncf_module.bert.encoder.layer.8.attention.output.dense | NNCFLinear | bias | [768] | 768 | 768 | 0 |
144
+ | 141 | nncf_module.bert.encoder.layer.8.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
145
+ | 142 | nncf_module.bert.encoder.layer.8.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
146
+ | 143 | nncf_module.bert.encoder.layer.8.intermediate.dense | NNCFLinear | weight | [3072, 768] | 2359296 | 235930 | 0.9 |
147
+ | 144 | nncf_module.bert.encoder.layer.8.intermediate.dense | NNCFLinear | bias | [3072] | 3072 | 3072 | 0 |
148
+ | 145 | nncf_module.bert.encoder.layer.8.output.dense | NNCFLinear | weight | [768, 3072] | 2359296 | 235930 | 0.9 |
149
+ | 146 | nncf_module.bert.encoder.layer.8.output.dense | NNCFLinear | bias | [768] | 768 | 768 | 0 |
150
+ | 147 | nncf_module.bert.encoder.layer.8.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
151
+ | 148 | nncf_module.bert.encoder.layer.8.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
152
+ | 149 | nncf_module.bert.encoder.layer.9.attention.self.query | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
153
+ | 150 | nncf_module.bert.encoder.layer.9.attention.self.query | NNCFLinear | bias | [768] | 768 | 768 | 0 |
154
+ | 151 | nncf_module.bert.encoder.layer.9.attention.self.key | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
155
+ | 152 | nncf_module.bert.encoder.layer.9.attention.self.key | NNCFLinear | bias | [768] | 768 | 768 | 0 |
156
+ | 153 | nncf_module.bert.encoder.layer.9.attention.self.value | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
157
+ | 154 | nncf_module.bert.encoder.layer.9.attention.self.value | NNCFLinear | bias | [768] | 768 | 768 | 0 |
158
+ | 155 | nncf_module.bert.encoder.layer.9.attention.output.dense | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
159
+ | 156 | nncf_module.bert.encoder.layer.9.attention.output.dense | NNCFLinear | bias | [768] | 768 | 768 | 0 |
160
+ | 157 | nncf_module.bert.encoder.layer.9.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
161
+ | 158 | nncf_module.bert.encoder.layer.9.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
162
+ | 159 | nncf_module.bert.encoder.layer.9.intermediate.dense | NNCFLinear | weight | [3072, 768] | 2359296 | 235930 | 0.9 |
163
+ | 160 | nncf_module.bert.encoder.layer.9.intermediate.dense | NNCFLinear | bias | [3072] | 3072 | 3072 | 0 |
164
+ | 161 | nncf_module.bert.encoder.layer.9.output.dense | NNCFLinear | weight | [768, 3072] | 2359296 | 235930 | 0.9 |
165
+ | 162 | nncf_module.bert.encoder.layer.9.output.dense | NNCFLinear | bias | [768] | 768 | 768 | 0 |
166
+ | 163 | nncf_module.bert.encoder.layer.9.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
167
+ | 164 | nncf_module.bert.encoder.layer.9.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
168
+ | 165 | nncf_module.bert.encoder.layer.10.attention.self.query | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
169
+ | 166 | nncf_module.bert.encoder.layer.10.attention.self.query | NNCFLinear | bias | [768] | 768 | 768 | 0 |
170
+ | 167 | nncf_module.bert.encoder.layer.10.attention.self.key | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
171
+ | 168 | nncf_module.bert.encoder.layer.10.attention.self.key | NNCFLinear | bias | [768] | 768 | 768 | 0 |
172
+ | 169 | nncf_module.bert.encoder.layer.10.attention.self.value | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
173
+ | 170 | nncf_module.bert.encoder.layer.10.attention.self.value | NNCFLinear | bias | [768] | 768 | 768 | 0 |
174
+ | 171 | nncf_module.bert.encoder.layer.10.attention.output.dense | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
175
+ | 172 | nncf_module.bert.encoder.layer.10.attention.output.dense | NNCFLinear | bias | [768] | 768 | 768 | 0 |
176
+ | 173 | nncf_module.bert.encoder.layer.10.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
177
+ | 174 | nncf_module.bert.encoder.layer.10.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
178
+ | 175 | nncf_module.bert.encoder.layer.10.intermediate.dense | NNCFLinear | weight | [3072, 768] | 2359296 | 235930 | 0.9 |
179
+ | 176 | nncf_module.bert.encoder.layer.10.intermediate.dense | NNCFLinear | bias | [3072] | 3072 | 3072 | 0 |
180
+ | 177 | nncf_module.bert.encoder.layer.10.output.dense | NNCFLinear | weight | [768, 3072] | 2359296 | 235929 | 0.9 |
181
+ | 178 | nncf_module.bert.encoder.layer.10.output.dense | NNCFLinear | bias | [768] | 768 | 768 | 0 |
182
+ | 179 | nncf_module.bert.encoder.layer.10.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
183
+ | 180 | nncf_module.bert.encoder.layer.10.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
184
+ | 181 | nncf_module.bert.encoder.layer.11.attention.self.query | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
185
+ | 182 | nncf_module.bert.encoder.layer.11.attention.self.query | NNCFLinear | bias | [768] | 768 | 768 | 0 |
186
+ | 183 | nncf_module.bert.encoder.layer.11.attention.self.key | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
187
+ | 184 | nncf_module.bert.encoder.layer.11.attention.self.key | NNCFLinear | bias | [768] | 768 | 768 | 0 |
188
+ | 185 | nncf_module.bert.encoder.layer.11.attention.self.value | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
189
+ | 186 | nncf_module.bert.encoder.layer.11.attention.self.value | NNCFLinear | bias | [768] | 768 | 768 | 0 |
190
+ | 187 | nncf_module.bert.encoder.layer.11.attention.output.dense | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
191
+ | 188 | nncf_module.bert.encoder.layer.11.attention.output.dense | NNCFLinear | bias | [768] | 768 | 768 | 0 |
192
+ | 189 | nncf_module.bert.encoder.layer.11.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
193
+ | 190 | nncf_module.bert.encoder.layer.11.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
194
+ | 191 | nncf_module.bert.encoder.layer.11.intermediate.dense | NNCFLinear | weight | [3072, 768] | 2359296 | 235930 | 0.9 |
195
+ | 192 | nncf_module.bert.encoder.layer.11.intermediate.dense | NNCFLinear | bias | [3072] | 3072 | 3072 | 0 |
196
+ | 193 | nncf_module.bert.encoder.layer.11.output.dense | NNCFLinear | weight | [768, 3072] | 2359296 | 235930 | 0.9 |
197
+ | 194 | nncf_module.bert.encoder.layer.11.output.dense | NNCFLinear | bias | [768] | 768 | 768 | 0 |
198
+ | 195 | nncf_module.bert.encoder.layer.11.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
199
+ | 196 | nncf_module.bert.encoder.layer.11.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
200
+ | 197 | nncf_module.qa_outputs | NNCFLinear | weight | [2, 768] | 1536 | 1536 | 0 |
201
+ | 198 | nncf_module.qa_outputs | NNCFLinear | bias | [2] | 2 | 2 | 0 |
linear_layer_sparsity_85M_params_90.00_sparsity.csv ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,layer_id,layer_type,param_type,shape,nparam,nnz,sparsity
2
+ 5,nncf_module.bert.encoder.layer.0.attention.self.query,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
3
+ 7,nncf_module.bert.encoder.layer.0.attention.self.key,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
4
+ 9,nncf_module.bert.encoder.layer.0.attention.self.value,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
5
+ 11,nncf_module.bert.encoder.layer.0.attention.output.dense,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
6
+ 15,nncf_module.bert.encoder.layer.0.intermediate.dense,NNCFLinear,weight,"[3072, 768]",2359296,235930,0.8999998569488525
7
+ 17,nncf_module.bert.encoder.layer.0.output.dense,NNCFLinear,weight,"[768, 3072]",2359296,235930,0.8999998569488525
8
+ 21,nncf_module.bert.encoder.layer.1.attention.self.query,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
9
+ 23,nncf_module.bert.encoder.layer.1.attention.self.key,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
10
+ 25,nncf_module.bert.encoder.layer.1.attention.self.value,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
11
+ 27,nncf_module.bert.encoder.layer.1.attention.output.dense,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
12
+ 31,nncf_module.bert.encoder.layer.1.intermediate.dense,NNCFLinear,weight,"[3072, 768]",2359296,235930,0.8999998569488525
13
+ 33,nncf_module.bert.encoder.layer.1.output.dense,NNCFLinear,weight,"[768, 3072]",2359296,235930,0.8999998569488525
14
+ 37,nncf_module.bert.encoder.layer.2.attention.self.query,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
15
+ 39,nncf_module.bert.encoder.layer.2.attention.self.key,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
16
+ 41,nncf_module.bert.encoder.layer.2.attention.self.value,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
17
+ 43,nncf_module.bert.encoder.layer.2.attention.output.dense,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
18
+ 47,nncf_module.bert.encoder.layer.2.intermediate.dense,NNCFLinear,weight,"[3072, 768]",2359296,235930,0.8999998569488525
19
+ 49,nncf_module.bert.encoder.layer.2.output.dense,NNCFLinear,weight,"[768, 3072]",2359296,235930,0.8999998569488525
20
+ 53,nncf_module.bert.encoder.layer.3.attention.self.query,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
21
+ 55,nncf_module.bert.encoder.layer.3.attention.self.key,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
22
+ 57,nncf_module.bert.encoder.layer.3.attention.self.value,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
23
+ 59,nncf_module.bert.encoder.layer.3.attention.output.dense,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
24
+ 63,nncf_module.bert.encoder.layer.3.intermediate.dense,NNCFLinear,weight,"[3072, 768]",2359296,235930,0.8999998569488525
25
+ 65,nncf_module.bert.encoder.layer.3.output.dense,NNCFLinear,weight,"[768, 3072]",2359296,235930,0.8999998569488525
26
+ 69,nncf_module.bert.encoder.layer.4.attention.self.query,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
27
+ 71,nncf_module.bert.encoder.layer.4.attention.self.key,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
28
+ 73,nncf_module.bert.encoder.layer.4.attention.self.value,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
29
+ 75,nncf_module.bert.encoder.layer.4.attention.output.dense,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
30
+ 79,nncf_module.bert.encoder.layer.4.intermediate.dense,NNCFLinear,weight,"[3072, 768]",2359296,235930,0.8999998569488525
31
+ 81,nncf_module.bert.encoder.layer.4.output.dense,NNCFLinear,weight,"[768, 3072]",2359296,235930,0.8999998569488525
32
+ 85,nncf_module.bert.encoder.layer.5.attention.self.query,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
33
+ 87,nncf_module.bert.encoder.layer.5.attention.self.key,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
34
+ 89,nncf_module.bert.encoder.layer.5.attention.self.value,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
35
+ 91,nncf_module.bert.encoder.layer.5.attention.output.dense,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
36
+ 95,nncf_module.bert.encoder.layer.5.intermediate.dense,NNCFLinear,weight,"[3072, 768]",2359296,235930,0.8999998569488525
37
+ 97,nncf_module.bert.encoder.layer.5.output.dense,NNCFLinear,weight,"[768, 3072]",2359296,235930,0.8999998569488525
38
+ 101,nncf_module.bert.encoder.layer.6.attention.self.query,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
39
+ 103,nncf_module.bert.encoder.layer.6.attention.self.key,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
40
+ 105,nncf_module.bert.encoder.layer.6.attention.self.value,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
41
+ 107,nncf_module.bert.encoder.layer.6.attention.output.dense,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
42
+ 111,nncf_module.bert.encoder.layer.6.intermediate.dense,NNCFLinear,weight,"[3072, 768]",2359296,235930,0.8999998569488525
43
+ 113,nncf_module.bert.encoder.layer.6.output.dense,NNCFLinear,weight,"[768, 3072]",2359296,235930,0.8999998569488525
44
+ 117,nncf_module.bert.encoder.layer.7.attention.self.query,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
45
+ 119,nncf_module.bert.encoder.layer.7.attention.self.key,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
46
+ 121,nncf_module.bert.encoder.layer.7.attention.self.value,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
47
+ 123,nncf_module.bert.encoder.layer.7.attention.output.dense,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
48
+ 127,nncf_module.bert.encoder.layer.7.intermediate.dense,NNCFLinear,weight,"[3072, 768]",2359296,235930,0.8999998569488525
49
+ 129,nncf_module.bert.encoder.layer.7.output.dense,NNCFLinear,weight,"[768, 3072]",2359296,235929,0.900000274181366
50
+ 133,nncf_module.bert.encoder.layer.8.attention.self.query,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
51
+ 135,nncf_module.bert.encoder.layer.8.attention.self.key,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
52
+ 137,nncf_module.bert.encoder.layer.8.attention.self.value,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
53
+ 139,nncf_module.bert.encoder.layer.8.attention.output.dense,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
54
+ 143,nncf_module.bert.encoder.layer.8.intermediate.dense,NNCFLinear,weight,"[3072, 768]",2359296,235930,0.8999998569488525
55
+ 145,nncf_module.bert.encoder.layer.8.output.dense,NNCFLinear,weight,"[768, 3072]",2359296,235930,0.8999998569488525
56
+ 149,nncf_module.bert.encoder.layer.9.attention.self.query,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
57
+ 151,nncf_module.bert.encoder.layer.9.attention.self.key,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
58
+ 153,nncf_module.bert.encoder.layer.9.attention.self.value,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
59
+ 155,nncf_module.bert.encoder.layer.9.attention.output.dense,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
60
+ 159,nncf_module.bert.encoder.layer.9.intermediate.dense,NNCFLinear,weight,"[3072, 768]",2359296,235930,0.8999998569488525
61
+ 161,nncf_module.bert.encoder.layer.9.output.dense,NNCFLinear,weight,"[768, 3072]",2359296,235930,0.8999998569488525
62
+ 165,nncf_module.bert.encoder.layer.10.attention.self.query,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
63
+ 167,nncf_module.bert.encoder.layer.10.attention.self.key,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
64
+ 169,nncf_module.bert.encoder.layer.10.attention.self.value,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
65
+ 171,nncf_module.bert.encoder.layer.10.attention.output.dense,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
66
+ 175,nncf_module.bert.encoder.layer.10.intermediate.dense,NNCFLinear,weight,"[3072, 768]",2359296,235930,0.8999998569488525
67
+ 177,nncf_module.bert.encoder.layer.10.output.dense,NNCFLinear,weight,"[768, 3072]",2359296,235929,0.900000274181366
68
+ 181,nncf_module.bert.encoder.layer.11.attention.self.query,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
69
+ 183,nncf_module.bert.encoder.layer.11.attention.self.key,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
70
+ 185,nncf_module.bert.encoder.layer.11.attention.self.value,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
71
+ 187,nncf_module.bert.encoder.layer.11.attention.output.dense,NNCFLinear,weight,"[768, 768]",589824,58983,0.8999989628791809
72
+ 191,nncf_module.bert.encoder.layer.11.intermediate.dense,NNCFLinear,weight,"[3072, 768]",2359296,235930,0.8999998569488525
73
+ 193,nncf_module.bert.encoder.layer.11.output.dense,NNCFLinear,weight,"[768, 3072]",2359296,235930,0.8999998569488525
linear_layer_sparsity_85M_params_90.00_sparsity.md ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ | | layer_id | layer_type | param_type | shape | nparam | nnz | sparsity |
2
+ |----:|:---------------------------------------------------------|:-------------|:-------------|:------------|---------:|-------:|-----------:|
3
+ | 5 | nncf_module.bert.encoder.layer.0.attention.self.query | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
4
+ | 7 | nncf_module.bert.encoder.layer.0.attention.self.key | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
5
+ | 9 | nncf_module.bert.encoder.layer.0.attention.self.value | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
6
+ | 11 | nncf_module.bert.encoder.layer.0.attention.output.dense | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
7
+ | 15 | nncf_module.bert.encoder.layer.0.intermediate.dense | NNCFLinear | weight | [3072, 768] | 2359296 | 235930 | 0.9 |
8
+ | 17 | nncf_module.bert.encoder.layer.0.output.dense | NNCFLinear | weight | [768, 3072] | 2359296 | 235930 | 0.9 |
9
+ | 21 | nncf_module.bert.encoder.layer.1.attention.self.query | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
10
+ | 23 | nncf_module.bert.encoder.layer.1.attention.self.key | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
11
+ | 25 | nncf_module.bert.encoder.layer.1.attention.self.value | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
12
+ | 27 | nncf_module.bert.encoder.layer.1.attention.output.dense | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
13
+ | 31 | nncf_module.bert.encoder.layer.1.intermediate.dense | NNCFLinear | weight | [3072, 768] | 2359296 | 235930 | 0.9 |
14
+ | 33 | nncf_module.bert.encoder.layer.1.output.dense | NNCFLinear | weight | [768, 3072] | 2359296 | 235930 | 0.9 |
15
+ | 37 | nncf_module.bert.encoder.layer.2.attention.self.query | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
16
+ | 39 | nncf_module.bert.encoder.layer.2.attention.self.key | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
17
+ | 41 | nncf_module.bert.encoder.layer.2.attention.self.value | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
18
+ | 43 | nncf_module.bert.encoder.layer.2.attention.output.dense | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
19
+ | 47 | nncf_module.bert.encoder.layer.2.intermediate.dense | NNCFLinear | weight | [3072, 768] | 2359296 | 235930 | 0.9 |
20
+ | 49 | nncf_module.bert.encoder.layer.2.output.dense | NNCFLinear | weight | [768, 3072] | 2359296 | 235930 | 0.9 |
21
+ | 53 | nncf_module.bert.encoder.layer.3.attention.self.query | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
22
+ | 55 | nncf_module.bert.encoder.layer.3.attention.self.key | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
23
+ | 57 | nncf_module.bert.encoder.layer.3.attention.self.value | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
24
+ | 59 | nncf_module.bert.encoder.layer.3.attention.output.dense | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
25
+ | 63 | nncf_module.bert.encoder.layer.3.intermediate.dense | NNCFLinear | weight | [3072, 768] | 2359296 | 235930 | 0.9 |
26
+ | 65 | nncf_module.bert.encoder.layer.3.output.dense | NNCFLinear | weight | [768, 3072] | 2359296 | 235930 | 0.9 |
27
+ | 69 | nncf_module.bert.encoder.layer.4.attention.self.query | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
28
+ | 71 | nncf_module.bert.encoder.layer.4.attention.self.key | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
29
+ | 73 | nncf_module.bert.encoder.layer.4.attention.self.value | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
30
+ | 75 | nncf_module.bert.encoder.layer.4.attention.output.dense | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
31
+ | 79 | nncf_module.bert.encoder.layer.4.intermediate.dense | NNCFLinear | weight | [3072, 768] | 2359296 | 235930 | 0.9 |
32
+ | 81 | nncf_module.bert.encoder.layer.4.output.dense | NNCFLinear | weight | [768, 3072] | 2359296 | 235930 | 0.9 |
33
+ | 85 | nncf_module.bert.encoder.layer.5.attention.self.query | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
34
+ | 87 | nncf_module.bert.encoder.layer.5.attention.self.key | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
35
+ | 89 | nncf_module.bert.encoder.layer.5.attention.self.value | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
36
+ | 91 | nncf_module.bert.encoder.layer.5.attention.output.dense | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
37
+ | 95 | nncf_module.bert.encoder.layer.5.intermediate.dense | NNCFLinear | weight | [3072, 768] | 2359296 | 235930 | 0.9 |
38
+ | 97 | nncf_module.bert.encoder.layer.5.output.dense | NNCFLinear | weight | [768, 3072] | 2359296 | 235930 | 0.9 |
39
+ | 101 | nncf_module.bert.encoder.layer.6.attention.self.query | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
40
+ | 103 | nncf_module.bert.encoder.layer.6.attention.self.key | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
41
+ | 105 | nncf_module.bert.encoder.layer.6.attention.self.value | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
42
+ | 107 | nncf_module.bert.encoder.layer.6.attention.output.dense | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
43
+ | 111 | nncf_module.bert.encoder.layer.6.intermediate.dense | NNCFLinear | weight | [3072, 768] | 2359296 | 235930 | 0.9 |
44
+ | 113 | nncf_module.bert.encoder.layer.6.output.dense | NNCFLinear | weight | [768, 3072] | 2359296 | 235930 | 0.9 |
45
+ | 117 | nncf_module.bert.encoder.layer.7.attention.self.query | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
46
+ | 119 | nncf_module.bert.encoder.layer.7.attention.self.key | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
47
+ | 121 | nncf_module.bert.encoder.layer.7.attention.self.value | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
48
+ | 123 | nncf_module.bert.encoder.layer.7.attention.output.dense | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
49
+ | 127 | nncf_module.bert.encoder.layer.7.intermediate.dense | NNCFLinear | weight | [3072, 768] | 2359296 | 235930 | 0.9 |
50
+ | 129 | nncf_module.bert.encoder.layer.7.output.dense | NNCFLinear | weight | [768, 3072] | 2359296 | 235929 | 0.9 |
51
+ | 133 | nncf_module.bert.encoder.layer.8.attention.self.query | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
52
+ | 135 | nncf_module.bert.encoder.layer.8.attention.self.key | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
53
+ | 137 | nncf_module.bert.encoder.layer.8.attention.self.value | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
54
+ | 139 | nncf_module.bert.encoder.layer.8.attention.output.dense | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
55
+ | 143 | nncf_module.bert.encoder.layer.8.intermediate.dense | NNCFLinear | weight | [3072, 768] | 2359296 | 235930 | 0.9 |
56
+ | 145 | nncf_module.bert.encoder.layer.8.output.dense | NNCFLinear | weight | [768, 3072] | 2359296 | 235930 | 0.9 |
57
+ | 149 | nncf_module.bert.encoder.layer.9.attention.self.query | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
58
+ | 151 | nncf_module.bert.encoder.layer.9.attention.self.key | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
59
+ | 153 | nncf_module.bert.encoder.layer.9.attention.self.value | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
60
+ | 155 | nncf_module.bert.encoder.layer.9.attention.output.dense | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
61
+ | 159 | nncf_module.bert.encoder.layer.9.intermediate.dense | NNCFLinear | weight | [3072, 768] | 2359296 | 235930 | 0.9 |
62
+ | 161 | nncf_module.bert.encoder.layer.9.output.dense | NNCFLinear | weight | [768, 3072] | 2359296 | 235930 | 0.9 |
63
+ | 165 | nncf_module.bert.encoder.layer.10.attention.self.query | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
64
+ | 167 | nncf_module.bert.encoder.layer.10.attention.self.key | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
65
+ | 169 | nncf_module.bert.encoder.layer.10.attention.self.value | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
66
+ | 171 | nncf_module.bert.encoder.layer.10.attention.output.dense | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
67
+ | 175 | nncf_module.bert.encoder.layer.10.intermediate.dense | NNCFLinear | weight | [3072, 768] | 2359296 | 235930 | 0.9 |
68
+ | 177 | nncf_module.bert.encoder.layer.10.output.dense | NNCFLinear | weight | [768, 3072] | 2359296 | 235929 | 0.9 |
69
+ | 181 | nncf_module.bert.encoder.layer.11.attention.self.query | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
70
+ | 183 | nncf_module.bert.encoder.layer.11.attention.self.key | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
71
+ | 185 | nncf_module.bert.encoder.layer.11.attention.self.value | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
72
+ | 187 | nncf_module.bert.encoder.layer.11.attention.output.dense | NNCFLinear | weight | [768, 768] | 589824 | 58983 | 0.899999 |
73
+ | 191 | nncf_module.bert.encoder.layer.11.intermediate.dense | NNCFLinear | weight | [3072, 768] | 2359296 | 235930 | 0.9 |
74
+ | 193 | nncf_module.bert.encoder.layer.11.output.dense | NNCFLinear | weight | [768, 3072] | 2359296 | 235930 | 0.9 |
nncf_bert_squad_sparsity.json ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "input_info": [
3
+ {
4
+ "sample_size": [1, 384],
5
+ "type": "long"
6
+ },
7
+ {
8
+ "sample_size": [1, 384],
9
+ "type": "long"
10
+ },
11
+ {
12
+ "sample_size": [1, 384],
13
+ "type": "long"
14
+ }
15
+ ],
16
+ "compression":
17
+ [
18
+ {
19
+ "algorithm": "magnitude_sparsity",
20
+ "sparsity_init": 0.0,
21
+ "params": {
22
+ "schedule": "multistep",
23
+ "multistep_steps": [
24
+ 2,
25
+ 4,
26
+ 6,
27
+ 8
28
+ ],
29
+ "multistep_sparsity_levels": [
30
+ 0.0,
31
+ 0.0,
32
+ 0.0,
33
+ 0.0,
34
+ 0.0
35
+ ]
36
+ },
37
+ "ignored_scopes": ["{re}.*NNCFEmbedding", "{re}.*qa_outputs*"]
38
+ },
39
+ {
40
+ "algorithm": "quantization",
41
+ "initializer": {
42
+ "range": {
43
+ "num_init_samples": 32,
44
+ "type": "percentile",
45
+ "params":
46
+ {
47
+ "min_percentile": 0.01,
48
+ "max_percentile": 99.99
49
+ }
50
+ },
51
+
52
+ "batchnorm_adaptation": {
53
+ "num_bn_adaptation_samples": 200
54
+ }
55
+ },
56
+ "activations":
57
+ {
58
+ "mode": "symmetric"
59
+ },
60
+ "weights":
61
+ {
62
+ "mode": "symmetric",
63
+ "signed": true,
64
+ "per_channel": false
65
+ }
66
+ }
67
+ ]
68
+ }
onnx_sparsity.csv ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,layer_id,shape,nparam,nnz,sparsity
2
+ 0,Constant_15,"[30522, 768]",23440896,23440896,0.0
3
+ 1,Constant_23,"[2, 768]",1536,1536,0.0
4
+ 2,Constant_35,"[512, 768]",393216,393216,0.0
5
+ 3,Constant_61,"[768, 768]",589824,58983,0.8999989827473959
6
+ 4,Constant_71,"[768, 768]",589824,58983,0.8999989827473959
7
+ 5,Constant_91,"[768, 768]",589824,58983,0.8999989827473959
8
+ 6,Constant_150,"[768, 768]",589824,58983,0.8999989827473959
9
+ 7,Constant_178,"[3072, 768]",2359296,235930,0.8999998304578993
10
+ 8,Constant_196,"[768, 3072]",2359296,235930,0.8999998304578993
11
+ 9,Constant_224,"[768, 768]",589824,58983,0.8999989827473959
12
+ 10,Constant_234,"[768, 768]",589824,58983,0.8999989827473959
13
+ 11,Constant_254,"[768, 768]",589824,58983,0.8999989827473959
14
+ 12,Constant_313,"[768, 768]",589824,58983,0.8999989827473959
15
+ 13,Constant_341,"[3072, 768]",2359296,235930,0.8999998304578993
16
+ 14,Constant_359,"[768, 3072]",2359296,235930,0.8999998304578993
17
+ 15,Constant_387,"[768, 768]",589824,58983,0.8999989827473959
18
+ 16,Constant_397,"[768, 768]",589824,58983,0.8999989827473959
19
+ 17,Constant_417,"[768, 768]",589824,58983,0.8999989827473959
20
+ 18,Constant_476,"[768, 768]",589824,58983,0.8999989827473959
21
+ 19,Constant_504,"[3072, 768]",2359296,235930,0.8999998304578993
22
+ 20,Constant_522,"[768, 3072]",2359296,235930,0.8999998304578993
23
+ 21,Constant_550,"[768, 768]",589824,58983,0.8999989827473959
24
+ 22,Constant_560,"[768, 768]",589824,58983,0.8999989827473959
25
+ 23,Constant_580,"[768, 768]",589824,58983,0.8999989827473959
26
+ 24,Constant_639,"[768, 768]",589824,58983,0.8999989827473959
27
+ 25,Constant_667,"[3072, 768]",2359296,235930,0.8999998304578993
28
+ 26,Constant_685,"[768, 3072]",2359296,235930,0.8999998304578993
29
+ 27,Constant_713,"[768, 768]",589824,58983,0.8999989827473959
30
+ 28,Constant_723,"[768, 768]",589824,58983,0.8999989827473959
31
+ 29,Constant_743,"[768, 768]",589824,58983,0.8999989827473959
32
+ 30,Constant_802,"[768, 768]",589824,58983,0.8999989827473959
33
+ 31,Constant_830,"[3072, 768]",2359296,235930,0.8999998304578993
34
+ 32,Constant_848,"[768, 3072]",2359296,235930,0.8999998304578993
35
+ 33,Constant_876,"[768, 768]",589824,58983,0.8999989827473959
36
+ 34,Constant_886,"[768, 768]",589824,58983,0.8999989827473959
37
+ 35,Constant_906,"[768, 768]",589824,58983,0.8999989827473959
38
+ 36,Constant_965,"[768, 768]",589824,58983,0.8999989827473959
39
+ 37,Constant_993,"[3072, 768]",2359296,235930,0.8999998304578993
40
+ 38,Constant_1011,"[768, 3072]",2359296,235930,0.8999998304578993
41
+ 39,Constant_1039,"[768, 768]",589824,58983,0.8999989827473959
42
+ 40,Constant_1049,"[768, 768]",589824,58983,0.8999989827473959
43
+ 41,Constant_1069,"[768, 768]",589824,58983,0.8999989827473959
44
+ 42,Constant_1128,"[768, 768]",589824,58983,0.8999989827473959
45
+ 43,Constant_1156,"[3072, 768]",2359296,235930,0.8999998304578993
46
+ 44,Constant_1174,"[768, 3072]",2359296,235930,0.8999998304578993
47
+ 45,Constant_1202,"[768, 768]",589824,58983,0.8999989827473959
48
+ 46,Constant_1212,"[768, 768]",589824,58983,0.8999989827473959
49
+ 47,Constant_1232,"[768, 768]",589824,58983,0.8999989827473959
50
+ 48,Constant_1291,"[768, 768]",589824,58983,0.8999989827473959
51
+ 49,Constant_1319,"[3072, 768]",2359296,235930,0.8999998304578993
52
+ 50,Constant_1337,"[768, 3072]",2359296,235929,0.900000254313151
53
+ 51,Constant_1365,"[768, 768]",589824,58983,0.8999989827473959
54
+ 52,Constant_1375,"[768, 768]",589824,58983,0.8999989827473959
55
+ 53,Constant_1395,"[768, 768]",589824,58983,0.8999989827473959
56
+ 54,Constant_1454,"[768, 768]",589824,58983,0.8999989827473959
57
+ 55,Constant_1482,"[3072, 768]",2359296,235930,0.8999998304578993
58
+ 56,Constant_1500,"[768, 3072]",2359296,235930,0.8999998304578993
59
+ 57,Constant_1528,"[768, 768]",589824,58983,0.8999989827473959
60
+ 58,Constant_1538,"[768, 768]",589824,58983,0.8999989827473959
61
+ 59,Constant_1558,"[768, 768]",589824,58983,0.8999989827473959
62
+ 60,Constant_1617,"[768, 768]",589824,58983,0.8999989827473959
63
+ 61,Constant_1645,"[3072, 768]",2359296,235930,0.8999998304578993
64
+ 62,Constant_1663,"[768, 3072]",2359296,235930,0.8999998304578993
65
+ 63,Constant_1691,"[768, 768]",589824,58983,0.8999989827473959
66
+ 64,Constant_1701,"[768, 768]",589824,58983,0.8999989827473959
67
+ 65,Constant_1721,"[768, 768]",589824,58983,0.8999989827473959
68
+ 66,Constant_1780,"[768, 768]",589824,58983,0.8999989827473959
69
+ 67,Constant_1808,"[3072, 768]",2359296,235930,0.8999998304578993
70
+ 68,Constant_1826,"[768, 3072]",2359296,235929,0.900000254313151
71
+ 69,Constant_1854,"[768, 768]",589824,58983,0.8999989827473959
72
+ 70,Constant_1864,"[768, 768]",589824,58983,0.8999989827473959
73
+ 71,Constant_1884,"[768, 768]",589824,58983,0.8999989827473959
74
+ 72,Constant_1943,"[768, 768]",589824,58983,0.8999989827473959
75
+ 73,Constant_1971,"[3072, 768]",2359296,235930,0.8999998304578993
76
+ 74,Constant_1989,"[768, 3072]",2359296,235930,0.8999998304578993
77
+ 75,Constant_2017,"[2, 768]",1536,1536,0.0
onnx_sparsity.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ | | layer_id | shape | nparam | nnz | sparsity |
2
+ |---:|:--------------|:-------------|---------:|---------:|-----------:|
3
+ | 0 | Constant_15 | [30522, 768] | 23440896 | 23440896 | 0 |
4
+ | 1 | Constant_23 | [2, 768] | 1536 | 1536 | 0 |
5
+ | 2 | Constant_35 | [512, 768] | 393216 | 393216 | 0 |
6
+ | 3 | Constant_61 | [768, 768] | 589824 | 58983 | 0.899999 |
7
+ | 4 | Constant_71 | [768, 768] | 589824 | 58983 | 0.899999 |
8
+ | 5 | Constant_91 | [768, 768] | 589824 | 58983 | 0.899999 |
9
+ | 6 | Constant_150 | [768, 768] | 589824 | 58983 | 0.899999 |
10
+ | 7 | Constant_178 | [3072, 768] | 2359296 | 235930 | 0.9 |
11
+ | 8 | Constant_196 | [768, 3072] | 2359296 | 235930 | 0.9 |
12
+ | 9 | Constant_224 | [768, 768] | 589824 | 58983 | 0.899999 |
13
+ | 10 | Constant_234 | [768, 768] | 589824 | 58983 | 0.899999 |
14
+ | 11 | Constant_254 | [768, 768] | 589824 | 58983 | 0.899999 |
15
+ | 12 | Constant_313 | [768, 768] | 589824 | 58983 | 0.899999 |
16
+ | 13 | Constant_341 | [3072, 768] | 2359296 | 235930 | 0.9 |
17
+ | 14 | Constant_359 | [768, 3072] | 2359296 | 235930 | 0.9 |
18
+ | 15 | Constant_387 | [768, 768] | 589824 | 58983 | 0.899999 |
19
+ | 16 | Constant_397 | [768, 768] | 589824 | 58983 | 0.899999 |
20
+ | 17 | Constant_417 | [768, 768] | 589824 | 58983 | 0.899999 |
21
+ | 18 | Constant_476 | [768, 768] | 589824 | 58983 | 0.899999 |
22
+ | 19 | Constant_504 | [3072, 768] | 2359296 | 235930 | 0.9 |
23
+ | 20 | Constant_522 | [768, 3072] | 2359296 | 235930 | 0.9 |
24
+ | 21 | Constant_550 | [768, 768] | 589824 | 58983 | 0.899999 |
25
+ | 22 | Constant_560 | [768, 768] | 589824 | 58983 | 0.899999 |
26
+ | 23 | Constant_580 | [768, 768] | 589824 | 58983 | 0.899999 |
27
+ | 24 | Constant_639 | [768, 768] | 589824 | 58983 | 0.899999 |
28
+ | 25 | Constant_667 | [3072, 768] | 2359296 | 235930 | 0.9 |
29
+ | 26 | Constant_685 | [768, 3072] | 2359296 | 235930 | 0.9 |
30
+ | 27 | Constant_713 | [768, 768] | 589824 | 58983 | 0.899999 |
31
+ | 28 | Constant_723 | [768, 768] | 589824 | 58983 | 0.899999 |
32
+ | 29 | Constant_743 | [768, 768] | 589824 | 58983 | 0.899999 |
33
+ | 30 | Constant_802 | [768, 768] | 589824 | 58983 | 0.899999 |
34
+ | 31 | Constant_830 | [3072, 768] | 2359296 | 235930 | 0.9 |
35
+ | 32 | Constant_848 | [768, 3072] | 2359296 | 235930 | 0.9 |
36
+ | 33 | Constant_876 | [768, 768] | 589824 | 58983 | 0.899999 |
37
+ | 34 | Constant_886 | [768, 768] | 589824 | 58983 | 0.899999 |
38
+ | 35 | Constant_906 | [768, 768] | 589824 | 58983 | 0.899999 |
39
+ | 36 | Constant_965 | [768, 768] | 589824 | 58983 | 0.899999 |
40
+ | 37 | Constant_993 | [3072, 768] | 2359296 | 235930 | 0.9 |
41
+ | 38 | Constant_1011 | [768, 3072] | 2359296 | 235930 | 0.9 |
42
+ | 39 | Constant_1039 | [768, 768] | 589824 | 58983 | 0.899999 |
43
+ | 40 | Constant_1049 | [768, 768] | 589824 | 58983 | 0.899999 |
44
+ | 41 | Constant_1069 | [768, 768] | 589824 | 58983 | 0.899999 |
45
+ | 42 | Constant_1128 | [768, 768] | 589824 | 58983 | 0.899999 |
46
+ | 43 | Constant_1156 | [3072, 768] | 2359296 | 235930 | 0.9 |
47
+ | 44 | Constant_1174 | [768, 3072] | 2359296 | 235930 | 0.9 |
48
+ | 45 | Constant_1202 | [768, 768] | 589824 | 58983 | 0.899999 |
49
+ | 46 | Constant_1212 | [768, 768] | 589824 | 58983 | 0.899999 |
50
+ | 47 | Constant_1232 | [768, 768] | 589824 | 58983 | 0.899999 |
51
+ | 48 | Constant_1291 | [768, 768] | 589824 | 58983 | 0.899999 |
52
+ | 49 | Constant_1319 | [3072, 768] | 2359296 | 235930 | 0.9 |
53
+ | 50 | Constant_1337 | [768, 3072] | 2359296 | 235929 | 0.9 |
54
+ | 51 | Constant_1365 | [768, 768] | 589824 | 58983 | 0.899999 |
55
+ | 52 | Constant_1375 | [768, 768] | 589824 | 58983 | 0.899999 |
56
+ | 53 | Constant_1395 | [768, 768] | 589824 | 58983 | 0.899999 |
57
+ | 54 | Constant_1454 | [768, 768] | 589824 | 58983 | 0.899999 |
58
+ | 55 | Constant_1482 | [3072, 768] | 2359296 | 235930 | 0.9 |
59
+ | 56 | Constant_1500 | [768, 3072] | 2359296 | 235930 | 0.9 |
60
+ | 57 | Constant_1528 | [768, 768] | 589824 | 58983 | 0.899999 |
61
+ | 58 | Constant_1538 | [768, 768] | 589824 | 58983 | 0.899999 |
62
+ | 59 | Constant_1558 | [768, 768] | 589824 | 58983 | 0.899999 |
63
+ | 60 | Constant_1617 | [768, 768] | 589824 | 58983 | 0.899999 |
64
+ | 61 | Constant_1645 | [3072, 768] | 2359296 | 235930 | 0.9 |
65
+ | 62 | Constant_1663 | [768, 3072] | 2359296 | 235930 | 0.9 |
66
+ | 63 | Constant_1691 | [768, 768] | 589824 | 58983 | 0.899999 |
67
+ | 64 | Constant_1701 | [768, 768] | 589824 | 58983 | 0.899999 |
68
+ | 65 | Constant_1721 | [768, 768] | 589824 | 58983 | 0.899999 |
69
+ | 66 | Constant_1780 | [768, 768] | 589824 | 58983 | 0.899999 |
70
+ | 67 | Constant_1808 | [3072, 768] | 2359296 | 235930 | 0.9 |
71
+ | 68 | Constant_1826 | [768, 3072] | 2359296 | 235929 | 0.9 |
72
+ | 69 | Constant_1854 | [768, 768] | 589824 | 58983 | 0.899999 |
73
+ | 70 | Constant_1864 | [768, 768] | 589824 | 58983 | 0.899999 |
74
+ | 71 | Constant_1884 | [768, 768] | 589824 | 58983 | 0.899999 |
75
+ | 72 | Constant_1943 | [768, 768] | 589824 | 58983 | 0.899999 |
76
+ | 73 | Constant_1971 | [3072, 768] | 2359296 | 235930 | 0.9 |
77
+ | 74 | Constant_1989 | [768, 3072] | 2359296 | 235930 | 0.9 |
78
+ | 75 | Constant_2017 | [2, 768] | 1536 | 1536 | 0 |
original_graph.dot ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d77007b5b714b193370f40519de60c77cdcf9e51c7b62a6ed7ccca97fadca06
3
+ size 775914961
run.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98557b06e888397b54a5a54907bf8297710f574e35a6f627bf2a0d40b4ad33b7
3
+ size 2125651135
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "/data1/vchua/tld-poc-csr-dgx1-03/pruneofa-tl/run01-bert-squad-pruneofa-90pc-8eph/checkpoint-56750", "tokenizer_class": "BertTokenizer"}
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.0,
3
+ "train_loss": 1.526875147819519,
4
+ "train_runtime": 69.4951,
5
+ "train_samples": 88524,
6
+ "train_samples_per_second": 5.756,
7
+ "train_steps_per_second": 0.36
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.004518344478583047,
5
+ "global_step": 25,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
12
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
13
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
14
+ "compression_loss": 0.0,
15
+ "epoch": 0.0,
16
+ "label_loss": 0.5308924913406372,
17
+ "learning_rate": 4.2857142857142855e-06,
18
+ "loss": 1.8408,
19
+ "step": 1,
20
+ "teacher_loss": 1.9863686561584473
21
+ },
22
+ {
23
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
24
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
25
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
26
+ "compression_loss": 0.0,
27
+ "epoch": 0.0,
28
+ "label_loss": 0.7996352910995483,
29
+ "learning_rate": 8.571428571428571e-06,
30
+ "loss": 1.9171,
31
+ "step": 2,
32
+ "teacher_loss": 2.0412771701812744
33
+ },
34
+ {
35
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
36
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
37
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
38
+ "compression_loss": 0.0,
39
+ "epoch": 0.0,
40
+ "label_loss": 0.7973085641860962,
41
+ "learning_rate": 1.2857142857142857e-05,
42
+ "loss": 2.0705,
43
+ "step": 3,
44
+ "teacher_loss": 2.2119908332824707
45
+ },
46
+ {
47
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
48
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
49
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
50
+ "compression_loss": 0.0,
51
+ "epoch": 0.0,
52
+ "label_loss": 0.9594857692718506,
53
+ "learning_rate": 1.7142857142857142e-05,
54
+ "loss": 1.8092,
55
+ "step": 4,
56
+ "teacher_loss": 1.9035913944244385
57
+ },
58
+ {
59
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
60
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
61
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
62
+ "compression_loss": 0.0,
63
+ "epoch": 0.0,
64
+ "label_loss": 1.0630263090133667,
65
+ "learning_rate": 2.1428571428571428e-05,
66
+ "loss": 1.7453,
67
+ "step": 5,
68
+ "teacher_loss": 1.821063756942749
69
+ },
70
+ {
71
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
72
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
73
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
74
+ "compression_loss": 0.0,
75
+ "epoch": 0.0,
76
+ "label_loss": 0.7821776866912842,
77
+ "learning_rate": 2.5714285714285714e-05,
78
+ "loss": 1.8497,
79
+ "step": 6,
80
+ "teacher_loss": 1.9682769775390625
81
+ },
82
+ {
83
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
84
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
85
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
86
+ "compression_loss": 0.0,
87
+ "epoch": 0.0,
88
+ "label_loss": 0.8524608612060547,
89
+ "learning_rate": 3e-05,
90
+ "loss": 1.4181,
91
+ "step": 7,
92
+ "teacher_loss": 1.480897307395935
93
+ },
94
+ {
95
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
96
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
97
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
98
+ "compression_loss": 0.0,
99
+ "epoch": 0.0,
100
+ "label_loss": 0.6336731314659119,
101
+ "learning_rate": 2.977211629518312e-05,
102
+ "loss": 1.948,
103
+ "step": 8,
104
+ "teacher_loss": 2.094085693359375
105
+ },
106
+ {
107
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
108
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
109
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
110
+ "compression_loss": 0.0,
111
+ "epoch": 0.0,
112
+ "label_loss": 0.6002183556556702,
113
+ "learning_rate": 2.9095389311788626e-05,
114
+ "loss": 1.6932,
115
+ "step": 9,
116
+ "teacher_loss": 1.8146612644195557
117
+ },
118
+ {
119
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
120
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
121
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
122
+ "compression_loss": 0.0,
123
+ "epoch": 0.0,
124
+ "label_loss": 0.5702627301216125,
125
+ "learning_rate": 2.7990381056766583e-05,
126
+ "loss": 1.2117,
127
+ "step": 10,
128
+ "teacher_loss": 1.2829853296279907
129
+ },
130
+ {
131
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
132
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
133
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
134
+ "compression_loss": 0.0,
135
+ "epoch": 0.0,
136
+ "label_loss": 0.8245307207107544,
137
+ "learning_rate": 2.649066664678467e-05,
138
+ "loss": 1.353,
139
+ "step": 11,
140
+ "teacher_loss": 1.4116955995559692
141
+ },
142
+ {
143
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
144
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
145
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
146
+ "compression_loss": 0.0,
147
+ "epoch": 0.0,
148
+ "label_loss": 0.4113720655441284,
149
+ "learning_rate": 2.464181414529809e-05,
150
+ "loss": 1.4131,
151
+ "step": 12,
152
+ "teacher_loss": 1.524428129196167
153
+ },
154
+ {
155
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
156
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
157
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
158
+ "compression_loss": 0.0,
159
+ "epoch": 0.0,
160
+ "label_loss": 0.5899919867515564,
161
+ "learning_rate": 2.25e-05,
162
+ "loss": 1.748,
163
+ "step": 13,
164
+ "teacher_loss": 1.876697301864624
165
+ },
166
+ {
167
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
168
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
169
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
170
+ "compression_loss": 0.0,
171
+ "epoch": 0.0,
172
+ "label_loss": 0.7763683199882507,
173
+ "learning_rate": 2.0130302149885033e-05,
174
+ "loss": 1.2634,
175
+ "step": 14,
176
+ "teacher_loss": 1.317491054534912
177
+ },
178
+ {
179
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
180
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
181
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
182
+ "compression_loss": 0.0,
183
+ "epoch": 0.0,
184
+ "label_loss": 1.256364107131958,
185
+ "learning_rate": 1.760472266500396e-05,
186
+ "loss": 1.8216,
187
+ "step": 15,
188
+ "teacher_loss": 1.884453296661377
189
+ },
190
+ {
191
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
192
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
193
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
194
+ "compression_loss": 0.0,
195
+ "epoch": 0.0,
196
+ "label_loss": 0.630717396736145,
197
+ "learning_rate": 1.5e-05,
198
+ "loss": 1.052,
199
+ "step": 16,
200
+ "teacher_loss": 1.0988600254058838
201
+ },
202
+ {
203
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
204
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
205
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
206
+ "compression_loss": 0.0,
207
+ "epoch": 0.0,
208
+ "label_loss": 2.1178719997406006,
209
+ "learning_rate": 1.2395277334996045e-05,
210
+ "loss": 1.768,
211
+ "step": 17,
212
+ "teacher_loss": 1.729122519493103
213
+ },
214
+ {
215
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
216
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
217
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
218
+ "compression_loss": 0.0,
219
+ "epoch": 0.0,
220
+ "label_loss": 0.33149364590644836,
221
+ "learning_rate": 9.86969785011497e-06,
222
+ "loss": 1.0413,
223
+ "step": 18,
224
+ "teacher_loss": 1.1201426982879639
225
+ },
226
+ {
227
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
228
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
229
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
230
+ "compression_loss": 0.0,
231
+ "epoch": 0.0,
232
+ "label_loss": 1.1079105138778687,
233
+ "learning_rate": 7.500000000000004e-06,
234
+ "loss": 1.718,
235
+ "step": 19,
236
+ "teacher_loss": 1.7857446670532227
237
+ },
238
+ {
239
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
240
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
241
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
242
+ "compression_loss": 0.0,
243
+ "epoch": 0.0,
244
+ "label_loss": 1.143695592880249,
245
+ "learning_rate": 5.3581858547019095e-06,
246
+ "loss": 1.0575,
247
+ "step": 20,
248
+ "teacher_loss": 1.047965407371521
249
+ },
250
+ {
251
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
252
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
253
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
254
+ "compression_loss": 0.0,
255
+ "epoch": 0.0,
256
+ "label_loss": 0.5923559665679932,
257
+ "learning_rate": 3.5093333532153316e-06,
258
+ "loss": 0.9105,
259
+ "step": 21,
260
+ "teacher_loss": 0.9459026455879211
261
+ },
262
+ {
263
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
264
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
265
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
266
+ "compression_loss": 0.0,
267
+ "epoch": 0.0,
268
+ "label_loss": 0.4449232816696167,
269
+ "learning_rate": 2.0096189432334194e-06,
270
+ "loss": 0.9125,
271
+ "step": 22,
272
+ "teacher_loss": 0.9643977284431458
273
+ },
274
+ {
275
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
276
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
277
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
278
+ "compression_loss": 0.0,
279
+ "epoch": 0.0,
280
+ "label_loss": 1.3087965250015259,
281
+ "learning_rate": 9.046106882113753e-07,
282
+ "loss": 1.5437,
283
+ "step": 23,
284
+ "teacher_loss": 1.5698204040527344
285
+ },
286
+ {
287
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
288
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
289
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
290
+ "compression_loss": 0.0,
291
+ "epoch": 0.0,
292
+ "label_loss": 1.0976316928863525,
293
+ "learning_rate": 2.278837048168797e-07,
294
+ "loss": 1.6757,
295
+ "step": 24,
296
+ "teacher_loss": 1.7399156093597412
297
+ },
298
+ {
299
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_model": 0.7019828954219413,
300
+ "compression/statistics/magnitude_sparsity/sparsity_level_for_sparsified_layers": 0.8999995714352454,
301
+ "compression/statistics/magnitude_sparsity/target_sparsity_level": 0.0,
302
+ "compression_loss": 0.0,
303
+ "epoch": 0.0,
304
+ "label_loss": 0.7453712224960327,
305
+ "learning_rate": 0.0,
306
+ "loss": 1.3899,
307
+ "step": 25,
308
+ "teacher_loss": 1.4615265130996704
309
+ },
310
+ {
311
+ "epoch": 0.0,
312
+ "step": 25,
313
+ "total_flos": 78389685043200.0,
314
+ "train_loss": 1.526875147819519,
315
+ "train_runtime": 69.4951,
316
+ "train_samples_per_second": 5.756,
317
+ "train_steps_per_second": 0.36
318
+ }
319
+ ],
320
+ "max_steps": 25,
321
+ "num_train_epochs": 1,
322
+ "total_flos": 78389685043200.0,
323
+ "trial_name": null,
324
+ "trial_params": null
325
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a082ece292d36f3dd8f9e91064e23207e3fd464a8389e6d0a59e07c916b3cb4
3
+ size 3247
vocab.txt ADDED
The diff for this file is too large to render. See raw diff