Chua, Vui Seng committed on
Commit
663f02a
1 Parent(s): 5c3e16e

Initial model commit

.gitattributes CHANGED
@@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zstandard filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *trainer_state.json filter=lfs diff=lfs merge=lfs -text
all_results.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "epoch": 10.0,
+   "eval_exact_match": 79.00662251655629,
+   "eval_f1": 86.94488067162003,
+   "eval_samples": 10784,
+   "train_loss": 12.042027303979902,
+   "train_runtime": 55561.5375,
+   "train_samples": 88524,
+   "train_samples_per_second": 15.933,
+   "train_steps_per_second": 0.996
+ }
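
As a quick sanity check, the figures above are mutually consistent: samples per second over steps per second implies an effective batch size of 16, which in turn reproduces the steps_per_epoch value in the NNCF config later in this commit. A minimal sketch of the arithmetic (Python; the batch size of 16 is inferred here, not stated anywhere in the commit):

    import math

    train_samples = 88524
    epochs = 10.0
    train_runtime = 55561.5375  # seconds

    # 88524 * 10 / 55561.5375 ~ 15.933, matching train_samples_per_second
    print(train_samples * epochs / train_runtime)

    # samples/s over steps/s ~ 16.0, i.e. an effective batch size of 16
    print(15.933 / 0.996)

    # ceil(88524 / 16) = 5533, matching steps_per_epoch in the NNCF config
    print(math.ceil(train_samples / 16))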
compressed_graph.dot ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "_name_or_path": "bert-base-uncased",
+   "architectures": [
+     "NNCFNetwork"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.9.1",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 30522
+ }
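
A minimal loading sketch for this checkpoint (assumptions: transformers is installed and the repository is cloned locally; the "." path is illustrative). Since "architectures" reports NNCFNetwork, the NNCF training-time wrapper, from_pretrained may warn about NNCF-specific keys in the state dict:

    from transformers import AutoTokenizer, AutoModelForQuestionAnswering

    model_dir = "."  # illustrative path to a local clone of this repository
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForQuestionAnswering.from_pretrained(model_dir)

    question = "What is pruned?"
    context = "The linear layers of BERT are pruned with movement sparsity."
    inputs = tokenizer(question, context, return_tensors="pt")
    outputs = model(**inputs)  # start_logits / end_logits for extractive QA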
eval_results.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "epoch": 10.0,
+   "eval_exact_match": 79.00662251655629,
+   "eval_f1": 86.94488067162003,
+   "eval_samples": 10784
+ }
linear_layer_sparse_stats_total_23M_72.9_relative_sparsity.csv ADDED
@@ -0,0 +1,73 @@
+ ,linear_id,shape,param_count,nnz_count
+ 0,bert.encoder.layer.0.attention.self.query,"[768, 768]",589824,16252
+ 1,bert.encoder.layer.0.attention.self.key,"[768, 768]",589824,13054
+ 2,bert.encoder.layer.0.attention.self.value,"[768, 768]",589824,31394
+ 3,bert.encoder.layer.0.attention.output.dense,"[768, 768]",589824,44074
+ 4,bert.encoder.layer.0.intermediate.dense,"[3072, 768]",2359296,1159424
+ 5,bert.encoder.layer.0.output.dense,"[768, 3072]",2359296,1061306
+ 6,bert.encoder.layer.1.attention.self.query,"[768, 768]",589824,58240
+ 7,bert.encoder.layer.1.attention.self.key,"[768, 768]",589824,54729
+ 8,bert.encoder.layer.1.attention.self.value,"[768, 768]",589824,57604
+ 9,bert.encoder.layer.1.attention.output.dense,"[768, 768]",589824,65732
+ 10,bert.encoder.layer.1.intermediate.dense,"[3072, 768]",2359296,1230849
+ 11,bert.encoder.layer.1.output.dense,"[768, 3072]",2359296,1063370
+ 12,bert.encoder.layer.2.attention.self.query,"[768, 768]",589824,76074
+ 13,bert.encoder.layer.2.attention.self.key,"[768, 768]",589824,72798
+ 14,bert.encoder.layer.2.attention.self.value,"[768, 768]",589824,69367
+ 15,bert.encoder.layer.2.attention.output.dense,"[768, 768]",589824,68078
+ 16,bert.encoder.layer.2.intermediate.dense,"[3072, 768]",2359296,1260119
+ 17,bert.encoder.layer.2.output.dense,"[768, 3072]",2359296,1081396
+ 18,bert.encoder.layer.3.attention.self.query,"[768, 768]",589824,79954
+ 19,bert.encoder.layer.3.attention.self.key,"[768, 768]",589824,84045
+ 20,bert.encoder.layer.3.attention.self.value,"[768, 768]",589824,97323
+ 21,bert.encoder.layer.3.attention.output.dense,"[768, 768]",589824,95918
+ 22,bert.encoder.layer.3.intermediate.dense,"[3072, 768]",2359296,1263350
+ 23,bert.encoder.layer.3.output.dense,"[768, 3072]",2359296,1069440
+ 24,bert.encoder.layer.4.attention.self.query,"[768, 768]",589824,77306
+ 25,bert.encoder.layer.4.attention.self.key,"[768, 768]",589824,77738
+ 26,bert.encoder.layer.4.attention.self.value,"[768, 768]",589824,103253
+ 27,bert.encoder.layer.4.attention.output.dense,"[768, 768]",589824,103673
+ 28,bert.encoder.layer.4.intermediate.dense,"[3072, 768]",2359296,1253106
+ 29,bert.encoder.layer.4.output.dense,"[768, 3072]",2359296,1031342
+ 30,bert.encoder.layer.5.attention.self.query,"[768, 768]",589824,59269
+ 31,bert.encoder.layer.5.attention.self.key,"[768, 768]",589824,66896
+ 32,bert.encoder.layer.5.attention.self.value,"[768, 768]",589824,97790
+ 33,bert.encoder.layer.5.attention.output.dense,"[768, 768]",589824,93833
+ 34,bert.encoder.layer.5.intermediate.dense,"[3072, 768]",2359296,1263747
+ 35,bert.encoder.layer.5.output.dense,"[768, 3072]",2359296,1014643
+ 36,bert.encoder.layer.6.attention.self.query,"[768, 768]",589824,59641
+ 37,bert.encoder.layer.6.attention.self.key,"[768, 768]",589824,67416
+ 38,bert.encoder.layer.6.attention.self.value,"[768, 768]",589824,95731
+ 39,bert.encoder.layer.6.attention.output.dense,"[768, 768]",589824,85998
+ 40,bert.encoder.layer.6.intermediate.dense,"[3072, 768]",2359296,1181526
+ 41,bert.encoder.layer.6.output.dense,"[768, 3072]",2359296,920703
+ 42,bert.encoder.layer.7.attention.self.query,"[768, 768]",589824,44935
+ 43,bert.encoder.layer.7.attention.self.key,"[768, 768]",589824,56868
+ 44,bert.encoder.layer.7.attention.self.value,"[768, 768]",589824,89913
+ 45,bert.encoder.layer.7.attention.output.dense,"[768, 768]",589824,74029
+ 46,bert.encoder.layer.7.intermediate.dense,"[3072, 768]",2359296,995977
+ 47,bert.encoder.layer.7.output.dense,"[768, 3072]",2359296,777939
+ 48,bert.encoder.layer.8.attention.self.query,"[768, 768]",589824,49884
+ 49,bert.encoder.layer.8.attention.self.key,"[768, 768]",589824,57649
+ 50,bert.encoder.layer.8.attention.self.value,"[768, 768]",589824,92159
+ 51,bert.encoder.layer.8.attention.output.dense,"[768, 768]",589824,71883
+ 52,bert.encoder.layer.8.intermediate.dense,"[3072, 768]",2359296,757196
+ 53,bert.encoder.layer.8.output.dense,"[768, 3072]",2359296,568698
+ 54,bert.encoder.layer.9.attention.self.query,"[768, 768]",589824,68469
+ 55,bert.encoder.layer.9.attention.self.key,"[768, 768]",589824,69557
+ 56,bert.encoder.layer.9.attention.self.value,"[768, 768]",589824,32458
+ 57,bert.encoder.layer.9.attention.output.dense,"[768, 768]",589824,23895
+ 58,bert.encoder.layer.9.intermediate.dense,"[3072, 768]",2359296,359909
+ 59,bert.encoder.layer.9.output.dense,"[768, 3072]",2359296,245729
+ 60,bert.encoder.layer.10.attention.self.query,"[768, 768]",589824,42730
+ 61,bert.encoder.layer.10.attention.self.key,"[768, 768]",589824,44139
+ 62,bert.encoder.layer.10.attention.self.value,"[768, 768]",589824,17533
+ 63,bert.encoder.layer.10.attention.output.dense,"[768, 768]",589824,12605
+ 64,bert.encoder.layer.10.intermediate.dense,"[3072, 768]",2359296,233883
+ 65,bert.encoder.layer.10.output.dense,"[768, 3072]",2359296,130882
+ 66,bert.encoder.layer.11.attention.self.query,"[768, 768]",589824,11427
+ 67,bert.encoder.layer.11.attention.self.key,"[768, 768]",589824,14775
+ 68,bert.encoder.layer.11.attention.self.value,"[768, 768]",589824,6865
+ 69,bert.encoder.layer.11.attention.output.dense,"[768, 768]",589824,3223
+ 70,bert.encoder.layer.11.intermediate.dense,"[3072, 768]",2359296,190784
+ 71,bert.encoder.layer.11.output.dense,"[768, 3072]",2359296,71760
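
The "23M" and "72.9" in the file name follow from the two numeric columns: roughly 23M of the 84,934,656 weights in these 72 linear layers survive pruning. A short sketch of the arithmetic (assuming pandas; column names as in the CSV):

    import pandas as pd

    df = pd.read_csv(
        "linear_layer_sparse_stats_total_23M_72.9_relative_sparsity.csv",
        index_col=0,
    )
    total_params = df["param_count"].sum()  # 84,934,656 dense weights
    total_nnz = df["nnz_count"].sum()       # ~23M nonzero weights remain
    print(f"relative sparsity: {1 - total_nnz / total_params:.1%}")  # ~72.9%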
nncf_bert_config_squad_mvnt_pruning-distill-run8.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "input_info": [
+     {
+       "sample_size": [1, 384],
+       "type": "long"
+     },
+     {
+       "sample_size": [1, 384],
+       "type": "long"
+     },
+     {
+       "sample_size": [1, 384],
+       "type": "long"
+     }
+   ],
+   "compression": {
+     "algorithm": "movement_sparsity",
+     "params": {
+       "schedule": "poly_threshold",
+       "power": 3,
+       "init_importance_threshold": 0.0,
+       "final_importance_threshold": 0.1,
+       "warmup_start_epoch": 1,
+       "warmup_end_epoch": 2.5,
+       "steps_per_epoch": 5533,
+       "regu_final_lambda": 400,
+       "update_per_optimizer_step": true
+     },
+     "ignored_scopes": ["{re}.*NNCFEmbedding", "{re}.*qa_outputs*"]
+   }
+ }
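
For intuition, a hedged sketch of the warmup these parameters describe: the importance threshold ramps from 0.0 to 0.1 between epochs 1 and 2.5 under a cubic ("power": 3) schedule, updated every optimizer step. This only illustrates the configured shape; NNCF's exact poly_threshold formula may differ:

    def poly_threshold(step, steps_per_epoch=5533, power=3,
                       init_t=0.0, final_t=0.1,
                       warmup_start=1.0, warmup_end=2.5):
        """Illustrative polynomial ramp of the importance threshold."""
        epoch = step / steps_per_epoch
        if epoch <= warmup_start:
            return init_t
        if epoch >= warmup_end:
            return final_t
        progress = (epoch - warmup_start) / (warmup_end - warmup_start)
        # rises quickly at first, then flattens toward final_t
        return init_t + (final_t - init_t) * (1.0 - (1.0 - progress) ** power)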
original_graph.dot ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:600721be352243b4f8c1751c5c5b00d348401f728ee94076ed7b8baeb9730696
+ size 435643185
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tf_eval_results.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "exact_match": 31.05960264900662,
+   "f1": 39.84460263383556
+ }
tf_model.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c6882972800d1a921103ecfe0ae42607d1f675a092d58878441bc071b85b3e0a
+ size 435842064
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-uncased", "tokenizer_class": "BertTokenizer"}
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "epoch": 10.0,
+   "train_loss": 12.042027303979902,
+   "train_runtime": 55561.5375,
+   "train_samples": 88524,
+   "train_samples_per_second": 15.933,
+   "train_steps_per_second": 0.996
+ }
trainer_state.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f9771dceb7b8cca0a6d0861451e815fd0dbaf5518a4ab1f6246522437ab67719
+ size 31647642
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:19466a89c86a5728654b838639bee6be304cd6da6c20fa07ca9bde40b079d3ce
+ size 3055
vocab.txt ADDED
The diff for this file is too large to render. See raw diff