Chua, Vui Seng
committed on
Commit
•
663f02a
1
Parent(s):
5c3e16e
Initial model commit
Browse files- .gitattributes +1 -0
- all_results.json +11 -0
- compressed_graph.dot +0 -0
- config.json +25 -0
- eval_results.json +6 -0
- linear_layer_sparse_stats_total_23M_72.9_relative_sparsity.csv +73 -0
- nncf_bert_config_squad_mvnt_pruning-distill-run8.json +31 -0
- original_graph.dot +0 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +1 -0
- tf_eval_results.json +4 -0
- tf_model.h5 +3 -0
- tokenizer.json +0 -0
- tokenizer_config.json +1 -0
- train_results.json +8 -0
- trainer_state.json +3 -0
- training_args.bin +3 -0
- vocab.txt +0 -0
.gitattributes
CHANGED
@@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
25 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
26 |
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
27 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
25 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
26 |
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
27 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*trainer_state.json filter=lfs diff=lfs merge=lfs -text
|
all_results.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"epoch": 10.0,
|
3 |
+
"eval_exact_match": 79.00662251655629,
|
4 |
+
"eval_f1": 86.94488067162003,
|
5 |
+
"eval_samples": 10784,
|
6 |
+
"train_loss": 12.042027303979902,
|
7 |
+
"train_runtime": 55561.5375,
|
8 |
+
"train_samples": 88524,
|
9 |
+
"train_samples_per_second": 15.933,
|
10 |
+
"train_steps_per_second": 0.996
|
11 |
+
}
|
compressed_graph.dot
ADDED
The diff for this file is too large to render.
See raw diff
|
|
config.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "bert-base-uncased",
|
3 |
+
"architectures": [
|
4 |
+
"NNCFNetwork"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"gradient_checkpointing": false,
|
8 |
+
"hidden_act": "gelu",
|
9 |
+
"hidden_dropout_prob": 0.1,
|
10 |
+
"hidden_size": 768,
|
11 |
+
"initializer_range": 0.02,
|
12 |
+
"intermediate_size": 3072,
|
13 |
+
"layer_norm_eps": 1e-12,
|
14 |
+
"max_position_embeddings": 512,
|
15 |
+
"model_type": "bert",
|
16 |
+
"num_attention_heads": 12,
|
17 |
+
"num_hidden_layers": 12,
|
18 |
+
"pad_token_id": 0,
|
19 |
+
"position_embedding_type": "absolute",
|
20 |
+
"torch_dtype": "float32",
|
21 |
+
"transformers_version": "4.9.1",
|
22 |
+
"type_vocab_size": 2,
|
23 |
+
"use_cache": true,
|
24 |
+
"vocab_size": 30522
|
25 |
+
}
|
eval_results.json
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"epoch": 10.0,
|
3 |
+
"eval_exact_match": 79.00662251655629,
|
4 |
+
"eval_f1": 86.94488067162003,
|
5 |
+
"eval_samples": 10784
|
6 |
+
}
|
linear_layer_sparse_stats_total_23M_72.9_relative_sparsity.csv
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
,linear_id,shape,param_count,nnz_count
|
2 |
+
0,bert.encoder.layer.0.attention.self.query,"[768, 768]",589824,16252
|
3 |
+
1,bert.encoder.layer.0.attention.self.key,"[768, 768]",589824,13054
|
4 |
+
2,bert.encoder.layer.0.attention.self.value,"[768, 768]",589824,31394
|
5 |
+
3,bert.encoder.layer.0.attention.output.dense,"[768, 768]",589824,44074
|
6 |
+
4,bert.encoder.layer.0.intermediate.dense,"[3072, 768]",2359296,1159424
|
7 |
+
5,bert.encoder.layer.0.output.dense,"[768, 3072]",2359296,1061306
|
8 |
+
6,bert.encoder.layer.1.attention.self.query,"[768, 768]",589824,58240
|
9 |
+
7,bert.encoder.layer.1.attention.self.key,"[768, 768]",589824,54729
|
10 |
+
8,bert.encoder.layer.1.attention.self.value,"[768, 768]",589824,57604
|
11 |
+
9,bert.encoder.layer.1.attention.output.dense,"[768, 768]",589824,65732
|
12 |
+
10,bert.encoder.layer.1.intermediate.dense,"[3072, 768]",2359296,1230849
|
13 |
+
11,bert.encoder.layer.1.output.dense,"[768, 3072]",2359296,1063370
|
14 |
+
12,bert.encoder.layer.2.attention.self.query,"[768, 768]",589824,76074
|
15 |
+
13,bert.encoder.layer.2.attention.self.key,"[768, 768]",589824,72798
|
16 |
+
14,bert.encoder.layer.2.attention.self.value,"[768, 768]",589824,69367
|
17 |
+
15,bert.encoder.layer.2.attention.output.dense,"[768, 768]",589824,68078
|
18 |
+
16,bert.encoder.layer.2.intermediate.dense,"[3072, 768]",2359296,1260119
|
19 |
+
17,bert.encoder.layer.2.output.dense,"[768, 3072]",2359296,1081396
|
20 |
+
18,bert.encoder.layer.3.attention.self.query,"[768, 768]",589824,79954
|
21 |
+
19,bert.encoder.layer.3.attention.self.key,"[768, 768]",589824,84045
|
22 |
+
20,bert.encoder.layer.3.attention.self.value,"[768, 768]",589824,97323
|
23 |
+
21,bert.encoder.layer.3.attention.output.dense,"[768, 768]",589824,95918
|
24 |
+
22,bert.encoder.layer.3.intermediate.dense,"[3072, 768]",2359296,1263350
|
25 |
+
23,bert.encoder.layer.3.output.dense,"[768, 3072]",2359296,1069440
|
26 |
+
24,bert.encoder.layer.4.attention.self.query,"[768, 768]",589824,77306
|
27 |
+
25,bert.encoder.layer.4.attention.self.key,"[768, 768]",589824,77738
|
28 |
+
26,bert.encoder.layer.4.attention.self.value,"[768, 768]",589824,103253
|
29 |
+
27,bert.encoder.layer.4.attention.output.dense,"[768, 768]",589824,103673
|
30 |
+
28,bert.encoder.layer.4.intermediate.dense,"[3072, 768]",2359296,1253106
|
31 |
+
29,bert.encoder.layer.4.output.dense,"[768, 3072]",2359296,1031342
|
32 |
+
30,bert.encoder.layer.5.attention.self.query,"[768, 768]",589824,59269
|
33 |
+
31,bert.encoder.layer.5.attention.self.key,"[768, 768]",589824,66896
|
34 |
+
32,bert.encoder.layer.5.attention.self.value,"[768, 768]",589824,97790
|
35 |
+
33,bert.encoder.layer.5.attention.output.dense,"[768, 768]",589824,93833
|
36 |
+
34,bert.encoder.layer.5.intermediate.dense,"[3072, 768]",2359296,1263747
|
37 |
+
35,bert.encoder.layer.5.output.dense,"[768, 3072]",2359296,1014643
|
38 |
+
36,bert.encoder.layer.6.attention.self.query,"[768, 768]",589824,59641
|
39 |
+
37,bert.encoder.layer.6.attention.self.key,"[768, 768]",589824,67416
|
40 |
+
38,bert.encoder.layer.6.attention.self.value,"[768, 768]",589824,95731
|
41 |
+
39,bert.encoder.layer.6.attention.output.dense,"[768, 768]",589824,85998
|
42 |
+
40,bert.encoder.layer.6.intermediate.dense,"[3072, 768]",2359296,1181526
|
43 |
+
41,bert.encoder.layer.6.output.dense,"[768, 3072]",2359296,920703
|
44 |
+
42,bert.encoder.layer.7.attention.self.query,"[768, 768]",589824,44935
|
45 |
+
43,bert.encoder.layer.7.attention.self.key,"[768, 768]",589824,56868
|
46 |
+
44,bert.encoder.layer.7.attention.self.value,"[768, 768]",589824,89913
|
47 |
+
45,bert.encoder.layer.7.attention.output.dense,"[768, 768]",589824,74029
|
48 |
+
46,bert.encoder.layer.7.intermediate.dense,"[3072, 768]",2359296,995977
|
49 |
+
47,bert.encoder.layer.7.output.dense,"[768, 3072]",2359296,777939
|
50 |
+
48,bert.encoder.layer.8.attention.self.query,"[768, 768]",589824,49884
|
51 |
+
49,bert.encoder.layer.8.attention.self.key,"[768, 768]",589824,57649
|
52 |
+
50,bert.encoder.layer.8.attention.self.value,"[768, 768]",589824,92159
|
53 |
+
51,bert.encoder.layer.8.attention.output.dense,"[768, 768]",589824,71883
|
54 |
+
52,bert.encoder.layer.8.intermediate.dense,"[3072, 768]",2359296,757196
|
55 |
+
53,bert.encoder.layer.8.output.dense,"[768, 3072]",2359296,568698
|
56 |
+
54,bert.encoder.layer.9.attention.self.query,"[768, 768]",589824,68469
|
57 |
+
55,bert.encoder.layer.9.attention.self.key,"[768, 768]",589824,69557
|
58 |
+
56,bert.encoder.layer.9.attention.self.value,"[768, 768]",589824,32458
|
59 |
+
57,bert.encoder.layer.9.attention.output.dense,"[768, 768]",589824,23895
|
60 |
+
58,bert.encoder.layer.9.intermediate.dense,"[3072, 768]",2359296,359909
|
61 |
+
59,bert.encoder.layer.9.output.dense,"[768, 3072]",2359296,245729
|
62 |
+
60,bert.encoder.layer.10.attention.self.query,"[768, 768]",589824,42730
|
63 |
+
61,bert.encoder.layer.10.attention.self.key,"[768, 768]",589824,44139
|
64 |
+
62,bert.encoder.layer.10.attention.self.value,"[768, 768]",589824,17533
|
65 |
+
63,bert.encoder.layer.10.attention.output.dense,"[768, 768]",589824,12605
|
66 |
+
64,bert.encoder.layer.10.intermediate.dense,"[3072, 768]",2359296,233883
|
67 |
+
65,bert.encoder.layer.10.output.dense,"[768, 3072]",2359296,130882
|
68 |
+
66,bert.encoder.layer.11.attention.self.query,"[768, 768]",589824,11427
|
69 |
+
67,bert.encoder.layer.11.attention.self.key,"[768, 768]",589824,14775
|
70 |
+
68,bert.encoder.layer.11.attention.self.value,"[768, 768]",589824,6865
|
71 |
+
69,bert.encoder.layer.11.attention.output.dense,"[768, 768]",589824,3223
|
72 |
+
70,bert.encoder.layer.11.intermediate.dense,"[3072, 768]",2359296,190784
|
73 |
+
71,bert.encoder.layer.11.output.dense,"[768, 3072]",2359296,71760
|
nncf_bert_config_squad_mvnt_pruning-distill-run8.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"input_info": [
|
3 |
+
{
|
4 |
+
"sample_size": [1, 384],
|
5 |
+
"type": "long"
|
6 |
+
},
|
7 |
+
{
|
8 |
+
"sample_size": [1, 384],
|
9 |
+
"type": "long"
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"sample_size": [1, 384],
|
13 |
+
"type": "long"
|
14 |
+
}
|
15 |
+
],
|
16 |
+
"compression": {
|
17 |
+
"algorithm": "movement_sparsity",
|
18 |
+
"params": {
|
19 |
+
"schedule": "poly_threshold",
|
20 |
+
"power": 3,
|
21 |
+
"init_importance_threshold": 0.0,
|
22 |
+
"final_importance_threshold": 0.1,
|
23 |
+
"warmup_start_epoch": 1,
|
24 |
+
"warmup_end_epoch": 2.5,
|
25 |
+
"steps_per_epoch": 5533,
|
26 |
+
"regu_final_lambda": 400,
|
27 |
+
"update_per_optimizer_step": true
|
28 |
+
},
|
29 |
+
"ignored_scopes": ["{re}.*NNCFEmbedding", "{re}.*qa_outputs*"]
|
30 |
+
}
|
31 |
+
}
|
original_graph.dot
ADDED
The diff for this file is too large to render.
See raw diff
|
|
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:600721be352243b4f8c1751c5c5b00d348401f728ee94076ed7b8baeb9730696
|
3 |
+
size 435643185
|
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
|
tf_eval_results.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"exact_match": 31.05960264900662,
|
3 |
+
"f1": 39.84460263383556
|
4 |
+
}
|
tf_model.h5
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c6882972800d1a921103ecfe0ae42607d1f675a092d58878441bc071b85b3e0a
|
3 |
+
size 435842064
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-uncased", "tokenizer_class": "BertTokenizer"}
|
train_results.json
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"epoch": 10.0,
|
3 |
+
"train_loss": 12.042027303979902,
|
4 |
+
"train_runtime": 55561.5375,
|
5 |
+
"train_samples": 88524,
|
6 |
+
"train_samples_per_second": 15.933,
|
7 |
+
"train_steps_per_second": 0.996
|
8 |
+
}
|
trainer_state.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f9771dceb7b8cca0a6d0861451e815fd0dbaf5518a4ab1f6246522437ab67719
|
3 |
+
size 31647642
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:19466a89c86a5728654b838639bee6be304cd6da6c20fa07ca9bde40b079d3ce
|
3 |
+
size 3055
|
vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|