Chua, Vui Seng
committed on
Commit
•
c25c3d2
1
Parent(s):
de12317
Initial model commit
Browse files- .gitattributes +1 -0
- all_results.json +11 -0
- compressed_graph.dot +0 -0
- config.json +25 -0
- eval_results.json +6 -0
- linear_layer_sparse_stats_total_12M_85.4_relative_sparsity.csv +73 -0
- nncf_bert_config_squad_mvnt_pruning-distill-run2.json +31 -0
- original_graph.dot +0 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +1 -0
- tf_eval_results.json +4 -0
- tf_model.h5 +3 -0
- tokenizer.json +0 -0
- tokenizer_config.json +1 -0
- train_results.json +8 -0
- trainer_state.json +3 -0
- training_args.bin +3 -0
- vocab.txt +0 -0
.gitattributes
CHANGED
@@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
25 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
26 |
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
27 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
25 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
26 |
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
27 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*trainer_state.json filter=lfs diff=lfs merge=lfs -text
|
all_results.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"epoch": 10.0,
|
3 |
+
"eval_exact_match": 75.55345316934721,
|
4 |
+
"eval_f1": 84.22648496225351,
|
5 |
+
"eval_samples": 10784,
|
6 |
+
"train_loss": 6.046503622577567,
|
7 |
+
"train_runtime": 62400.7352,
|
8 |
+
"train_samples": 88524,
|
9 |
+
"train_samples_per_second": 14.186,
|
10 |
+
"train_steps_per_second": 0.887
|
11 |
+
}
|
compressed_graph.dot
ADDED
The diff for this file is too large to render.
See raw diff
|
config.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "bert-base-uncased",
|
3 |
+
"architectures": [
|
4 |
+
"NNCFNetwork"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"gradient_checkpointing": false,
|
8 |
+
"hidden_act": "gelu",
|
9 |
+
"hidden_dropout_prob": 0.1,
|
10 |
+
"hidden_size": 768,
|
11 |
+
"initializer_range": 0.02,
|
12 |
+
"intermediate_size": 3072,
|
13 |
+
"layer_norm_eps": 1e-12,
|
14 |
+
"max_position_embeddings": 512,
|
15 |
+
"model_type": "bert",
|
16 |
+
"num_attention_heads": 12,
|
17 |
+
"num_hidden_layers": 12,
|
18 |
+
"pad_token_id": 0,
|
19 |
+
"position_embedding_type": "absolute",
|
20 |
+
"torch_dtype": "float32",
|
21 |
+
"transformers_version": "4.9.1",
|
22 |
+
"type_vocab_size": 2,
|
23 |
+
"use_cache": true,
|
24 |
+
"vocab_size": 30522
|
25 |
+
}
|
eval_results.json
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"epoch": 10.0,
|
3 |
+
"eval_exact_match": 75.55345316934721,
|
4 |
+
"eval_f1": 84.22648496225351,
|
5 |
+
"eval_samples": 10784
|
6 |
+
}
|
linear_layer_sparse_stats_total_12M_85.4_relative_sparsity.csv
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
,linear_id,shape,param_count,nnz_count
|
2 |
+
0,bert.encoder.layer.0.attention.self.query,"[768, 768]",589824,1757
|
3 |
+
1,bert.encoder.layer.0.attention.self.key,"[768, 768]",589824,1800
|
4 |
+
2,bert.encoder.layer.0.attention.self.value,"[768, 768]",589824,5416
|
5 |
+
3,bert.encoder.layer.0.attention.output.dense,"[768, 768]",589824,7845
|
6 |
+
4,bert.encoder.layer.0.intermediate.dense,"[3072, 768]",2359296,701532
|
7 |
+
5,bert.encoder.layer.0.output.dense,"[768, 3072]",2359296,591365
|
8 |
+
6,bert.encoder.layer.1.attention.self.query,"[768, 768]",589824,27422
|
9 |
+
7,bert.encoder.layer.1.attention.self.key,"[768, 768]",589824,27291
|
10 |
+
8,bert.encoder.layer.1.attention.self.value,"[768, 768]",589824,22259
|
11 |
+
9,bert.encoder.layer.1.attention.output.dense,"[768, 768]",589824,27236
|
12 |
+
10,bert.encoder.layer.1.intermediate.dense,"[3072, 768]",2359296,775297
|
13 |
+
11,bert.encoder.layer.1.output.dense,"[768, 3072]",2359296,599687
|
14 |
+
12,bert.encoder.layer.2.attention.self.query,"[768, 768]",589824,31292
|
15 |
+
13,bert.encoder.layer.2.attention.self.key,"[768, 768]",589824,30911
|
16 |
+
14,bert.encoder.layer.2.attention.self.value,"[768, 768]",589824,24690
|
17 |
+
15,bert.encoder.layer.2.attention.output.dense,"[768, 768]",589824,24902
|
18 |
+
16,bert.encoder.layer.2.intermediate.dense,"[3072, 768]",2359296,804492
|
19 |
+
17,bert.encoder.layer.2.output.dense,"[768, 3072]",2359296,606736
|
20 |
+
18,bert.encoder.layer.3.attention.self.query,"[768, 768]",589824,35285
|
21 |
+
19,bert.encoder.layer.3.attention.self.key,"[768, 768]",589824,38990
|
22 |
+
20,bert.encoder.layer.3.attention.self.value,"[768, 768]",589824,38379
|
23 |
+
21,bert.encoder.layer.3.attention.output.dense,"[768, 768]",589824,39416
|
24 |
+
22,bert.encoder.layer.3.intermediate.dense,"[3072, 768]",2359296,803280
|
25 |
+
23,bert.encoder.layer.3.output.dense,"[768, 3072]",2359296,592789
|
26 |
+
24,bert.encoder.layer.4.attention.self.query,"[768, 768]",589824,28239
|
27 |
+
25,bert.encoder.layer.4.attention.self.key,"[768, 768]",589824,30768
|
28 |
+
26,bert.encoder.layer.4.attention.self.value,"[768, 768]",589824,39479
|
29 |
+
27,bert.encoder.layer.4.attention.output.dense,"[768, 768]",589824,39703
|
30 |
+
28,bert.encoder.layer.4.intermediate.dense,"[3072, 768]",2359296,784797
|
31 |
+
29,bert.encoder.layer.4.output.dense,"[768, 3072]",2359296,550626
|
32 |
+
30,bert.encoder.layer.5.attention.self.query,"[768, 768]",589824,16702
|
33 |
+
31,bert.encoder.layer.5.attention.self.key,"[768, 768]",589824,21640
|
34 |
+
32,bert.encoder.layer.5.attention.self.value,"[768, 768]",589824,35016
|
35 |
+
33,bert.encoder.layer.5.attention.output.dense,"[768, 768]",589824,33592
|
36 |
+
34,bert.encoder.layer.5.intermediate.dense,"[3072, 768]",2359296,785249
|
37 |
+
35,bert.encoder.layer.5.output.dense,"[768, 3072]",2359296,530621
|
38 |
+
36,bert.encoder.layer.6.attention.self.query,"[768, 768]",589824,16901
|
39 |
+
37,bert.encoder.layer.6.attention.self.key,"[768, 768]",589824,20927
|
40 |
+
38,bert.encoder.layer.6.attention.self.value,"[768, 768]",589824,29547
|
41 |
+
39,bert.encoder.layer.6.attention.output.dense,"[768, 768]",589824,26602
|
42 |
+
40,bert.encoder.layer.6.intermediate.dense,"[3072, 768]",2359296,706133
|
43 |
+
41,bert.encoder.layer.6.output.dense,"[768, 3072]",2359296,458484
|
44 |
+
42,bert.encoder.layer.7.attention.self.query,"[768, 768]",589824,15264
|
45 |
+
43,bert.encoder.layer.7.attention.self.key,"[768, 768]",589824,21032
|
46 |
+
44,bert.encoder.layer.7.attention.self.value,"[768, 768]",589824,34524
|
47 |
+
45,bert.encoder.layer.7.attention.output.dense,"[768, 768]",589824,26950
|
48 |
+
46,bert.encoder.layer.7.intermediate.dense,"[3072, 768]",2359296,558548
|
49 |
+
47,bert.encoder.layer.7.output.dense,"[768, 3072]",2359296,366404
|
50 |
+
48,bert.encoder.layer.8.attention.self.query,"[768, 768]",589824,14420
|
51 |
+
49,bert.encoder.layer.8.attention.self.key,"[768, 768]",589824,18006
|
52 |
+
50,bert.encoder.layer.8.attention.self.value,"[768, 768]",589824,27492
|
53 |
+
51,bert.encoder.layer.8.attention.output.dense,"[768, 768]",589824,21056
|
54 |
+
52,bert.encoder.layer.8.intermediate.dense,"[3072, 768]",2359296,388041
|
55 |
+
53,bert.encoder.layer.8.output.dense,"[768, 3072]",2359296,246678
|
56 |
+
54,bert.encoder.layer.9.attention.self.query,"[768, 768]",589824,32843
|
57 |
+
55,bert.encoder.layer.9.attention.self.key,"[768, 768]",589824,31522
|
58 |
+
56,bert.encoder.layer.9.attention.self.value,"[768, 768]",589824,8125
|
59 |
+
57,bert.encoder.layer.9.attention.output.dense,"[768, 768]",589824,6000
|
60 |
+
58,bert.encoder.layer.9.intermediate.dense,"[3072, 768]",2359296,148663
|
61 |
+
59,bert.encoder.layer.9.output.dense,"[768, 3072]",2359296,82095
|
62 |
+
60,bert.encoder.layer.10.attention.self.query,"[768, 768]",589824,19588
|
63 |
+
61,bert.encoder.layer.10.attention.self.key,"[768, 768]",589824,19232
|
64 |
+
62,bert.encoder.layer.10.attention.self.value,"[768, 768]",589824,5844
|
65 |
+
63,bert.encoder.layer.10.attention.output.dense,"[768, 768]",589824,4472
|
66 |
+
64,bert.encoder.layer.10.intermediate.dense,"[3072, 768]",2359296,95918
|
67 |
+
65,bert.encoder.layer.10.output.dense,"[768, 3072]",2359296,40352
|
68 |
+
66,bert.encoder.layer.11.attention.self.query,"[768, 768]",589824,4707
|
69 |
+
67,bert.encoder.layer.11.attention.self.key,"[768, 768]",589824,5807
|
70 |
+
68,bert.encoder.layer.11.attention.self.value,"[768, 768]",589824,2160
|
71 |
+
69,bert.encoder.layer.11.attention.output.dense,"[768, 768]",589824,1310
|
72 |
+
70,bert.encoder.layer.11.intermediate.dense,"[3072, 768]",2359296,71477
|
73 |
+
71,bert.encoder.layer.11.output.dense,"[768, 3072]",2359296,26152
|
nncf_bert_config_squad_mvnt_pruning-distill-run2.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"input_info": [
|
3 |
+
{
|
4 |
+
"sample_size": [1, 384],
|
5 |
+
"type": "long"
|
6 |
+
},
|
7 |
+
{
|
8 |
+
"sample_size": [1, 384],
|
9 |
+
"type": "long"
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"sample_size": [1, 384],
|
13 |
+
"type": "long"
|
14 |
+
}
|
15 |
+
],
|
16 |
+
"compression": {
|
17 |
+
"algorithm": "movement_sparsity",
|
18 |
+
"params": {
|
19 |
+
"schedule": "poly_threshold",
|
20 |
+
"power": 3,
|
21 |
+
"init_importance_threshold": 0.0,
|
22 |
+
"final_importance_threshold": 0.1,
|
23 |
+
"warmup_start_epoch": 1,
|
24 |
+
"warmup_end_epoch": 5,
|
25 |
+
"steps_per_epoch": 5533,
|
26 |
+
"regu_final_lambda": 400,
|
27 |
+
"update_per_optimizer_step": true
|
28 |
+
},
|
29 |
+
"ignored_scopes": ["{re}.*NNCFEmbedding", "{re}.*qa_outputs*"]
|
30 |
+
}
|
31 |
+
}
|
original_graph.dot
ADDED
The diff for this file is too large to render.
See raw diff
|
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0a9066033642ddc29a673c4ddbe010d6d14f73dced3460c401ef2bfe9b7eb8f2
|
3 |
+
size 435643185
|
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
|
|
|
1 |
+
{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
|
tf_eval_results.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"exact_match": 14.257332071901608,
|
3 |
+
"f1": 23.49169853506678
|
4 |
+
}
|
tf_model.h5
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:99005555f720aa0616d0a0b2933db5848c3111fd67dee51ad708f58390392801
|
3 |
+
size 435842064
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
|
|
|
1 |
+
{"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-uncased", "tokenizer_class": "BertTokenizer"}
|
train_results.json
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"epoch": 10.0,
|
3 |
+
"train_loss": 6.046503622577567,
|
4 |
+
"train_runtime": 62400.7352,
|
5 |
+
"train_samples": 88524,
|
6 |
+
"train_samples_per_second": 14.186,
|
7 |
+
"train_steps_per_second": 0.887
|
8 |
+
}
|
trainer_state.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1f5481d7590696c650fd9792b1b96544e6616942da18a79fddd9e5444ddc25e4
|
3 |
+
size 31965164
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:86104797dbc9b497fc75531ba856e382a4d7799efe6475a2ab72782259e2ceb2
|
3 |
+
size 3055
|
vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|