helenai committed
Commit d87fc97
1 Parent(s): 8f00ff7
README.md ADDED
@@ -0,0 +1,49 @@
+ ---
+ license: apache-2.0
+ tags:
+ - generated_from_trainer
+ datasets:
+ - squad
+ model-index:
+ - name: bert-base-uncased-squad-v1-jpqd-ov-int8
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # bert-base-uncased-squad-v1-jpqd-ov-int8
+
+ This model is a fine-tuned version of [bert-base-uncased](https://huggingface.co/bert-base-uncased) on the squad dataset.
+ It was compressed using [NNCF](https://github.com/openvinotoolkit/nncf) with [Optimum
+ Intel](https://github.com/huggingface/optimum-intel#openvino) following the [JPQD question-answering
+ example](https://github.com/huggingface/optimum-intel/tree/main/examples/openvino/question-answering#joint-pruning-quantization-and-distillation-jpqd-for-bert-on-squad10).
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 3e-05
+ - train_batch_size: 16
+ - eval_batch_size: 128
+ - seed: 42
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: linear
+ - num_epochs: 8.0
+ - mixed_precision_training: Native AMP
+
+ ### Training results
+
+ ```
+ ***** eval metrics *****
+   epoch            =     8.0
+   eval_exact_match =  83.141
+   eval_f1          = 89.5906
+   eval_samples     =   10784
+ ```
+
+ ### Framework versions
+
+ - Transformers 4.26.1
+ - Pytorch 1.13.1+cu117
+ - Datasets 2.8.0
+ - Tokenizers 0.13.2
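Since this commit adds both the OpenVINO IR (`openvino_model.xml`/`openvino_model.bin`) and the tokenizer files, the model can be loaded with Optimum Intel and used through a standard `transformers` pipeline. The snippet below is a minimal usage sketch, not part of this commit; it assumes the repository id `helenai/bert-base-uncased-squad-v1-jpqd-ov-int8` (inferred from the model name above) and an environment with the OpenVINO extra of Optimum installed.

```python
# Minimal usage sketch (not part of this commit). Assumes the repo id below and
# an environment prepared with: pip install "optimum[openvino]"
from transformers import AutoTokenizer, pipeline
from optimum.intel.openvino import OVModelForQuestionAnswering

model_id = "helenai/bert-base-uncased-squad-v1-jpqd-ov-int8"  # assumed repo id
model = OVModelForQuestionAnswering.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The OVModel class plugs into the regular transformers pipeline API.
qa = pipeline("question-answering", model=model, tokenizer=tokenizer)
result = qa(
    question="Which toolkit was used to compress the model?",
    context="The model was compressed with NNCF using joint pruning, quantization and distillation.",
)
print(result["answer"], result["score"])
```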
all_results.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "epoch": 8.0,
+   "eval_exact_match": 83.14096499526963,
+   "eval_f1": 89.59061048191492,
+   "eval_samples": 10784,
+   "train_loss": 2.368001897127539,
+   "train_runtime": 49077.3955,
+   "train_samples": 88524,
+   "train_samples_per_second": 14.43,
+   "train_steps_per_second": 0.902
+ }
config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "_name_or_path": "bert-base-uncased",
+   "architectures": [
+     "NNCFNetwork"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.26.1",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 30522
+ }
openvino_config.json ADDED
@@ -0,0 +1,99 @@
+ {
+   "compression": [
+     {
+       "algorithm": "movement_sparsity",
+       "ignored_scopes": [
+         "{re}.*NNCFEmbedding.*",
+         "{re}.*qa_outputs.*",
+         "{re}.*LayerNorm.*"
+       ],
+       "params": {
+         "enable_structured_masking": true,
+         "importance_regularization_factor": 0.02,
+         "warmup_end_epoch": 4,
+         "warmup_start_epoch": 1
+       },
+       "sparse_structure_by_scopes": [
+         {
+           "mode": "block",
+           "sparse_factors": [
+             32,
+             32
+           ],
+           "target_scopes": "{re}.*BertAttention.*"
+         },
+         {
+           "axis": 0,
+           "mode": "per_dim",
+           "target_scopes": "{re}.*BertIntermediate.*"
+         },
+         {
+           "axis": 1,
+           "mode": "per_dim",
+           "target_scopes": "{re}.*BertOutput.*"
+         }
+       ]
+     },
+     {
+       "algorithm": "quantization",
+       "export_to_onnx_standard_ops": false,
+       "ignored_scopes": [
+         "{re}.*__add___[0-1]",
+         "{re}.*layer_norm_0",
+         "{re}.*matmul_1",
+         "{re}.*__truediv__*"
+       ],
+       "initializer": {
+         "batchnorm_adaptation": {
+           "num_bn_adaptation_samples": 200
+         },
+         "range": {
+           "num_init_samples": 32,
+           "params": {
+             "max_percentile": 99.99,
+             "min_percentile": 0.01
+           },
+           "type": "percentile"
+         }
+       },
+       "overflow_fix": "disable",
+       "preset": "mixed",
+       "scope_overrides": {
+         "activations": {
+           "{re}.*matmul_0": {
+             "mode": "symmetric"
+           }
+         }
+       }
+     }
+   ],
+   "input_info": [
+     {
+       "keyword": "input_ids",
+       "sample_size": [
+         16,
+         384
+       ],
+       "type": "long"
+     },
+     {
+       "keyword": "token_type_ids",
+       "sample_size": [
+         16,
+         384
+       ],
+       "type": "long"
+     },
+     {
+       "keyword": "attention_mask",
+       "sample_size": [
+         16,
+         384
+       ],
+       "type": "long"
+     }
+   ],
+   "optimum_version": "1.6.4",
+   "save_onnx_model": false,
+   "transformers_version": "4.26.1"
+ }
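The `compression` list above chains NNCF movement sparsity with INT8 quantization, which is the JPQD recipe referenced in the README. Roughly, the linked optimum-intel question-answering example consumes such a config through `OVConfig` and `OVTrainer`. The sketch below is illustrative only: class and argument names are assumed from optimum-intel 1.6, the teacher model is an example choice, and this is not the exact script that produced this commit.

```python
# Illustrative sketch of how a compression config like openvino_config.json is
# consumed during JPQD training with Optimum Intel (assumed optimum-intel 1.6 API;
# not the exact training script behind this commit).
import json
from transformers import AutoModelForQuestionAnswering, TrainingArguments
from optimum.intel.openvino import OVConfig, OVTrainer

with open("openvino_config.json") as f:
    compression = json.load(f)["compression"]  # movement_sparsity + quantization

ov_config = OVConfig(compression=compression)

model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")
# Distillation teacher (the "D" in JPQD); an example choice, not confirmed by this repo.
teacher = AutoModelForQuestionAnswering.from_pretrained(
    "bert-large-uncased-whole-word-masking-finetuned-squad"
)

trainer = OVTrainer(
    model=model,
    teacher_model=teacher,
    ov_config=ov_config,
    task="question-answering",
    args=TrainingArguments(
        output_dir="jpqd-bert-squad",
        learning_rate=3e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=128,
        num_train_epochs=8.0,
        fp16=True,
    ),
    # train_dataset, eval_dataset, tokenizer and data_collator are built from the
    # squad dataset as in the linked question-answering example (omitted here).
)
# trainer.train() followed by trainer.save_model() would then export the compressed model.
```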
openvino_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:81f4e1eba20221fd9fcc6bb8820422413f8413123d7418b0a303fc262449ce59
+ size 75477404
openvino_model.xml ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
structured_sparsity.csv ADDED
@@ -0,0 +1,73 @@
+ ,group_id,type,torch_module,weight_shape,pruned_weight_shape,bias_shape,pruned_bias_shape,head_or_channel_id_to_keep,module_node_name
+ 0,0,MHSA,nncf_module.bert.encoder.layer.0.attention.self.query,"(768, 768)","(512, 768)","(768,)","(512,)","[0, 3, 4, 6, 7, 8, 9, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
+ 1,0,MHSA,nncf_module.bert.encoder.layer.0.attention.self.key,"(768, 768)","(512, 768)","(768,)","(512,)","[0, 3, 4, 6, 7, 8, 9, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
+ 2,0,MHSA,nncf_module.bert.encoder.layer.0.attention.self.value,"(768, 768)","(512, 768)","(768,)","(512,)","[0, 3, 4, 6, 7, 8, 9, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
+ 3,0,MHSA,nncf_module.bert.encoder.layer.0.attention.output.dense,"(768, 768)","(768, 512)","(768,)","(768,)","[0, 3, 4, 6, 7, 8, 9, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
+ 4,1,FF,nncf_module.bert.encoder.layer.0.intermediate.dense,"(3072, 768)","(2066, 768)","(3072,)","(2066,)",[2066 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
+ 5,1,FF,nncf_module.bert.encoder.layer.0.output.dense,"(768, 3072)","(768, 2066)","(768,)","(768,)",[2066 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertOutput[output]/NNCFLinear[dense]/linear_0
+ 6,2,MHSA,nncf_module.bert.encoder.layer.1.attention.self.query,"(768, 768)","(448, 768)","(768,)","(448,)","[1, 4, 6, 7, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
+ 7,2,MHSA,nncf_module.bert.encoder.layer.1.attention.self.key,"(768, 768)","(448, 768)","(768,)","(448,)","[1, 4, 6, 7, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
+ 8,2,MHSA,nncf_module.bert.encoder.layer.1.attention.self.value,"(768, 768)","(448, 768)","(768,)","(448,)","[1, 4, 6, 7, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
+ 9,2,MHSA,nncf_module.bert.encoder.layer.1.attention.output.dense,"(768, 768)","(768, 448)","(768,)","(768,)","[1, 4, 6, 7, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
+ 10,3,FF,nncf_module.bert.encoder.layer.1.intermediate.dense,"(3072, 768)","(2067, 768)","(3072,)","(2067,)",[2067 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
+ 11,3,FF,nncf_module.bert.encoder.layer.1.output.dense,"(768, 3072)","(768, 2067)","(768,)","(768,)",[2067 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertOutput[output]/NNCFLinear[dense]/linear_0
+ 12,4,MHSA,nncf_module.bert.encoder.layer.2.attention.self.query,"(768, 768)","(576, 768)","(768,)","(576,)","[0, 1, 2, 3, 5, 6, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
+ 13,4,MHSA,nncf_module.bert.encoder.layer.2.attention.self.key,"(768, 768)","(576, 768)","(768,)","(576,)","[0, 1, 2, 3, 5, 6, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
+ 14,4,MHSA,nncf_module.bert.encoder.layer.2.attention.self.value,"(768, 768)","(576, 768)","(768,)","(576,)","[0, 1, 2, 3, 5, 6, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
+ 15,4,MHSA,nncf_module.bert.encoder.layer.2.attention.output.dense,"(768, 768)","(768, 576)","(768,)","(768,)","[0, 1, 2, 3, 5, 6, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
+ 16,5,FF,nncf_module.bert.encoder.layer.2.intermediate.dense,"(3072, 768)","(2082, 768)","(3072,)","(2082,)",[2082 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
+ 17,5,FF,nncf_module.bert.encoder.layer.2.output.dense,"(768, 3072)","(768, 2082)","(768,)","(768,)",[2082 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertOutput[output]/NNCFLinear[dense]/linear_0
+ 18,6,MHSA,nncf_module.bert.encoder.layer.3.attention.self.query,"(768, 768)","(576, 768)","(768,)","(576,)","[0, 1, 3, 5, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
+ 19,6,MHSA,nncf_module.bert.encoder.layer.3.attention.self.key,"(768, 768)","(576, 768)","(768,)","(576,)","[0, 1, 3, 5, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
+ 20,6,MHSA,nncf_module.bert.encoder.layer.3.attention.self.value,"(768, 768)","(576, 768)","(768,)","(576,)","[0, 1, 3, 5, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
+ 21,6,MHSA,nncf_module.bert.encoder.layer.3.attention.output.dense,"(768, 768)","(768, 576)","(768,)","(768,)","[0, 1, 3, 5, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
+ 22,7,FF,nncf_module.bert.encoder.layer.3.intermediate.dense,"(3072, 768)","(2136, 768)","(3072,)","(2136,)",[2136 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
+ 23,7,FF,nncf_module.bert.encoder.layer.3.output.dense,"(768, 3072)","(768, 2136)","(768,)","(768,)",[2136 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertOutput[output]/NNCFLinear[dense]/linear_0
+ 24,8,MHSA,nncf_module.bert.encoder.layer.4.attention.self.query,"(768, 768)","(640, 768)","(768,)","(640,)","[0, 1, 3, 4, 5, 6, 7, 8, 9, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
+ 25,8,MHSA,nncf_module.bert.encoder.layer.4.attention.self.key,"(768, 768)","(640, 768)","(768,)","(640,)","[0, 1, 3, 4, 5, 6, 7, 8, 9, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
+ 26,8,MHSA,nncf_module.bert.encoder.layer.4.attention.self.value,"(768, 768)","(640, 768)","(768,)","(640,)","[0, 1, 3, 4, 5, 6, 7, 8, 9, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
+ 27,8,MHSA,nncf_module.bert.encoder.layer.4.attention.output.dense,"(768, 768)","(768, 640)","(768,)","(768,)","[0, 1, 3, 4, 5, 6, 7, 8, 9, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
+ 28,9,FF,nncf_module.bert.encoder.layer.4.intermediate.dense,"(3072, 768)","(2023, 768)","(3072,)","(2023,)",[2023 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
+ 29,9,FF,nncf_module.bert.encoder.layer.4.output.dense,"(768, 3072)","(768, 2023)","(768,)","(768,)",[2023 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertOutput[output]/NNCFLinear[dense]/linear_0
+ 30,10,MHSA,nncf_module.bert.encoder.layer.5.attention.self.query,"(768, 768)","(576, 768)","(768,)","(576,)","[0, 3, 4, 5, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
+ 31,10,MHSA,nncf_module.bert.encoder.layer.5.attention.self.key,"(768, 768)","(576, 768)","(768,)","(576,)","[0, 3, 4, 5, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
+ 32,10,MHSA,nncf_module.bert.encoder.layer.5.attention.self.value,"(768, 768)","(576, 768)","(768,)","(576,)","[0, 3, 4, 5, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
+ 33,10,MHSA,nncf_module.bert.encoder.layer.5.attention.output.dense,"(768, 768)","(768, 576)","(768,)","(768,)","[0, 3, 4, 5, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
+ 34,11,FF,nncf_module.bert.encoder.layer.5.intermediate.dense,"(3072, 768)","(2011, 768)","(3072,)","(2011,)",[2011 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
+ 35,11,FF,nncf_module.bert.encoder.layer.5.output.dense,"(768, 3072)","(768, 2011)","(768,)","(768,)",[2011 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertOutput[output]/NNCFLinear[dense]/linear_0
+ 36,12,MHSA,nncf_module.bert.encoder.layer.6.attention.self.query,"(768, 768)","(448, 768)","(768,)","(448,)","[1, 4, 5, 6, 8, 9, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
+ 37,12,MHSA,nncf_module.bert.encoder.layer.6.attention.self.key,"(768, 768)","(448, 768)","(768,)","(448,)","[1, 4, 5, 6, 8, 9, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
+ 38,12,MHSA,nncf_module.bert.encoder.layer.6.attention.self.value,"(768, 768)","(448, 768)","(768,)","(448,)","[1, 4, 5, 6, 8, 9, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
+ 39,12,MHSA,nncf_module.bert.encoder.layer.6.attention.output.dense,"(768, 768)","(768, 448)","(768,)","(768,)","[1, 4, 5, 6, 8, 9, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
+ 40,13,FF,nncf_module.bert.encoder.layer.6.intermediate.dense,"(3072, 768)","(1871, 768)","(3072,)","(1871,)",[1871 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
+ 41,13,FF,nncf_module.bert.encoder.layer.6.output.dense,"(768, 3072)","(768, 1871)","(768,)","(768,)",[1871 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertOutput[output]/NNCFLinear[dense]/linear_0
+ 42,14,MHSA,nncf_module.bert.encoder.layer.7.attention.self.query,"(768, 768)","(448, 768)","(768,)","(448,)","[0, 2, 4, 5, 8, 9, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
+ 43,14,MHSA,nncf_module.bert.encoder.layer.7.attention.self.key,"(768, 768)","(448, 768)","(768,)","(448,)","[0, 2, 4, 5, 8, 9, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
+ 44,14,MHSA,nncf_module.bert.encoder.layer.7.attention.self.value,"(768, 768)","(448, 768)","(768,)","(448,)","[0, 2, 4, 5, 8, 9, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
+ 45,14,MHSA,nncf_module.bert.encoder.layer.7.attention.output.dense,"(768, 768)","(768, 448)","(768,)","(768,)","[0, 2, 4, 5, 8, 9, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
+ 46,15,FF,nncf_module.bert.encoder.layer.7.intermediate.dense,"(3072, 768)","(1858, 768)","(3072,)","(1858,)",[1858 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
+ 47,15,FF,nncf_module.bert.encoder.layer.7.output.dense,"(768, 3072)","(768, 1858)","(768,)","(768,)",[1858 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertOutput[output]/NNCFLinear[dense]/linear_0
+ 48,16,MHSA,nncf_module.bert.encoder.layer.8.attention.self.query,"(768, 768)","(576, 768)","(768,)","(576,)","[1, 2, 3, 5, 6, 7, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
+ 49,16,MHSA,nncf_module.bert.encoder.layer.8.attention.self.key,"(768, 768)","(576, 768)","(768,)","(576,)","[1, 2, 3, 5, 6, 7, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
+ 50,16,MHSA,nncf_module.bert.encoder.layer.8.attention.self.value,"(768, 768)","(576, 768)","(768,)","(576,)","[1, 2, 3, 5, 6, 7, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
+ 51,16,MHSA,nncf_module.bert.encoder.layer.8.attention.output.dense,"(768, 768)","(768, 576)","(768,)","(768,)","[1, 2, 3, 5, 6, 7, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
+ 52,17,FF,nncf_module.bert.encoder.layer.8.intermediate.dense,"(3072, 768)","(1637, 768)","(3072,)","(1637,)",[1637 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
+ 53,17,FF,nncf_module.bert.encoder.layer.8.output.dense,"(768, 3072)","(768, 1637)","(768,)","(768,)",[1637 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertOutput[output]/NNCFLinear[dense]/linear_0
+ 54,18,MHSA,nncf_module.bert.encoder.layer.9.attention.self.query,"(768, 768)","(448, 768)","(768,)","(448,)","[0, 2, 3, 6, 8, 9, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
+ 55,18,MHSA,nncf_module.bert.encoder.layer.9.attention.self.key,"(768, 768)","(448, 768)","(768,)","(448,)","[0, 2, 3, 6, 8, 9, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
+ 56,18,MHSA,nncf_module.bert.encoder.layer.9.attention.self.value,"(768, 768)","(448, 768)","(768,)","(448,)","[0, 2, 3, 6, 8, 9, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
+ 57,18,MHSA,nncf_module.bert.encoder.layer.9.attention.output.dense,"(768, 768)","(768, 448)","(768,)","(768,)","[0, 2, 3, 6, 8, 9, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
+ 58,19,FF,nncf_module.bert.encoder.layer.9.intermediate.dense,"(3072, 768)","(1257, 768)","(3072,)","(1257,)",[1257 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
+ 59,19,FF,nncf_module.bert.encoder.layer.9.output.dense,"(768, 3072)","(768, 1257)","(768,)","(768,)",[1257 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertOutput[output]/NNCFLinear[dense]/linear_0
+ 60,20,MHSA,nncf_module.bert.encoder.layer.10.attention.self.query,"(768, 768)","(384, 768)","(768,)","(384,)","[0, 2, 3, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
+ 61,20,MHSA,nncf_module.bert.encoder.layer.10.attention.self.key,"(768, 768)","(384, 768)","(768,)","(384,)","[0, 2, 3, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
+ 62,20,MHSA,nncf_module.bert.encoder.layer.10.attention.self.value,"(768, 768)","(384, 768)","(768,)","(384,)","[0, 2, 3, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
+ 63,20,MHSA,nncf_module.bert.encoder.layer.10.attention.output.dense,"(768, 768)","(768, 384)","(768,)","(768,)","[0, 2, 3, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
+ 64,21,FF,nncf_module.bert.encoder.layer.10.intermediate.dense,"(3072, 768)","(1159, 768)","(3072,)","(1159,)",[1159 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
+ 65,21,FF,nncf_module.bert.encoder.layer.10.output.dense,"(768, 3072)","(768, 1159)","(768,)","(768,)",[1159 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertOutput[output]/NNCFLinear[dense]/linear_0
+ 66,22,MHSA,nncf_module.bert.encoder.layer.11.attention.self.query,"(768, 768)","(384, 768)","(768,)","(384,)","[0, 1, 2, 4, 6, 9]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
+ 67,22,MHSA,nncf_module.bert.encoder.layer.11.attention.self.key,"(768, 768)","(384, 768)","(768,)","(384,)","[0, 1, 2, 4, 6, 9]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
+ 68,22,MHSA,nncf_module.bert.encoder.layer.11.attention.self.value,"(768, 768)","(384, 768)","(768,)","(384,)","[0, 1, 2, 4, 6, 9]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
+ 69,22,MHSA,nncf_module.bert.encoder.layer.11.attention.output.dense,"(768, 768)","(768, 384)","(768,)","(768,)","[0, 1, 2, 4, 6, 9]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
+ 70,23,FF,nncf_module.bert.encoder.layer.11.intermediate.dense,"(3072, 768)","(1017, 768)","(3072,)","(1017,)",[1017 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
+ 71,23,FF,nncf_module.bert.encoder.layer.11.output.dense,"(768, 3072)","(768, 1017)","(768,)","(768,)",[1017 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertOutput[output]/NNCFLinear[dense]/linear_0
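For a quick sanity check of the report above, the pruned and original shapes in each row can be compared directly. The pandas sketch below is illustrative and not part of this commit; it relies only on the column names visible in the CSV header.

```python
# Illustrative sketch (not part of this commit): summarize how much of each
# linear layer survives structured pruning, using structured_sparsity.csv.
import ast
import pandas as pd

df = pd.read_csv("structured_sparsity.csv", index_col=0)

def numel(shape_str: str) -> int:
    """Number of elements in a shape stored as a string, e.g. '(512, 768)'."""
    dims = ast.literal_eval(shape_str)
    n = 1
    for d in dims:
        n *= d
    return n

df["kept_fraction"] = df["pruned_weight_shape"].map(numel) / df["weight_shape"].map(numel)
# Average fraction of weights kept in attention (MHSA) vs feed-forward (FF) blocks
print(df.groupby("type")["kept_fraction"].mean())
```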
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "cls_token": "[CLS]",
+   "do_lower_case": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "name_or_path": "bert-base-uncased",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "special_tokens_map_file": null,
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d5552aa8097d9d47b0343cdacd36e5bc50157c344cb9488c095d5eb9a1bc83f5
+ size 3579
vocab.txt ADDED
The diff for this file is too large to render. See raw diff