madlag committed on
Commit
5330b37
1 Parent(s): 563c7c5

Adding modes, graphs and metadata.

Browse files
Files changed (3) hide show
  1. README.md +1 -1
  2. model_card/pruning.svg +1 -1
  3. model_meta.json +160 -0
README.md CHANGED
@@ -42,7 +42,7 @@ Here is a detailed view on how the remaining heads are distributed in the networ
42
 
43
  ## Density plot
44
 
45
- <script src="/madlag/bert-base-uncased-squad1.1-block-sparse-0.13-v1/raw/main/model_card/density.js" id="59895319-1808-4770-b2c0-616dd35ac762"></script>
46
 
47
  ## Details
48
 
 
42
 
43
  ## Density plot
44
 
45
+ <script src="/madlag/bert-base-uncased-squad1.1-block-sparse-0.13-v1/raw/main/model_card/density.js" id="2087227b-6b81-4065-969b-41ea1f61c72e"></script>
46
 
47
  ## Details
48
 
model_card/pruning.svg CHANGED
model_meta.json ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "args": {
3
+ "adam_epsilon": 1e-08,
4
+ "alpha_ce": 0.1,
5
+ "alpha_distil": 0.9,
6
+ "ampere_learning_rate": 0.01,
7
+ "ampere_mask_init": "constant",
8
+ "ampere_mask_scale": 0.0,
9
+ "ampere_pruning_method": "disabled",
10
+ "cache_dir": "",
11
+ "config_name": "",
12
+ "data_dir": "squad_data",
13
+ "do_eval": true,
14
+ "do_lower_case": true,
15
+ "do_train": true,
16
+ "doc_stride": 128,
17
+ "eval_all_checkpoints": true,
18
+ "eval_batch_size": 16,
19
+ "evaluate_during_training": false,
20
+ "final_ampere_temperature": 20,
21
+ "final_lambda": 100,
22
+ "final_shuffling_temperature": 20,
23
+ "final_threshold": 0.1,
24
+ "final_warmup": 10,
25
+ "fp16": false,
26
+ "fp16_opt_level": "O1",
27
+ "global_topk": false,
28
+ "global_topk_frequency_compute": 25,
29
+ "gradient_accumulation_steps": 1,
30
+ "in_shuffling_group": 4,
31
+ "initial_ampere_temperature": 0.0,
32
+ "initial_shuffling_temperature": 0.1,
33
+ "initial_threshold": 0.0,
34
+ "initial_warmup": 1,
35
+ "lang_id": 0,
36
+ "learning_rate": 3e-05,
37
+ "local_rank": -1,
38
+ "logging_steps": 500,
39
+ "mask_block_cols": 32,
40
+ "mask_block_rows": 32,
41
+ "mask_init": "constant",
42
+ "mask_scale": 0.0,
43
+ "mask_scores_learning_rate": 0.01,
44
+ "max_answer_length": 30,
45
+ "max_grad_norm": 1.0,
46
+ "max_query_length": 64,
47
+ "max_seq_length": 384,
48
+ "max_steps": -1,
49
+ "model_name_or_path": "bert-base-uncased",
50
+ "model_type": "masked_bert",
51
+ "n_best_size": 20,
52
+ "n_gpu": 1,
53
+ "no_cuda": false,
54
+ "null_score_diff_threshold": 0.0,
55
+ "num_train_epochs": 20.0,
56
+ "out_shuffling_group": 4,
57
+ "overwrite_cache": false,
58
+ "overwrite_output_dir": true,
59
+ "per_gpu_eval_batch_size": 16,
60
+ "per_gpu_train_batch_size": 16,
61
+ "predict_file": "dev-v1.1.json",
62
+ "pruning_method": "sigmoied_threshold",
63
+ "pruning_submethod": "default",
64
+ "regularization": "l1",
65
+ "save_steps": 5000,
66
+ "seed": 42,
67
+ "server_ip": "",
68
+ "server_port": "",
69
+ "shuffling_learning_rate": 0.001,
70
+ "shuffling_method": "disabled",
71
+ "teacher_name_or_path": "csarron/bert-base-uncased-squad-v1",
72
+ "teacher_type": "bert",
73
+ "temperature": 2.0,
74
+ "threads": 8,
75
+ "tokenizer_name": "",
76
+ "train_batch_size": 16,
77
+ "train_file": "train-v1.1.json",
78
+ "truncate_train_examples": -1,
79
+ "verbose_logging": false,
80
+ "version_2_with_negative": false,
81
+ "warmup_steps": 5400,
82
+ "weight_decay": 0.0
83
+ },
84
+ "config": {
85
+ "_name_or_path": "bert-base-uncased",
86
+ "ampere_mask_init": "constant",
87
+ "ampere_mask_scale": 0.0,
88
+ "ampere_pruning_method": "disabled",
89
+ "architectures": ["MaskedBertForQuestionAnswering"],
90
+ "attention_probs_dropout_prob": 0.1,
91
+ "hidden_act": "gelu",
92
+ "hidden_dropout_prob": 0.1,
93
+ "hidden_size": 768,
94
+ "in_shuffling_group": 4,
95
+ "initializer_range": 0.02,
96
+ "intermediate_size": 3072,
97
+ "layer_norm_eps": 1e-12,
98
+ "mask_block_cols": 32,
99
+ "mask_block_rows": 32,
100
+ "mask_init": "constant",
101
+ "mask_scale": 0.0,
102
+ "max_position_embeddings": 512,
103
+ "model_type": "masked_bert",
104
+ "num_attention_heads": 12,
105
+ "num_hidden_layers": 12,
106
+ "out_shuffling_group": 4,
107
+ "pad_token_id": 0,
108
+ "pruning_method": "sigmoied_threshold",
109
+ "pruning_submethod": "default",
110
+ "shuffling_method": "disabled",
111
+ "type_vocab_size": 2,
112
+ "vocab_size": 30522
113
+ },
114
+ "packaging": {
115
+ "model_name": "madlag/bert-base-uncased-squad1.1-block-sparse-0.13-v1",
116
+ "model_owner": "madlag",
117
+ "pytorch_final_file_size": 359300023
118
+ },
119
+ "performance": {
120
+ "dense": {
121
+ "eval_elapsed_time": 43.292644419998396
122
+ },
123
+ "pytorch_block_sparse": {
124
+ "eval_elapsed_time": 26.211968197021633
125
+ },
126
+ "speedup": 1.6516365384922744
127
+ },
128
+ "precision": {
129
+ "exact": 74.38978576660156,
130
+ "f1": 83.25814819335938
131
+ },
132
+ "sparsity": {
133
+ "ampere": false,
134
+ "block_size": [32, 32],
135
+ "block_sparse": true,
136
+ "block_sparse_density": 0.12510850694444445,
137
+ "block_sparse_nnz": 10377,
138
+ "block_sparse_total": 82944,
139
+ "global_density": 0.32128202590889765,
140
+ "is_block_sparse_valid": true,
141
+ "nnz_parameters": 35175170,
142
+ "parameters": 109483778,
143
+ "pruned_heads": {
144
+ "0": [0, 1, 2, 4, 5, 6, 7, 9, 11],
145
+ "1": [0, 1, 2, 3, 5, 6, 7, 8, 9],
146
+ "2": [1, 2, 3, 4, 5, 7, 8, 10, 11],
147
+ "3": [2, 3, 4, 6, 7, 10],
148
+ "4": [0, 1, 2, 4, 6, 7, 8, 10, 11],
149
+ "5": [0, 1, 2, 4, 5, 6, 7, 11],
150
+ "6": [0, 2, 3, 4, 6, 7, 10],
151
+ "7": [1, 2, 3, 5, 6, 7, 11],
152
+ "8": [0, 1, 2, 3, 4, 5, 6, 7, 8],
153
+ "9": [1, 3, 4, 5, 7, 9, 10],
154
+ "10": [0, 1, 2, 4, 5, 6, 7, 8, 9],
155
+ "11": [0, 2, 3, 5, 7, 8, 10, 11]
156
+ },
157
+ "total_attention_heads": 144,
158
+ "total_pruned_attention_heads": 97
159
+ }
160
+ }