{
    "args": {
        "adam_epsilon": 1e-08, 
        "alpha_ce": 0.1, 
        "alpha_distil": 0.9, 
        "ampere_learning_rate": 0.01, 
        "ampere_mask_init": "constant", 
        "ampere_mask_scale": 0.0, 
        "ampere_pruning_method": "disabled", 
        "cache_dir": "", 
        "config_name": "", 
        "data_dir": "squad_data", 
        "do_eval": true, 
        "do_lower_case": true, 
        "do_train": true, 
        "doc_stride": 128, 
        "eval_all_checkpoints": true, 
        "eval_batch_size": 16, 
        "evaluate_during_training": false, 
        "final_ampere_temperature": 20, 
        "final_lambda": 50, 
        "final_shuffling_temperature": 20, 
        "final_threshold": 0.1, 
        "final_warmup": 10, 
        "fp16": false, 
        "fp16_opt_level": "O1", 
        "global_topk": false, 
        "global_topk_frequency_compute": 25, 
        "gradient_accumulation_steps": 1, 
        "in_shuffling_group": 4, 
        "initial_ampere_temperature": 0.0, 
        "initial_shuffling_temperature": 0.1, 
        "initial_threshold": 0.0, 
        "initial_warmup": 1, 
        "lang_id": 0, 
        "learning_rate": 3e-05, 
        "local_rank": -1, 
        "logging_steps": 500, 
        "mask_block_cols": 32, 
        "mask_block_rows": 32, 
        "mask_init": "constant", 
        "mask_scale": 0.0, 
        "mask_scores_learning_rate": 0.01, 
        "max_answer_length": 30, 
        "max_grad_norm": 1.0, 
        "max_query_length": 64, 
        "max_seq_length": 384, 
        "max_steps": -1, 
        "model_name_or_path": "bert-base-uncased", 
        "model_type": "masked_bert", 
        "n_best_size": 20, 
        "n_gpu": 1, 
        "no_cuda": false, 
        "null_score_diff_threshold": 0.0, 
        "num_train_epochs": 20.0, 
        "out_shuffling_group": 4, 
        "overwrite_cache": false, 
        "overwrite_output_dir": true, 
        "per_gpu_eval_batch_size": 16, 
        "per_gpu_train_batch_size": 16, 
        "predict_file": "dev-v1.1.json", 
        "pruning_method": "sigmoied_threshold", 
        "pruning_submethod": "default", 
        "regularization": "l1", 
        "save_steps": 5000, 
        "seed": 42, 
        "server_ip": "", 
        "server_port": "", 
        "shuffling_learning_rate": 0.001, 
        "shuffling_method": "disabled", 
        "teacher_name_or_path": "csarron/bert-base-uncased-squad-v1", 
        "teacher_type": "bert", 
        "temperature": 2.0, 
        "threads": 8, 
        "tokenizer_name": "", 
        "train_batch_size": 16, 
        "train_file": "train-v1.1.json", 
        "truncate_train_examples": -1, 
        "verbose_logging": false, 
        "version_2_with_negative": false, 
        "warmup_steps": 5400, 
        "weight_decay": 0.0
    }, 
    "config": {
        "_name_or_path": "bert-base-uncased", 
        "ampere_mask_init": "constant", 
        "ampere_mask_scale": 0.0, 
        "ampere_pruning_method": "disabled", 
        "architectures": ["MaskedBertForQuestionAnswering"], 
        "attention_probs_dropout_prob": 0.1, 
        "hidden_act": "gelu", 
        "hidden_dropout_prob": 0.1, 
        "hidden_size": 768, 
        "in_shuffling_group": 4, 
        "initializer_range": 0.02, 
        "intermediate_size": 3072, 
        "layer_norm_eps": 1e-12, 
        "mask_block_cols": 32, 
        "mask_block_rows": 32, 
        "mask_init": "constant", 
        "mask_scale": 0.0, 
        "max_position_embeddings": 512, 
        "model_type": "masked_bert", 
        "num_attention_heads": 12, 
        "num_hidden_layers": 12, 
        "out_shuffling_group": 4, 
        "pad_token_id": 0, 
        "pruning_method": "sigmoied_threshold", 
        "pruning_submethod": "default", 
        "shuffling_method": "disabled", 
        "type_vocab_size": 2, 
        "vocab_size": 30522
    }, 
    "packaging": {
        "model_name": "madlag/bert-base-uncased-squad1.1-block-sparse-0.20-v1", 
        "model_owner": "madlag", 
        "pytorch_final_file_size": 364810487
    }, 
    "performance": {
        "dense": {
            "eval_elapsed_time": 42.67642272700323
        }, 
        "pytorch_block_sparse": {
            "eval_elapsed_time": 30.794714744988596
        }, 
        "speedup": 1.3858359488115801
    }, 
    "precision": {
        "exact": 76.98202514648438, 
        "f1": 85.4483871459961
    }, 
    "sparsity": {
        "ampere": false, 
        "block_size": [32, 32], 
        "block_sparse": true, 
        "block_sparse_density": 0.2017264660493827, 
        "block_sparse_nnz": 16732, 
        "block_sparse_total": 82944, 
        "global_density": 0.3807202378419934, 
        "is_block_sparse_valid": true, 
        "nnz_parameters": 41682690, 
        "parameters": 109483778, 
        "pruned_heads": {
            "0": [0, 2, 4, 5, 6, 7, 9, 11], 
            "1": [0, 1, 2, 3, 5, 6, 7, 8, 9], 
            "2": [1, 2, 3, 4, 5, 7, 8, 11], 
            "3": [2, 3, 4, 6, 7, 10], 
            "4": [0, 1, 2, 6, 7, 8, 10, 11], 
            "5": [0, 1, 2, 4, 5, 6, 7, 11], 
            "6": [0, 2, 3, 4, 6, 7, 10], 
            "7": [1, 3, 6, 7, 11], 
            "8": [0, 2, 3, 4, 5, 6, 7, 8], 
            "9": [1, 3, 4, 5, 7, 9, 10], 
            "10": [1, 4, 5, 6, 7, 8, 9], 
            "11": [0, 2, 3, 5, 6, 7, 8, 10, 11]
        }, 
        "total_attention_heads": 144, 
        "total_pruned_attention_heads": 90
    }
}