{
"args": {
"adam_epsilon": 1e-08,
"alpha_ce": 0.1,
"alpha_distil": 0.9,
"ampere_learning_rate": 0.01,
"ampere_mask_init": "constant",
"ampere_mask_scale": 0.0,
"ampere_pruning_method": "disabled",
"cache_dir": "",
"config_name": "",
"data_dir": "squad_data",
"do_eval": true,
"do_lower_case": true,
"do_train": true,
"doc_stride": 128,
"eval_all_checkpoints": true,
"eval_batch_size": 16,
"evaluate_during_training": false,
"final_ampere_temperature": 20,
"final_lambda": 50,
"final_shuffling_temperature": 20,
"final_threshold": 0.1,
"final_warmup": 10,
"fp16": false,
"fp16_opt_level": "O1",
"global_topk": false,
"global_topk_frequency_compute": 25,
"gradient_accumulation_steps": 1,
"in_shuffling_group": 4,
"initial_ampere_temperature": 0.0,
"initial_shuffling_temperature": 0.1,
"initial_threshold": 0.0,
"initial_warmup": 1,
"lang_id": 0,
"learning_rate": 3e-05,
"local_rank": -1,
"logging_steps": 500,
"mask_block_cols": 32,
"mask_block_rows": 32,
"mask_init": "constant",
"mask_scale": 0.0,
"mask_scores_learning_rate": 0.01,
"max_answer_length": 30,
"max_grad_norm": 1.0,
"max_query_length": 64,
"max_seq_length": 384,
"max_steps": -1,
"model_name_or_path": "bert-base-uncased",
"model_type": "masked_bert",
"n_best_size": 20,
"n_gpu": 1,
"no_cuda": false,
"null_score_diff_threshold": 0.0,
"num_train_epochs": 20.0,
"out_shuffling_group": 4,
"overwrite_cache": false,
"overwrite_output_dir": true,
"per_gpu_eval_batch_size": 16,
"per_gpu_train_batch_size": 16,
"predict_file": "dev-v1.1.json",
"pruning_method": "sigmoied_threshold",
"pruning_submethod": "default",
"regularization": "l1",
"save_steps": 5000,
"seed": 42,
"server_ip": "",
"server_port": "",
"shuffling_learning_rate": 0.001,
"shuffling_method": "disabled",
"teacher_name_or_path": "csarron/bert-base-uncased-squad-v1",
"teacher_type": "bert",
"temperature": 2.0,
"threads": 8,
"tokenizer_name": "",
"train_batch_size": 16,
"train_file": "train-v1.1.json",
"truncate_train_examples": -1,
"verbose_logging": false,
"version_2_with_negative": false,
"warmup_steps": 5400,
"weight_decay": 0.0
},
"config": {
"_name_or_path": "bert-base-uncased",
"ampere_mask_init": "constant",
"ampere_mask_scale": 0.0,
"ampere_pruning_method": "disabled",
"architectures": ["MaskedBertForQuestionAnswering"],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"in_shuffling_group": 4,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"mask_block_cols": 32,
"mask_block_rows": 32,
"mask_init": "constant",
"mask_scale": 0.0,
"max_position_embeddings": 512,
"model_type": "masked_bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"out_shuffling_group": 4,
"pad_token_id": 0,
"pruning_method": "sigmoied_threshold",
"pruning_submethod": "default",
"shuffling_method": "disabled",
"type_vocab_size": 2,
"vocab_size": 30522
},
"packaging": {
"model_name": "madlag/bert-base-uncased-squad1.1-block-sparse-0.20-v1",
"model_owner": "madlag",
"pytorch_final_file_size": 364810487
},
"performance": {
"dense": {
"eval_elapsed_time": 42.67642272700323
},
"pytorch_block_sparse": {
"eval_elapsed_time": 30.794714744988596
},
"speedup": 1.3858359488115801
},
"precision": {
"exact": 76.98202514648438,
"f1": 85.4483871459961
},
"sparsity": {
"ampere": false,
"block_size": [32, 32],
"block_sparse": true,
"block_sparse_density": 0.2017264660493827,
"block_sparse_nnz": 16732,
"block_sparse_total": 82944,
"global_density": 0.3807202378419934,
"is_block_sparse_valid": true,
"nnz_parameters": 41682690,
"parameters": 109483778,
"pruned_heads": {
"0": [0, 2, 4, 5, 6, 7, 9, 11],
"1": [0, 1, 2, 3, 5, 6, 7, 8, 9],
"2": [1, 2, 3, 4, 5, 7, 8, 11],
"3": [2, 3, 4, 6, 7, 10],
"4": [0, 1, 2, 6, 7, 8, 10, 11],
"5": [0, 1, 2, 4, 5, 6, 7, 11],
"6": [0, 2, 3, 4, 6, 7, 10],
"7": [1, 3, 6, 7, 11],
"8": [0, 2, 3, 4, 5, 6, 7, 8],
"9": [1, 3, 4, 5, 7, 9, 10],
"10": [1, 4, 5, 6, 7, 8, 9],
"11": [0, 2, 3, 5, 6, 7, 8, 10, 11]
},
"total_attention_heads": 144,
"total_pruned_attention_heads": 90
}
}
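The derived figures above (block density, global density, speedup, pruned-head total) follow directly from the raw counts in the same file. A minimal Python sketch that recomputes them, assuming the JSON above is saved locally as meta.json (hypothetical filename):

```python
# Sketch: recompute the derived metrics from the raw counts in the metadata above,
# assuming it has been saved locally as "meta.json" (hypothetical filename).
import json

with open("meta.json") as f:
    meta = json.load(f)

sparsity = meta["sparsity"]
perf = meta["performance"]

# Block-level density: non-zero 32x32 blocks over total blocks in the pruned linear layers.
block_density = sparsity["block_sparse_nnz"] / sparsity["block_sparse_total"]
assert abs(block_density - sparsity["block_sparse_density"]) < 1e-9

# Global density: remaining parameters over total parameters (embeddings included).
global_density = sparsity["nnz_parameters"] / sparsity["parameters"]
assert abs(global_density - sparsity["global_density"]) < 1e-9

# Speedup: dense evaluation wall time over block-sparse evaluation wall time.
speedup = (perf["dense"]["eval_elapsed_time"]
           / perf["pytorch_block_sparse"]["eval_elapsed_time"])
assert abs(speedup - perf["speedup"]) < 1e-9

# Pruned attention heads: the per-layer lists sum to the reported total (90 of 144).
pruned = sum(len(heads) for heads in sparsity["pruned_heads"].values())
assert pruned == sparsity["total_pruned_attention_heads"]
assert sparsity["total_attention_heads"] == 144

print(f"block density {block_density:.4f}, "
      f"global density {global_density:.4f}, "
      f"speedup {speedup:.3f}x")
```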