{ "args": { "adam_epsilon": 1e-08, "alpha_ce": 0.1, "alpha_distil": 0.9, "ampere_learning_rate": 0.01, "ampere_mask_init": "constant", "ampere_mask_scale": 0.0, "ampere_pruning_method": "disabled", "cache_dir": "", "config_name": "", "data_dir": "squad_data", "do_eval": true, "do_lower_case": true, "do_train": true, "doc_stride": 128, "eval_all_checkpoints": true, "eval_batch_size": 16, "evaluate_during_training": false, "final_ampere_temperature": 20, "final_lambda": 200, "final_shuffling_temperature": 20, "final_threshold": 0.1, "final_warmup": 10, "fp16": false, "fp16_opt_level": "O1", "global_topk": false, "global_topk_frequency_compute": 25, "gradient_accumulation_steps": 1, "in_shuffling_group": 4, "initial_ampere_temperature": 0.0, "initial_shuffling_temperature": 0.1, "initial_threshold": 0.0, "initial_warmup": 1, "lang_id": 0, "learning_rate": 3e-05, "local_rank": -1, "logging_steps": 500, "mask_block_cols": 32, "mask_block_rows": 32, "mask_init": "constant", "mask_scale": 0.0, "mask_scores_learning_rate": 0.01, "max_answer_length": 30, "max_grad_norm": 1.0, "max_query_length": 64, "max_seq_length": 384, "max_steps": -1, "model_name_or_path": "bert-base-uncased", "model_type": "masked_bert", "n_best_size": 20, "n_gpu": 1, "no_cuda": false, "null_score_diff_threshold": 0.0, "num_train_epochs": 20.0, "out_shuffling_group": 4, "overwrite_cache": false, "overwrite_output_dir": true, "per_gpu_eval_batch_size": 16, "per_gpu_train_batch_size": 16, "predict_file": "dev-v1.1.json", "pruning_method": "sigmoied_threshold", "pruning_submethod": "default", "regularization": "l1", "save_steps": 5000, "seed": 42, "server_ip": "", "server_port": "", "shuffling_learning_rate": 0.001, "shuffling_method": "disabled", "teacher_name_or_path": "csarron/bert-base-uncased-squad-v1", "teacher_type": "bert", "temperature": 2.0, "threads": 8, "tokenizer_name": "", "train_batch_size": 16, "train_file": "train-v1.1.json", "truncate_train_examples": -1, "verbose_logging": false, "version_2_with_negative": false, "warmup_steps": 5400, "weight_decay": 0.0 }, "config": { "_name_or_path": "bert-base-uncased", "ampere_mask_init": "constant", "ampere_mask_scale": 0.0, "ampere_pruning_method": "disabled", "architectures": ["MaskedBertForQuestionAnswering"], "attention_probs_dropout_prob": 0.1, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "in_shuffling_group": 4, "initializer_range": 0.02, "intermediate_size": 3072, "layer_norm_eps": 1e-12, "mask_block_cols": 32, "mask_block_rows": 32, "mask_init": "constant", "mask_scale": 0.0, "max_position_embeddings": 512, "model_type": "masked_bert", "num_attention_heads": 12, "num_hidden_layers": 12, "out_shuffling_group": 4, "pad_token_id": 0, "pruning_method": "sigmoied_threshold", "pruning_submethod": "default", "shuffling_method": "disabled", "type_vocab_size": 2, "vocab_size": 30522 }, "packaging": { "model_name": "madlag/bert-base-uncased-squad1.1-block-sparse-0.07-v1", "model_owner": "madlag", "pytorch_final_file_size": 352215223 }, "performance": { "dense": { "eval_elapsed_time": 43.41997216496384 }, "pytorch_block_sparse": { "eval_elapsed_time": 22.587281233048998 }, "speedup": 1.922319544214693 }, "precision": { "exact": 71.8826904296875, "f1": 81.3593978881836 }, "sparsity": { "ampere": false, "block_size": [32, 32], "block_sparse": true, "block_sparse_density": 0.07493007330246913, "block_sparse_nnz": 6215, "block_sparse_total": 82944, "global_density": 0.2823549074092054, "is_block_sparse_valid": true, "nnz_parameters": 30913282, "parameters": 109483778, "pruned_heads": { "0": [0, 1, 2, 4, 5, 6, 7, 9, 11], "1": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10], "2": [1, 2, 3, 4, 5, 7, 8, 10, 11], "3": [2, 3, 4, 6, 7, 9, 10, 11], "4": [0, 1, 2, 4, 6, 7, 8, 9, 10, 11], "5": [0, 1, 2, 4, 5, 6, 7, 11], "6": [0, 1, 2, 3, 4, 5, 6, 7, 10, 11], "7": [1, 2, 3, 5, 6, 7, 11], "8": [0, 1, 2, 3, 4, 5, 6, 7, 8], "9": [1, 3, 4, 5, 7, 9, 10, 11], "10": [0, 1, 2, 4, 5, 6, 7, 8, 9], "11": [0, 2, 3, 5, 7, 8, 9, 10, 11] }, "total_attention_heads": 144, "total_pruned_attention_heads": 106 } }