{
  "ampere_pruning_method": "disabled",
  "attention_block_cols": 32,
  "attention_block_rows": 32,
  "attention_lambda": 1.0,
  "attention_output_with_dense": 0,
  "attention_pruning_method": "sigmoied_threshold",
  "bias_mask": true,
  "decoder_attention_lambda": null,
  "decoder_dense_lambda": null,
  "dense_block_cols": 1,
  "dense_block_rows": 1,
  "dense_lambda": 1.0,
  "dense_pruning_method": "sigmoied_threshold:1d_alt",
  "distil_alpha_ce": 0.1,
  "distil_alpha_teacher": 0.9,
  "distil_teacher_name_or_path": "madlag/bert-large-uncased-whole-word-masking-finetuned-squadv2",
  "distil_temperature": 2.0,
  "eval_with_current_patch_params": false,
  "final_ampere_temperature": 20.0,
  "final_finetune": false,
  "final_threshold": 0.1,
  "final_warmup": 10,
  "gelu_patch": 0,
  "gelu_patch_steps": 50000,
  "initial_ampere_temperature": 0.0,
  "initial_threshold": 0,
  "initial_warmup": 1,
  "layer_norm_patch": 0,
  "layer_norm_patch_start_delta": 0.99,
  "layer_norm_patch_steps": 50000,
  "linear_min_parameters": 0.005,
  "mask_init": "constant",
  "mask_scale": 0.0,
  "mask_scores_learning_rate": 0.01,
  "qat": false,
  "qconfig": "default",
  "regularization": "l1",
  "regularization_final_lambda": 2,
  "rewind_model_name_or_path": null
}
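
For reference, a minimal sketch of loading and inspecting this configuration in Python using only the standard library; the file name `sparse_args.json` is an assumption, and the keys printed simply mirror the JSON above:

```python
import json

# Load the pruning / distillation configuration from disk
# (the file name "sparse_args.json" is an assumption).
with open("sparse_args.json") as f:
    sparse_args = json.load(f)

# A few of the settings that drive the pruning schedule and distillation:
print(sparse_args["attention_pruning_method"])    # "sigmoied_threshold"
print(sparse_args["dense_pruning_method"])        # "sigmoied_threshold:1d_alt"
print(sparse_args["final_threshold"])             # 0.1
print(sparse_args["distil_teacher_name_or_path"]) # teacher model used for distillation
print(sparse_args["regularization"],
      sparse_args["regularization_final_lambda"]) # "l1" 2
```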