{
    "force": true,
    "dump_path": "/local/musaeed/UofkDistill/Dumps/",
    "data_file": "/local/musaeed/UofkDistill/transformers/examples/research_projects/distillation/The_data/merged_data_binarized.pickle",
    "student_type": "distilbert",
    "student_config": "/local/musaeed/UofkDistill/transformers_/transformers/examples/research_projects/distillation/training_configs/distilbert-base-uncased.json",
    "student_pretrained_weights": "/local/musaeed/UofkDistill/model/checkpoint.pth",
    "teacher_type": "bert",
    "teacher_name": "asafaya/bert-large-arabic",
    "temperature": 2.0,
    "alpha_ce": 5.0,
    "alpha_mlm": 2.0,
    "alpha_clm": 0.0,
    "alpha_mse": 0.0,
    "alpha_cos": 1.0,
    "mlm": true,
    "mlm_mask_prop": 0.15,
    "word_mask": 0.8,
    "word_keep": 0.1,
    "word_rand": 0.1,
    "mlm_smoothing": 0.7,
    "token_counts": "/local/musaeed/UofkDistill/transformers/examples/research_projects/distillation/The_data/merged_token_count.pickle",
    "restrict_ce_to_mask": false,
    "freeze_pos_embs": true,
    "freeze_token_type_embds": false,
    "n_epoch": 3,
    "batch_size": 16,
    "group_by_size": true,
    "gradient_accumulation_steps": 50,
    "warmup_prop": 0.05,
    "weight_decay": 0.0,
    "learning_rate": 3e-05,
    "adam_epsilon": 1e-06,
    "max_grad_norm": 5.0,
    "initializer_range": 0.02,
    "fp16": false,
    "fp16_opt_level": "O1",
    "n_gpu": 4,
    "local_rank": 0,
    "seed": 56,
    "log_interval": 500,
    "checkpoint_interval": 10000,
    "world_size": 4,
    "n_gpu_per_node": 4,
    "global_rank": 0,
    "n_nodes": 1,
    "node_id": 0,
    "multi_gpu": true,
    "is_master": true,
    "multi_node": false
}