{ "force": true, "dump_path": "data/distilgpt2-e7", "student_type": "gpt2", "student_config": "distilgpt2-ja.json", "student_pretrained_weights": "data/distilgpt2-e6/checkpoint.pth", "teacher_type": "gpt2", "teacher_name": "rinna/japanese-gpt2-medium", "temperature": 2.0, "alpha_ce": 5.0, "alpha_mlm": 0.0, "alpha_clm": 0.5, "alpha_mse": 0.0, "alpha_cos": 1.0, "mlm": false, "mlm_mask_prop": 0.15, "word_mask": 0.8, "word_keep": 0.1, "word_rand": 0.1, "mlm_smoothing": 0.7, "restrict_ce_to_mask": false, "freeze_pos_embs": true, "freeze_token_type_embds": false, "n_epoch": 2, "batch_size": 16, "group_by_size": false, "gradient_accumulation_steps": 50, "warmup_prop": 0.0, "weight_decay": 0.0, "learning_rate": 4.9e-05, "adam_epsilon": 1e-06, "max_grad_norm": 5.0, "initializer_range": 0.02, "fp16": false, "fp16_opt_level": "O1", "n_gpu": 4, "local_rank": 0, "seed": 56, "log_interval": 500, "checkpoint_interval": 4000, "world_size": 4, "n_gpu_per_node": 4, "global_rank": 0, "n_nodes": 1, "node_id": 0, "multi_gpu": true, "is_master": true, "multi_node": false }