{ "accumulate_gradients": 1, "ae_steps": [], "amp": 1, "architectures": [ "XLMModel" ], "asm": false, "attention_dropout": 0.1, "batch_size": 64, "beam_size": 1, "bos_index": 0, "bos_token_id": 0, "bptt": 256, "bt_src_langs": [], "bt_steps": [], "causal": false, "clip_grad_norm": 5, "clm_steps": [], "command": "python train.py --local_rank=0 --exp_name unihan_zh_ja --dump_path '/mnt/exp/ft_char' --data_path 'data/processed/xlm_zh_ja/new' --lgs 'zh-ja' --clm_steps '' --mlm_steps 'ja,zh' --emb_dim 1024 --n_layers 12 --n_heads 16 --dropout '0.1' --attention_dropout '0.1' --gelu_activation true --batch_size 64 --bptt 256 --optimizer 'adam_inverse_sqrt,lr=0.00005,warmup_updates=30000,beta1=0.9,beta2=0.999,weight_decay=0.01,eps=0.000001' --epoch_size 300000 --max_epoch 100000 --validation_metrics _valid_mlm_ppl --stopping_criterion '_valid_mlm_ppl,25' --fp16 true --amp 1 --exp_id epoch169 --reload_model '/mnt/exp/hard_pretrain/unihan_zh_ja/recycled/converted-best-valid_mlm_ppl.pth' --exp_id \"epoch169\"", "context_size": 0, "data_path": "data/processed/xlm_zh_ja/new", "debug": false, "debug_slurm": false, "debug_train": false, "dropout": 0.1, "dump_path": "/mnt/exp/ft_char/unihan_zh_ja/epoch169", "emb_dim": 1024, "embed_init_std": 0.02209708691207961, "encoder_only": true, "end_n_top": 5, "eos_index": 1, "epoch_size": 300000, "eval_bleu": false, "eval_only": false, "exp_id": "epoch169", "exp_name": "unihan_zh_ja", "fp16": true, "gelu_activation": true, "global_rank": 0, "group_by_size": true, "hyp_path": "/mnt/exp/ft_char/unihan_zh_ja/epoch169/hypotheses", "id2lang": { "0": "ja", "1": "zh" }, "init_std": 0.02, "is_encoder": true, "is_master": true, "is_slurm_job": false, "lambda_ae": 1.0, "lambda_ae_config": null, "lambda_bt": 1.0, "lambda_bt_config": null, "lambda_clm": 1.0, "lambda_clm_config": null, "lambda_mlm": 1.0, "lambda_mlm_config": null, "lambda_mt": 1.0, "lambda_mt_config": null, "lambda_pc": 1.0, "lambda_pc_config": null, "lang2id": { "ja": 0, "zh": 1 }, "lang_id": 0, "langs": [ "zh", "ja" ], "layer_norm_eps": 1e-12, "lg_sampling_factor": -1, "lgs": "zh-ja", "local_rank": 0, "mask_index": 5, "mask_token_id": 0, "master_port": -1, "max_batch_size": 0, "max_epoch": 100000, "max_len": 100, "max_position_embeddings": 512, "max_vocab": -1, "min_count": 0, "mlm_steps": [ [ "ja", null ], [ "zh", null ] ], "model_type": "xlm", "mono_dataset": { "ja": { "test": "data/processed/xlm_zh_ja/new/test.ja.pth", "train": "data/processed/xlm_zh_ja/new/train.ja.pth", "valid": "data/processed/xlm_zh_ja/new/valid.ja.pth" }, "zh": { "test": "data/processed/xlm_zh_ja/new/test.zh.pth", "train": "data/processed/xlm_zh_ja/new/train.zh.pth", "valid": "data/processed/xlm_zh_ja/new/valid.zh.pth" } }, "mt_steps": [], "multi_gpu": true, "multi_node": false, "n_gpu_per_node": 8, "n_heads": 16, "n_langs": 2, "n_layers": 12, "n_nodes": 1, "node_id": 0, "optimizer": "adam_inverse_sqrt,lr=0.00005,warmup_updates=30000,beta1=0.9,beta2=0.999,weight_decay=0.01,eps=0.000001", "pad_index": 2, "pad_token_id": 2, "para_dataset": {}, "pc_steps": [], "ref_paths": {}, "reload_checkpoint": "", "reload_emb": "", "reload_model": "/mnt/exp/hard_pretrain/unihan_zh_ja/recycled/converted-best-valid_mlm_ppl.pth", "sample_alpha": 0, "save_periodic": 0, "share_inout_emb": true, "sinusoidal_embeddings": false, "split_data": false, "start_n_top": 5, "stopping_criterion": "_valid_mlm_ppl,25", "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "summary_type": "first", "summary_use_proj": true, "tokens_per_batch": -1, "unk_index": 3, "use_lang_emb": true, "use_memory": false, "validation_metrics": "_valid_mlm_ppl", "vocab_size": 24044, "word_blank": 0, "word_dropout": 0, "word_keep": 0.1, "word_mask": 0.8, "word_mask_keep_rand": "0.8,0.1,0.1", "word_pred": 0.15, "word_rand": 0.1, "word_shuffle": 0, "world_size": 8 }