---
# Pre-training run configuration (DeBERTa-v2 xxlarge, multiple-choice objective).
#
# NOTE(review): this file was restored from a whitespace-collapsed dump in
# which all keys sat on two physical lines. Nesting was reconstructed from
# key order and from keys that would otherwise be duplicated at one level
# (num_workers, max_seq_length, optimizer, gradient_accumulation_steps,
# fp16). Placements that could not be proven are marked individually below.

# Input data
train_file: wiki_erica_path/v7/union/train_distant.path_v7.train.0.pkl
dev_file: wiki_erica_path/v7/union/train_distant.path_v7.dev.pkl
test_file: null

# Model constructor, referenced by dotted path, and its keyword arguments.
model:
  _target_: models.deberta.DebertaV2ForMultipleChoicePreTrain.from_pretrained
  mlp_hidden_size: 3072
  fs_checkpoint: false
  fs_checkpoint_offload_to_cpu: false

# Example-to-feature conversion callable and its keyword arguments.
read_tensor:
  _target_: dataset.wiki_entity_path_v8_2.convert_examples_into_features
  max_neg_num: 3
  aug_num: 1
  max_seq_length: 256
  shuffle_context: true
  min_rep_num: 5
  geo_p: 0.4
  deduct_ratio: 1.0
  context_ratio: 1.0
  num_workers: 32

# NOTE(review): assumed top-level rather than a read_tensor argument — confirm.
extended_vocab: null

# Batch collator, referenced by dotted path, and its keyword arguments.
collator:
  _target_: dataset.wiki_entity_path_v8.WikiPathDatasetCollatorWithContext
  max_seq_length: 256
  tokenizer: pretrained-models/deberta-v2-xxlarge
  mlm_probability: 0.15
  max_option_num: 4
  swap: true

# NOTE(review): assumed to be top-level data-loader settings rather than
# collator arguments — confirm against the training script.
num_workers: 4
prefetch_factor: 4

model_name_or_path: pretrained-models/deberta-v2-xxlarge
pretrain: null

# `${seed}` is interpolated from the top-level `seed` key.
output_dir: experiments/deberta.v2.xxlarge.path.v7_v8.2.2.1aug.ctx.A100.v1.3.w4.s${seed}.fsdp.adamw

# NOTE(review): `Train` is a string, not a boolean — possibly a typo for
# `true`. Left unchanged; confirm how the consumer reads it before editing.
do_train: Train
evaluate_during_training: true
do_eval: false
eval_sub_path: null
do_preprocess: false

# Training hyper-parameters
per_gpu_train_batch_size: 2
per_gpu_eval_batch_size: 2
learning_rate: 1.0e-05
gradient_accumulation_steps: 512
weight_decay: 0.01
adam_epsilon: 1.0e-06
# Quoted so it stays a single string; the parsed value is unchanged.
adam_betas: '(0.9, 0.999)'
max_grad_norm: 1.0
num_train_epochs: 1
max_steps: 200
warmup_proportion: 0.2
warmup_steps: 0

# Optimizer switches (all unset in this run)
optimizer: null
use_nvlamb: null
bit_training: null
multi_tensor: null

# Logging, checkpointing, and evaluation cadence (in steps)
logging_steps: 1
save_steps: 50
eval_steps: 50

no_cuda: false
seed: 42
local_rank: 0
fp16: true
fp16_opt_level: O2

# Nested engine configuration; most values are interpolated from the
# top-level keys above so the two stay in sync.
# NOTE(review): key names match the DeepSpeed config schema — confirm that is
# the consumer.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas:
        - 0.9
        - 0.999
      eps: ${adam_epsilon}
      weight_decay: ${weight_decay}
  scheduler:
    type: WarmupDecayLR
    params:
      total_num_steps: null
      warmup_max_lr: ${learning_rate}
      warmup_num_steps: null
      warmup_type: linear
  gradient_clipping: ${max_grad_norm}
  fp16:
    enabled: ${fp16}
    initial_scale_power: 12
  zero_optimization:
    stage: 3
  # NOTE(review): assumed to belong to ds_cfg rather than the top level —
  # confirm.
  steps_per_print: 1024

# NOTE(review): assumed to be top-level parameter-sharding options (not part
# of ds_cfg) — confirm against the training script.
reshard_after_forward: false
flatten_parameters: true
move_grads_to_cpu: false
move_params_to_cpu: false

# NOTE(review): these look like values resolved at launch time and captured
# in this dump, not hand-set options — confirm before editing.
n_gpu: 1
device: 'cuda:0'
train_batch_size: 2
eval_batch_size: 2
note: null