train_file: wiki_erica_path/v7/union/train_distant.path_v7.train.0.pkl
dev_file: wiki_erica_path/v7/union/train_distant.path_v7.dev.pkl
test_file: null
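# Hydra-style instantiation: each `_target_` below names the callable that is
# presumably instantiated at runtime, with the sibling keys passed to it as
# keyword arguments.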
model:
  _target_: models.deberta.DebertaV2ForMultipleChoicePreTrain.from_pretrained
  mlp_hidden_size: 3072
  fs_checkpoint: false
  fs_checkpoint_offload_to_cpu: false
read_tensor:
  _target_: dataset.wiki_entity_path_v8_2.convert_examples_into_features
  max_neg_num: 3
  aug_num: 1
  max_seq_length: 256
  shuffle_context: true
  min_rep_num: 5
  geo_p: 0.4
  deduct_ratio: 1.0
  context_ratio: 1.0
  num_workers: 32
extended_vocab: null
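# The collator presumably applies BERT-style masked-language-model masking on
# top of the multiple-choice inputs; mlm_probability of 0.15 is the standard
# masking rate.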
collator:
  _target_: dataset.wiki_entity_path_v8.WikiPathDatasetCollatorWithContext
  max_seq_length: 256
  tokenizer: pretrained-models/deberta-v2-xxlarge
  mlm_probability: 0.15
  max_option_num: 4
  swap: true
num_workers: 4
prefetch_factor: 4
model_name_or_path: pretrained-models/deberta-v2-xxlarge
pretrain: null
output_dir: experiments/deberta.v2.xxlarge.path.v7_v8.2.2.1aug.ctx.A100.v1.3.w4.s${seed}.fsdp.adamw
do_train: true
evaluate_during_training: true
do_eval: false
eval_sub_path: null
do_preprocess: false
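# Effective global batch size per optimizer step (assuming the usual formula):
#   per_gpu_train_batch_size * gradient_accumulation_steps * n_gpu
#   = 2 * 512 * 1 = 1024 examples.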
per_gpu_train_batch_size: 2
per_gpu_eval_batch_size: 2
learning_rate: 1.0e-05
gradient_accumulation_steps: 512
weight_decay: 0.01
adam_epsilon: 1.0e-06
adam_betas: (0.9, 0.999)
max_grad_norm: 1.0
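# Assumed HF-style semantics: max_steps > 0 caps training regardless of
# num_train_epochs. With warmup_steps at 0, the warmup length presumably
# comes from warmup_proportion: 0.2 * 200 max_steps = 40 optimizer steps.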
num_train_epochs: 1
max_steps: 200
warmup_proportion: 0.2
warmup_steps: 0
optimizer: null
use_nvlamb: null
bit_training: null
multi_tensor: null
logging_steps: 1
save_steps: 50
eval_steps: 50
no_cuda: false
seed: 42
local_rank: 0
fp16: true
fp16_opt_level: O2
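# DeepSpeed config block. `${...}` values are OmegaConf interpolations
# resolved from the top-level keys above; the null `total_num_steps` and
# `warmup_num_steps` are presumably filled in at runtime from max_steps and
# the warmup settings. ZeRO stage 3 shards parameters, gradients, and
# optimizer states across ranks.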
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas:
      - 0.9
      - 0.999
      eps: ${adam_epsilon}
      weight_decay: ${weight_decay}
  scheduler:
    type: WarmupDecayLR
    params:
      total_num_steps: null
      warmup_max_lr: ${learning_rate}
      warmup_num_steps: null
      warmup_type: linear
  gradient_clipping: ${max_grad_norm}
  fp16:
    enabled: ${fp16}
    initial_scale_power: 12
  zero_optimization:
    stage: 3
  steps_per_print: 1024
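# The four flags below match FairScale FSDP options (consistent with "fsdp"
# in output_dir). With reshard_after_forward disabled and no CPU offload,
# gathered parameters presumably stay on GPU between forward and backward.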
reshard_after_forward: false
flatten_parameters: true
move_grads_to_cpu: false
move_params_to_cpu: false
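# Remaining fields are presumably populated at runtime by the launcher
# (device placement and resolved per-process batch sizes).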
n_gpu: 1
device: cuda:0
train_batch_size: 2
eval_batch_size: 2
note: null