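# Hydra-style training configuration: QLoRA fine-tuning of Falcon-40B on the
# wiki entity-path seq2seq data, launched with DeepSpeed (the /opt/ml paths and
# S3 output bucket indicate an AWS SageMaker job).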
aws_output_bucket: s3://sagemaker-us-east-1-107457652907/experiments/falcon.40b.q_lora.merit_v91_v91.seq2seq.v5.0.3aug.w16.adamw.500steps.NA100.0528.aws
train_file: /opt/ml/input/data/train/distant_path_v9.1_fix_no_shuffle.train.0.pkl
test_file: null
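# Model: Falcon (RW architecture) loaded through a custom conditional-generation
# wrapper, quantized to 4 bit and wrapped with PEFT/LoRA adapters.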
model:
  _target_: models.rw.RWForConditionalGenerationFlan.from_pretrained
  pad_token_id: 11
  use_peft: true
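  # LoRA adapters (rank 64, alpha 16, dropout 0.05) attached to every linear
  # layer that find_all_linear_names discovers in the 4-bit model.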
  lora_config:
    _recursive_: false
    _target_: models.rw.LoraConfig
    task_type: CAUSAL_LM
    inference_mode: false
    target_modules:
      _target_: models.rw.find_all_linear_names
      bits: 4
    r: 64
    lora_alpha: 16
    lora_dropout: 0.05
  gradient_checkpointing: true
  torch_dtype:
    _target_: general_util.training_utils.return_torch_dtype
    dtype: bfloat16
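  # bitsandbytes 4-bit NF4 quantization with double quantization; compute dtype
  # is bfloat16 (the standard QLoRA recipe).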
  quantization_config:
    _target_: transformers.utils.quantization_config.BitsAndBytesConfig
    load_in_4bit: true
    bnb_4bit_compute_dtype:
      _target_: general_util.training_utils.return_torch_dtype
      dtype: bfloat16
    bnb_4bit_use_double_quant: true
    bnb_4bit_quant_type: nf4
  device_map:
    _target_: models.rw.return_single_device_map
  load_in_4bit: true
  max_memory: true
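# Training-set feature conversion: wiki entity-path examples turned into
# seq2seq features (up to 3 negatives and 3 augmented copies per example,
# 512-token sequences).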
read_tensor_train:
  _target_: data.wiki_entity_path_v9_1_2.convert_examples_into_features_seq2seq
  max_neg_num: 3
  aug_num: 3
  max_seq_length: 512
  shuffle_context: true
  min_rep_num: 5
  geo_p: 0.4
  deduct_ratio: 1.0
  context_ratio: 1.0
  noise_sent_ratio: 0.0
  num_workers: 128
extended_vocab: null
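# Collator packs the seq2seq features into causal-LM batches (generative mode,
# no EOS appended to targets); the tokenizer is loaded from model_name_or_path.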
collator:
  _target_: data.collators.wiki_seq2seq_collator.WikiSeq2SeqCollatorWithCausalLM
  max_seq_length: 512
  tokenizer: ${model_name_or_path}
  causal_lm: true
  causal_lm_add_eos: false
  generative_mode: true
num_workers: 4
prefetch_factor: 2
do_preprocess: false
model_name_or_path: /tmp/falcon-40b
pretrain: null
exp_name: falcon.40b.q_lora.merit_v91_v91.seq2seq.v5.0.3aug.w16.adamw.500steps.NA100.0528.aws
exp_notes: null
output_dir: /tmp/${exp_name}
do_train: true
evaluate_during_training: false
do_eval: true
eval_sub_path: checkpoint-*
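# Optimization: per-device batch 16 with 16 gradient-accumulation steps; with
# world_size 16 the effective global batch works out to 16 * 16 * 16 = 4096
# sequences per update. LR 5e-4, AdamW, 50 warmup steps, grad-norm clip 0.3.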
per_gpu_train_batch_size: 16
per_gpu_eval_batch_size: 8
learning_rate: 0.0005
gradient_accumulation_steps: 16
weight_decay: 0.0
adam_epsilon: 1.0e-06
adam_betas: (0.9, 0.99)
max_grad_norm: 0.3
num_train_epochs: 1
max_steps: -1
warmup_proportion: 0
warmup_steps: 50
optimizer: null
use_nvlamb: null
bit_training: null
logging_steps: 1
save_best: false
save_steps: 100
eval_steps: -1
ddp_eval: true
no_cuda: false
seed: 42
local_rank: 0
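# Mixed precision: fp16 is set, but fp16_bfloat16 and the DeepSpeed bf16 block
# below (enabled via ${fp16}) suggest training runs in bfloat16 rather than fp16.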
fp16: true
fp16_opt_level: O1
fp16_bfloat16: true
prediction_cfg:
  metric: acc
  measure: 1
  best_checkpoint: null
  best_result: null
eval_forward_fn:
  _target_: general_util.evaluator.DiscriminatorForwardFn
post_process: null
compile: false
fairscale_config: null
fsdp_config: null
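# DeepSpeed: ZeRO stage 1 with AdamW and a linear WarmupLR schedule; the
# micro-batch size, accumulation steps, LR, and gradient clipping are
# interpolated from the top-level flags above.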
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas:
      - 0.9
      - 0.999
      eps: ${adam_epsilon}
      weight_decay: ${weight_decay}
  scheduler:
    type: WarmupLR
    params:
      warmup_max_lr: ${learning_rate}
      warmup_num_steps: 50
      warmup_type: linear
  gradient_clipping: ${max_grad_norm}
  bf16:
    enabled: ${fp16}
  zero_optimization:
    stage: 1
    contiguous_gradients: true
    overlap_comm: true
    reduce_scatter: true
    reduce_bucket_size: 500000000.0
    allgather_bucket_size: 500000000.0
  steps_per_print: 1024
with_lightseq: false
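# Logging: metrics go to Weights & Biases via WandbWriter; the model's mlm_loss
# output is tracked as train/mlm_loss.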
summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys: null
  outputs_index_or_keys:
    train/mlm_loss: mlm_loss
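# Runtime/distributed fields: one GPU per process (cuda:0), 16 processes total.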
n_gpu: 1
device: cuda:0
train_batch_size: 16
eval_batch_size: null
world_size: 16