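# Training configuration (Hydra/OmegaConf-style `_target_` instantiation) for
# QLoRA fine-tuning of Llama-2-70B on wiki_erica_path v9.1 data with a
# seq2seq-style causal LM objective.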
train_file: wiki_erica_path/v9.1_fixed/distant_path_v9.1_fix_no_shuffle.train.0.pkl_llama_False_3_6_512_0.4_5_1.0_1.0_0.0_8_path_v9.1.2_seq2seq_bin_filter
test_file: null
model:
  _target_: models.llama.LlamaForConditionalGenerationFlan.from_pretrained
  use_peft: true
  lora_config:
    _recursive_: false
    _target_: models.llama.LoraConfig
    task_type: CAUSAL_LM
    inference_mode: false
    target_modules:
      _target_: models.llama.find_all_linear_names
      bits: 4
    r: 64
    lora_alpha: 16
    lora_dropout: 0.05
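    # LoRA adapters with rank 64, alpha 16 and dropout 0.05, attached to every
    # linear module that find_all_linear_names reports for the 4-bit model.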
  gradient_checkpointing: true
  torch_dtype:
    _target_: general_util.training_utils.return_torch_dtype
    dtype: bfloat16
  quantization_config:
    _target_: transformers.utils.quantization_config.BitsAndBytesConfig
    load_in_4bit: true
    bnb_4bit_compute_dtype:
      _target_: general_util.training_utils.return_torch_dtype
      dtype: bfloat16
    bnb_4bit_use_double_quant: true
    bnb_4bit_quant_type: nf4
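    # Standard QLoRA loading recipe via bitsandbytes: 4-bit NF4 weights with
    # double quantization and bfloat16 compute.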
  device_map:
    _target_: models.llama.return_single_device_map
  load_in_4bit: true
  low_cpu_mem_usage: true
  max_memory: true
  use_flash_attention_2: true
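  # Base weights are loaded in 4-bit with FlashAttention-2 and gradient
  # checkpointing; return_single_device_map presumably pins each process to a
  # single device, so every rank holds a full quantized copy and only the LoRA
  # parameters are trained.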
dist_load_data_barrier: false
read_tensor_train:
  _target_: data.collators.wiki.WikiPathDatasetV5.init_from_bin_file
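  # Pre-tokenized training tensors are loaded straight from the serialized
  # bin/pkl file referenced by train_file via WikiPathDatasetV5.init_from_bin_file.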
extended_vocab: null
collator:
  _target_: data.collators.wiki_seq2seq_collator.WikiSeq2SeqCollatorWithCausalLMFixPaddingSide
  max_seq_length: 512
  tokenizer: ${model_name_or_path}
  causal_lm: true
  causal_lm_add_eos: false
  generative_mode: true
  padding_side: right
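# Collator produces causal-LM (generative) examples up to 512 tokens,
# right-padded and without an extra EOS, using the tokenizer loaded from
# ${model_name_or_path}.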
num_workers: 4
prefetch_factor: 2
do_preprocess: false
model_name_or_path: pretrained-models/Llama-2-70b-hf
pretrain: null
exp_name: llama2.70b.q_lora.merit_v91_v91.seq2seq.v5.0.6aug.filter.w4.adamw.500steps.NA100.1010
exp_notes: null
output_dir: experiments/${exp_name}
do_train: true
evaluate_during_training: false
do_eval: true
eval_sub_path: checkpoint-*
per_gpu_train_batch_size: 8
per_gpu_eval_batch_size: 16
learning_rate: 0.0002
gradient_accumulation_steps: 128
weight_decay: 0.0
adam_epsilon: 1.0e-06
adam_betas: (0.9, 0.99)
max_grad_norm: 0.3
num_train_epochs: 1
max_steps: -1
warmup_proportion: 0
warmup_steps: 50
total_dataset_len: 1808348
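# Effective batch per optimizer step: 8 (per GPU) x 128 (accumulation) x 4
# (world_size) = 4096 sequences; one epoch over 1,808,348 examples is therefore
# roughly 1,808,348 / 4096 ≈ 442 optimizer steps, with a 50-step warmup and a
# checkpoint every 100 steps.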
optimizer: null
use_nvlamb: null
bit_training: null
logging_steps: 1
save_best: false
save_steps: 100
eval_steps: -1
ddp_eval: true
no_cuda: false
seed: 42
local_rank: 0
fp16: true
fp16_opt_level: O1
fp16_bfloat16: true
prediction_cfg:
  metric: acc
  measure: 1
  best_checkpoint: null
  best_result: null
eval_forward_fn:
  _target_: general_util.evaluator.DiscriminatorForwardFn
post_process: null
compile: false
fairscale_config: null
fsdp_config: null
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas:
      - 0.9
      - 0.999
      eps: ${adam_epsilon}
      weight_decay: ${weight_decay}
  scheduler:
    type: WarmupLR
    params:
      warmup_max_lr: ${learning_rate}
      warmup_num_steps: null
      warmup_type: linear
  gradient_clipping: ${max_grad_norm}
  bf16:
    enabled: ${fp16}
  zero_optimization:
    stage: 1
    contiguous_gradients: true
    overlap_comm: true
    reduce_scatter: true
    reduce_bucket_size: 500000000.0
    allgather_bucket_size: 500000000.0
  steps_per_print: 1024
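# DeepSpeed: ZeRO stage 1 (optimizer-state partitioning only), AdamW with
# linear warmup, gradient clipping at 0.3, and bf16 gated on the ${fp16} flag
# (with fp16_bfloat16: true the run effectively trains in bfloat16). Note the
# optimizer betas here (0.9, 0.999) differ from the top-level adam_betas
# (0.9, 0.99); since the top-level optimizer is null, the DeepSpeed values
# presumably take effect.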
with_lightseq: false
summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys: null
  outputs_index_or_keys:
    train/mlm_loss: mlm_loss
n_gpu: 1
device: cuda:0
train_batch_size: null
eval_batch_size: null
world_size: 4
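# n_gpu, device, local_rank, train/eval batch sizes and world_size look like
# runtime-populated fields from rank 0 of a 4-GPU run rather than values meant
# to be edited by hand.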