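# Hydra-style training config (_target_ instantiation, ${...} interpolation) for
# training Llama-2-13b-chat-hf on a combined GitHub-code + C4 mixture, run with DeepSpeed.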
aws_output_bucket: s3://panda-us-west-2/experiments/Llama-2-13b-chat-hf-code-github-c4_v1.pp8.dp2.0822.aws
data_dir: null
dist_load_data_barrier: false
train_file: /tmp/data-train-code-c4/code/
dev_file: null
test_file: null
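# Model: a custom LlamaForConditionalGeneration, presumably loaded from ${model_name_or_path},
# with the vocab extended to 79458 tokens, gradient checkpointing and flash attention on, no PEFT.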
model:
  _target_: models.llama.LlamaForConditionalGeneration.from_pretrained
  vocab_size: 79458
  use_peft: false
  gradient_checkpointing: true
  enable_flash_attention: true
  flash_attention_vanilla_torch: true
  pad_token_id: 2
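# Data: a GitHub-code text dataset combined with C4; tokenizer: null is presumably filled
# in at runtime, and file_num likely limits how many C4 files are read.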
read_tensor:
  _target_: data.collators.zh_instruct.TextDatasetCombineV2_CodeGithub
  extra_data:
    _target_: data.collators.zh_instruct.C4CombinedDataset
    tokenizer: null
    file_path: /tmp/data-train-code-c4/c4/
    file_num: 50
extended_vocab: null
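# Collator: decoder-only batches, max sequence length 2048, "longest" padding on the right,
# tokenizer taken from ${model_name_or_path}.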
collator:
  _target_: data.collators.flan.CombineCollator
  max_seq_length: 2048
  tokenizer: ${model_name_or_path}
  decoder_only: true
  padding: longest
  padding_side: right
num_workers: 4
prefetch_factor: 2
do_preprocess: false
model_name_or_path: /tmp/Llama-2-13b-chat-hf-code-github-c4/
pretrain: null
exp_name: llama2.13b.Code.Github.C4.combine.v1.0.seq2k.w16.adamw.NA100.0822.aws.ds
exp_notes: null
output_dir: /tmp/${exp_name}
resume: null
do_train: true
evaluate_during_training: false
do_eval: false
eval_sub_path: checkpoint-*
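# Optimization: with per_gpu_train_batch_size=2, gradient_accumulation_steps=64, and
# world_size=16, the effective global batch is 2 * 64 * 16 = 2048 sequences per optimizer step.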
per_gpu_train_batch_size: 2
per_gpu_eval_batch_size: 1
learning_rate: 1.0e-05
gradient_accumulation_steps: 64
weight_decay: 0.01
adam_epsilon: 1.0e-06
adam_betas: (0.9, 0.99)
max_grad_norm: 1.0
num_train_epochs: 1
total_dataset_len: 10000000
max_steps: 0
warmup_proportion: 0
warmup_steps: 0
optimizer: null
use_nvlamb: null
bit_training: null
logging_steps: 1
save_best: false
save_steps: 250
eval_steps: 250
ddp_eval: true
no_cuda: false
seed: 42
local_rank: 0
fp16: true
fp16_opt_level: O1
fp16_bfloat16: true
prediction_cfg:
  metric: acc
  measure: 1
  best_checkpoint: null
  best_result: null
eval_forward_fn:
  _target_: general_util.evaluator.DiscriminatorForwardFn
post_process: null
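# FairScale/FSDP initialization: fp16 follows the top-level flag; parameters and gradients
# stay on GPU and parameter flattening is disabled.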
fairscale_config:
  _target_: general_util.fsdp_utils.default_initialize
  fp16: ${fp16}
  move_grads_to_cpu: false
  move_params_to_cpu: false
  flatten_parameters: false
with_lightseq: false
load_lr_scheduler_states: false
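# DeepSpeed: ZeRO stage 1 with CPU optimizer offload, bf16 gated on ${fp16} (fp16_bfloat16: true
# above), AdamW with betas (0.9, 0.96), which differ from the top-level adam_betas, and a
# WarmupDecayLR schedule over 4882 total steps with no warmup.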
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas:
      - 0.9
      - 0.96
      eps: ${adam_epsilon}
      weight_decay: ${weight_decay}
  scheduler:
    type: WarmupDecayLR
    params:
      total_num_steps: 4882
      warmup_max_lr: ${learning_rate}
      warmup_num_steps: 0
      warmup_type: linear
  gradient_clipping: ${max_grad_norm}
  bf16:
    enabled: ${fp16}
  zero_optimization:
    stage: 1
    contiguous_gradients: true
    overlap_comm: true
    reduce_scatter: true
    reduce_bucket_size: 500000000.0
    allgather_bucket_size: 500000000.0
    offload_optimizer:
      device: cpu
      pin_memory: true
  steps_per_print: 1
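# Logging: training metrics go to Weights & Biases via the WandbWriter helper.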
summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys: null
  outputs_index_or_keys: null
n_gpu: 1
device: cuda:0
train_batch_size: 2
eval_batch_size: null
world_size: 16
world_rank: null