---
train_file: wiki_erica_path/v9.1_fixed/distant_path_v9.1_fix_no_shuffle.train.0.pkl_llama_False_3_6_512_0.4_5_1.0_1.0_0.0_8_path_v9.1.2_seq2seq_bin_filter
test_file: null
model:
_target_: models.llama.LlamaForConditionalGenerationFlan.from_pretrained
use_peft: true
lora_config:
_recursive_: false
_target_: models.llama.LoraConfig
task_type: CAUSAL_LM
inference_mode: false
target_modules:
_target_: models.llama.find_all_linear_names
bits: 4
r: 64
lora_alpha: 16
lora_dropout: 0.05
gradient_checkpointing: true
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
quantization_config:
_target_: transformers.utils.quantization_config.BitsAndBytesConfig
load_in_4bit: true
bnb_4bit_compute_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
bnb_4bit_use_double_quant: true
bnb_4bit_quant_type: nf4
device_map:
_target_: models.llama.return_single_device_map
load_in_4bit: true
low_cpu_mem_usage: true
max_memory: true
use_flash_attention_2: true
dist_load_data_barrier: false
read_tensor_train:
_target_: data.collators.wiki.WikiPathDatasetV5.init_from_bin_file
extended_vocab: null
collator:
_target_: data.collators.wiki_seq2seq_collator.WikiSeq2SeqCollatorWithCausalLMFixPaddingSide
max_seq_length: 512
tokenizer: ${model_name_or_path}
causal_lm: true
causal_lm_add_eos: false
generative_mode: true
padding_side: right
num_workers: 4
prefetch_factor: 2
do_preprocess: false
model_name_or_path: pretrained-models/Llama-2-70b-hf
pretrain: null
exp_name: llama2.70b.q_lora.merit_v91_v91.seq2seq.v5.0.6aug.filter.w4.adamw.500steps.NA100.1010
exp_notes: null
output_dir: experiments/${exp_name}
do_train: true
evaluate_during_training: false
do_eval: true
eval_sub_path: checkpoint-*
per_gpu_train_batch_size: 8
per_gpu_eval_batch_size: 16
learning_rate: 0.0002
gradient_accumulation_steps: 128
weight_decay: 0.0
adam_epsilon: 1.0e-06
adam_betas: "(0.9, 0.99)"
max_grad_norm: 0.3
num_train_epochs: 1
max_steps: -1
warmup_proportion: 0
warmup_steps: 50
total_dataset_len: 1808348
optimizer: null
use_nvlamb: null
bit_training: null
logging_steps: 1
save_best: false
save_steps: 100
eval_steps: -1
ddp_eval: true
no_cuda: false
seed: 42
local_rank: 0
fp16: true
fp16_opt_level: O1
fp16_bfloat16: true
prediction_cfg:
metric: acc
measure: 1
best_checkpoint: null
best_result: null
eval_forward_fn:
_target_: general_util.evaluator.DiscriminatorForwardFn
post_process: null
compile: false
fairscale_config: null
fsdp_config: null
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas:
- 0.9
- 0.999
eps: ${adam_epsilon}
weight_decay: ${weight_decay}
scheduler:
type: WarmupLR
params:
warmup_max_lr: ${learning_rate}
warmup_num_steps: 50
warmup_type: linear
gradient_clipping: ${max_grad_norm}
bf16:
enabled: ${fp16}
zero_optimization:
stage: 1
contiguous_gradients: true
overlap_comm: true
reduce_scatter: true
reduce_bucket_size: 500000000.0
allgather_bucket_size: 500000000.0
steps_per_print: 1024
with_lightseq: false
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys: null
outputs_index_or_keys:
train/mlm_loss: mlm_loss
n_gpu: 1
device: cuda:0
train_batch_size: 8
eval_batch_size: null
world_size: 4