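# Hydra-style training configuration (keys with `_target_` are instantiated at
# runtime) for QLoRA fine-tuning of Falcon-40B on the merit_v91 wiki entity
# path seq2seq data; the run is launched on AWS SageMaker and artifacts are
# presumably synced to the S3 bucket below.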
aws_output_bucket: s3://sagemaker-us-east-1-107457652907/experiments/falcon.40b.q_lora.merit_v91_v91.seq2seq.v5.0.3aug.w16.adamw.500steps.NA100.0528.aws
train_file: /opt/ml/input/data/train/distant_path_v9.1_fix_no_shuffle.train.0.pkl
test_file: null
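# Model: Falcon-40B loaded through the project-local models.rw wrapper with PEFT
# enabled. LoRA (r=64, alpha=16, dropout=0.05) is applied to the linear layers
# discovered by find_all_linear_names, and the base weights are loaded in 4-bit
# NF4 with bfloat16 compute via bitsandbytes (the QLoRA recipe).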
model:
  _target_: models.rw.RWForConditionalGenerationFlan.from_pretrained
  pad_token_id: 11
  use_peft: true
  lora_config:
    _recursive_: false
    _target_: models.rw.LoraConfig
    task_type: CAUSAL_LM
    inference_mode: false
    target_modules:
      _target_: models.rw.find_all_linear_names
      bits: 4
    r: 64
    lora_alpha: 16
    lora_dropout: 0.05
  gradient_checkpointing: true
  torch_dtype:
    _target_: general_util.training_utils.return_torch_dtype
    dtype: bfloat16
  quantization_config:
    _target_: transformers.utils.quantization_config.BitsAndBytesConfig
    load_in_4bit: true
    bnb_4bit_compute_dtype:
      _target_: general_util.training_utils.return_torch_dtype
      dtype: bfloat16
    bnb_4bit_use_double_quant: true
    bnb_4bit_quant_type: nf4
  device_map:
    _target_: models.rw.return_single_device_map
  load_in_4bit: true
  max_memory: true
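# Training data: wiki entity path examples converted into seq2seq features with
# 3 negatives and 3 augmentations per instance, 512-token sequences, and
# context shuffling enabled; feature conversion uses 128 workers.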
read_tensor_train:
  _target_: data.wiki_entity_path_v9_1_2.convert_examples_into_features_seq2seq
  max_neg_num: 3
  aug_num: 3
  max_seq_length: 512
  shuffle_context: true
  min_rep_num: 5
  geo_p: 0.4
  deduct_ratio: 1.0
  context_ratio: 1.0
  noise_sent_ratio: 0.0
  num_workers: 128
extended_vocab: null
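# Collator: wiki seq2seq batches are built in causal-LM/generative mode (no
# extra EOS appended); the tokenizer is resolved from ${model_name_or_path} at
# runtime. num_workers/prefetch_factor below are presumably DataLoader settings.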
collator:
  _target_: data.collators.wiki_seq2seq_collator.WikiSeq2SeqCollatorWithCausalLM
  max_seq_length: 512
  tokenizer: ${model_name_or_path}
  causal_lm: true
  causal_lm_add_eos: false
  generative_mode: true
num_workers: 4
prefetch_factor: 2
do_preprocess: false
model_name_or_path: /tmp/falcon-40b
pretrain: null
exp_name: falcon.40b.q_lora.merit_v91_v91.seq2seq.v5.0.3aug.w16.adamw.500steps.NA100.0528.aws
exp_notes: null
output_dir: /tmp/${exp_name}
do_train: true
evaluate_during_training: false
do_eval: true
eval_sub_path: checkpoint-*
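# Optimization: per-GPU micro-batch of 16 with 16 gradient-accumulation steps,
# i.e. 256 sequences per rank per optimizer step (4096 globally at
# world_size 16); lr 5e-4 with 50 warmup steps, grad-norm clipping at 0.3, and
# a single training epoch with checkpoints saved every 100 steps.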
per_gpu_train_batch_size: 16
per_gpu_eval_batch_size: 8
learning_rate: 0.0005
gradient_accumulation_steps: 16
weight_decay: 0.0
adam_epsilon: 1.0e-06
adam_betas: (0.9, 0.99)
max_grad_norm: 0.3
num_train_epochs: 1
max_steps: -1
warmup_proportion: 0
warmup_steps: 50
optimizer: null
use_nvlamb: null
bit_training: null
logging_steps: 1
save_best: false
save_steps: 100
eval_steps: -1
ddp_eval: true
no_cuda: false
seed: 42
local_rank: 0
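# Precision: fp16 together with fp16_bfloat16 presumably selects bf16 mixed
# precision rather than fp16; the DeepSpeed bf16 section below is toggled from
# the same ${fp16} flag.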
fp16: true
fp16_opt_level: O1
fp16_bfloat16: true
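# Evaluation: accuracy is tracked as the selection metric and eval outputs are
# routed through DiscriminatorForwardFn; no post-processing is applied.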
prediction_cfg:
  metric: acc
  measure: 1
  best_checkpoint: null
  best_result: null
eval_forward_fn:
  _target_: general_util.evaluator.DiscriminatorForwardFn
post_process: null
compile: false
fairscale_config: null
fsdp_config: null
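# DeepSpeed config: ZeRO stage 1 with AdamW and a linear WarmupLR schedule.
# Note that the betas here (0.9, 0.999) differ from the top-level adam_betas
# (0.9, 0.99); when the engine is built from ds_cfg, these values are
# presumably the ones in effect.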
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas:
        - 0.9
        - 0.999
      eps: ${adam_epsilon}
      weight_decay: ${weight_decay}
  scheduler:
    type: WarmupLR
    params:
      warmup_max_lr: ${learning_rate}
      warmup_num_steps: 50
      warmup_type: linear
  gradient_clipping: ${max_grad_norm}
  bf16:
    enabled: ${fp16}
  zero_optimization:
    stage: 1
    contiguous_gradients: true
    overlap_comm: true
    reduce_scatter: true
    reduce_bucket_size: 500000000.0
    allgather_bucket_size: 500000000.0
  steps_per_print: 1024
with_lightseq: false
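# Logging: metrics go to Weights & Biases; the model's mlm_loss output is
# reported as train/mlm_loss.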
summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys: null
  outputs_index_or_keys:
    train/mlm_loss: mlm_loss
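# The remaining fields (n_gpu, device, train_batch_size, eval_batch_size,
# world_size) look like values recorded at runtime by the launcher rather than
# user-set options.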
n_gpu: 1
device: cuda:0
train_batch_size: 16
eval_batch_size: null
world_size: 16