# falcon.40b.q_lora.merit_v91_v91.seq2seq.v5.0.3aug.w16.adamw.500steps.NA100.0528/checkpoint-300/training_config.yaml
aws_output_bucket: s3://sagemaker-us-east-1-107457652907/experiments/falcon.40b.q_lora.merit_v91_v91.seq2seq.v5.0.3aug.w16.adamw.500steps.NA100.0528.aws
train_file: /opt/ml/input/data/train/distant_path_v9.1_fix_no_shuffle.train.0.pkl
test_file: null
model:
  _target_: models.rw.RWForConditionalGenerationFlan.from_pretrained
  pad_token_id: 11
  use_peft: true
  lora_config:
    _recursive_: false
    _target_: models.rw.LoraConfig
    task_type: CAUSAL_LM
    inference_mode: false
    target_modules:
      _target_: models.rw.find_all_linear_names
      bits: 4
    r: 64
    lora_alpha: 16
    lora_dropout: 0.05
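    # QLoRA adapter settings: rank-64 LoRA with alpha 16 and dropout 0.05, attached to the
    # linear layers returned by find_all_linear_names for a 4-bit model (reading based on
    # the parameter names; the helper's exact behavior is an assumption, not verified here).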
  gradient_checkpointing: true
  torch_dtype:
    _target_: general_util.training_utils.return_torch_dtype
    dtype: bfloat16
  quantization_config:
    _target_: transformers.utils.quantization_config.BitsAndBytesConfig
    load_in_4bit: true
    bnb_4bit_compute_dtype:
      _target_: general_util.training_utils.return_torch_dtype
      dtype: bfloat16
    bnb_4bit_use_double_quant: true
    bnb_4bit_quant_type: nf4
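    # 4-bit NF4 quantization with double quantization and bfloat16 compute, i.e. the usual
    # QLoRA loading recipe: the frozen base weights stay quantized while the adapters above
    # are trained in higher precision.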
  device_map:
    _target_: models.rw.return_single_device_map
  load_in_4bit: true
  max_memory: true
read_tensor_train:
  _target_: data.wiki_entity_path_v9_1_2.convert_examples_into_features_seq2seq
  max_neg_num: 3
  aug_num: 3
  max_seq_length: 512
  shuffle_context: true
  min_rep_num: 5
  geo_p: 0.4
  deduct_ratio: 1.0
  context_ratio: 1.0
  noise_sent_ratio: 0.0
  num_workers: 128
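  # Feature construction from the distant entity-path data: up to 3 negatives and 3
  # augmented views per example, 512-token sequences, with shuffled context sentences;
  # the geo_p/deduct_ratio/context_ratio/noise_sent_ratio knobs presumably control the
  # sampling mix inside the converter (inferred from the names, not from its source).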
extended_vocab: null
collator:
  _target_: data.collators.wiki_seq2seq_collator.WikiSeq2SeqCollatorWithCausalLM
  max_seq_length: 512
  tokenizer: ${model_name_or_path}
  causal_lm: true
  causal_lm_add_eos: false
  generative_mode: true
num_workers: 4
prefetch_factor: 2
do_preprocess: false
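# The collator renders each seq2seq pair as a single causal-LM sequence (causal_lm: true,
# no trailing EOS on the target) using the base model's tokenizer; num_workers and
# prefetch_factor here are ordinary PyTorch DataLoader arguments.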
model_name_or_path: /tmp/falcon-40b
pretrain: null
exp_name: falcon.40b.q_lora.merit_v91_v91.seq2seq.v5.0.3aug.w16.adamw.500steps.NA100.0528.aws
exp_notes: null
output_dir: /tmp/${exp_name}
do_train: true
evaluate_during_training: false
do_eval: true
eval_sub_path: checkpoint-*
per_gpu_train_batch_size: 16
per_gpu_eval_batch_size: 8
learning_rate: 0.0005
gradient_accumulation_steps: 16
weight_decay: 0.0
adam_epsilon: 1.0e-06
adam_betas: (0.9, 0.99)
max_grad_norm: 0.3
num_train_epochs: 1
max_steps: -1
warmup_proportion: 0
warmup_steps: 50
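# Effective global batch size: 16 per-GPU micro-batch * 16 gradient-accumulation steps
# * 16 ranks (world_size below) = 4096 sequences per optimizer step, trained at a peak LR
# of 5e-4 with a 50-step warmup; the experiment name suggests roughly 500 optimizer steps
# over the single epoch.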
optimizer: null
use_nvlamb: null
bit_training: null
logging_steps: 1
save_best: false
save_steps: 100
eval_steps: -1
ddp_eval: true
no_cuda: false
seed: 42
local_rank: 0
fp16: true
fp16_opt_level: O1
fp16_bfloat16: true
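# fp16: true together with fp16_bfloat16: true appears to select bfloat16 mixed precision;
# the same fp16 flag is interpolated into ds_cfg.bf16.enabled below, so DeepSpeed likewise
# runs in bf16 despite the flag's name.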
prediction_cfg:
  metric: acc
  measure: 1
  best_checkpoint: null
  best_result: null
eval_forward_fn:
  _target_: general_util.evaluator.DiscriminatorForwardFn
post_process: null
compile: false
fairscale_config: null
fsdp_config: null
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas:
        - 0.9
        - 0.999
      eps: ${adam_epsilon}
      weight_decay: ${weight_decay}
  scheduler:
    type: WarmupLR
    params:
      warmup_max_lr: ${learning_rate}
      warmup_num_steps: 50
      warmup_type: linear
  gradient_clipping: ${max_grad_norm}
  bf16:
    enabled: ${fp16}
  zero_optimization:
    stage: 1
    contiguous_gradients: true
    overlap_comm: true
    reduce_scatter: true
    reduce_bucket_size: 500000000.0
    allgather_bucket_size: 500000000.0
  steps_per_print: 1024
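  # The DeepSpeed config mirrors the top-level hyperparameters via ${...} interpolation and
  # uses ZeRO stage 1 (optimizer-state sharding only) with bf16 and a linear 50-step
  # WarmupLR schedule. Its AdamW betas are (0.9, 0.999) while the top-level adam_betas reads
  # (0.9, 0.99); when DeepSpeed is active its own values are presumably the ones applied.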
with_lightseq: false
summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys: null
  outputs_index_or_keys:
    train/mlm_loss: mlm_loss
n_gpu: 1
device: cuda:0
train_batch_size: 16
eval_batch_size: null
world_size: 16
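# n_gpu, device, local_rank, train_batch_size and world_size are runtime fields filled in
# per process at launch; this dump comes from rank 0 of what appears to be a 16-process job.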