adam_beta1: 0.9
adam_beta2: 0.95
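# Adam betas; beta2 = 0.95 (instead of the usual 0.999 default) is a common choice for LLM pretraining.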
adjust_step: 0
autoresume: false
batch_size: 6
clip_grad_norm: 1.0
comment: null
cycle_length: null
dtype: bfloat16
emb_freeze: null
eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/
eval_every: 1000
first_attention: false
first_attention_resume: true
gradient_accumulation: 13
keep_checkpoints: null
layer_freeze: null
layer_freeze_2: false
load_optimizer_state_on_resume: true
lr: 0.0004
max_length: 1024
max_train_tokens: null
min_lr_ratio: 0.1
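# Presumably the floor of the cosine schedule below: lr decays to min_lr_ratio * lr = 0.1 * 4e-4 = 4e-5.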
model_config: model_config/478m.json
model_name_or_path: null
model_revision: null
num_training_steps: 15000
optimizer: Adam
restart_warmup_steps: null
resume_from: null
run_name: first_attention_resume_unfreeze
save_dir: checkpoints/first_attention_resume_unfreeze
save_every: 1000
scheduler: cosine
seed: 0
shuffle: true
skip_batches: !!set {}
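# "!!set {}" is YAML's explicit tag for an empty set, i.e. no batches are skipped (presumably only relevant when resuming).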
tags:
- 396m-for-680m
total_batch_size: 624
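# 624 = batch_size (6) * gradient_accumulation (13) * 8; the remaining factor of 8 is
# likely the data-parallel world size, which is not stated explicitly in this file.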
train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/
training_config: training_config/two_stage/478m_first_attention_resume_unfreeze.yaml
wandb_watch: true
warmed_up_model: null
warmup_steps: 1500
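# Warmup covers 1500 / 15000 = 10% of num_training_steps.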
weight_decay: 0.0
workers: 8