adam_beta1: 0.9
adam_beta2: 0.95
adjust_step: 0
autoresume: false
batch_size: 6
clip_grad_norm: 1.0
comment: null
cycle_length: null
dtype: bfloat16
emb_freeze: null
eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/
eval_every: 1000
first_attention: null
gradient_accumulation: 13
keep_checkpoints: null
layer_freeze: null
load_optimizer_state_on_resume: true
lr: 0.0004
max_length: 1024
max_train_tokens: null
min_lr_ratio: 0.1
model_config: model_config/478m.json
model_name_or_path: null
model_revision: null
num_training_steps: 15000
optimizer: Adam
restart_warmup_steps: null
resume_from: null
run_name: silver-butterfly-62
save_dir: checkpoints/silver-butterfly-62
save_every: 1000
scheduler: cosine
seed: 0
shuffle: true
skip_batches: !!set {}
tags:
- 396m-for-680m
total_batch_size: 624
train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/
training_config: training_config/478m.yaml
wandb_watch: true
warmed_up_model: null
warmup_steps: 1500
weight_decay: 0.0
workers: 8
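
A minimal sketch, assuming a standard PyTorch setup, of how the optimizer and learning-rate schedule implied by this config (Adam with betas 0.9/0.95, lr 4e-4, linear warmup for 1500 steps, cosine decay to min_lr_ratio * lr over 15000 steps) could be assembled; the stand-in model, variable names, and training-loop wiring are assumptions, not the project's actual code. Note also that total_batch_size (624) factors as batch_size (6) * gradient_accumulation (13) * 8, which presumably corresponds to 8 data-parallel ranks.

import math
import torch

# Values taken from the config above.
lr = 0.0004
warmup_steps = 1500
num_training_steps = 15000
min_lr_ratio = 0.1

model = torch.nn.Linear(1024, 1024)  # hypothetical stand-in for the 478M model
optimizer = torch.optim.Adam(
    model.parameters(), lr=lr, betas=(0.9, 0.95), weight_decay=0.0
)

def lr_lambda(step: int) -> float:
    # Linear warmup to the peak lr, then cosine decay to min_lr_ratio * lr.
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    progress = (step - warmup_steps) / max(1, num_training_steps - warmup_steps)
    cosine = 0.5 * (1.0 + math.cos(math.pi * min(1.0, progress)))
    return min_lr_ratio + (1.0 - min_lr_ratio) * cosine

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

In a training loop this would be stepped once per optimizer update (i.e. after every gradient_accumulation micro-batches), with gradients clipped to clip_grad_norm before optimizer.step().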