# source blob: 1b37db9 (1,080 bytes) — viewer metadata stripped during cleanup
---
# Training-run configuration (keys listed alphabetically).
# NOTE(review): key semantics are inferred from names — confirm against the
# trainer that consumes the `training_config` path below.
adam_beta1: 0.9
adam_beta2: 0.95
adjust_step: 0
autoresume: false
batch_size: 6  # micro-batch; 6 * gradient_accumulation (13) * 8 = total_batch_size (624) — presumably 8 ranks, verify
clip_grad_norm: 1.0
comment: null
cycle_length: null
dtype: bfloat16
emb_freeze: null
eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/
eval_every: 1000  # presumably in optimizer steps — confirm units
first_attention: false
first_attention_resume: false
first_layer: true  # matches run_name/save_dir below — this run's two-stage flag
gradient_accumulation: 13
keep_checkpoints: null  # null presumably means "keep all" — verify
layer_freeze: null
layer_freeze_2: false
load_optimizer_state_on_resume: true
lr: 0.0004
max_length: 1024
max_train_tokens: null
min_lr_ratio: 0.1  # assumed: floor of cosine schedule = lr * 0.1 — confirm
model_config: model_config/478m.json
model_name_or_path: null
model_revision: null
num_training_steps: 15000
optimizer: Adam
restart_warmup_steps: null
resume_from: null
run_name: first_layer
save_dir: checkpoints/first_layer
save_every: 1000
scheduler: cosine
seed: 0
shuffle: true
skip_batches: !!set {}  # YAML !!set tag: loads as an empty set (not a mapping)
tags:
- 396m-for-680m
total_batch_size: 624
train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/
training_config: training_config/two_stage/478m_first_layer.yaml
wandb_watch: true
warmed_up_model: null
warmup_steps: 1500
weight_decay: 0.0
workers: 8