accum-freq: 4
beta1: 0.9
beta2: 0.95
data-key: "json"
dataset-resampled: True
# delete-previous-checkpoint: False
# Total 25B * 40 = 1T tokens
epochs: 40
fsdp: True
fsdp-limit-all-gathers: True
# grad-checkpointing: False
grad-clip-norm: 1
log-every-n-steps: 20
model: "open_lm_7b"
name: "sample_7b"
precision: "amp_bfloat16"
report-to: "wandb"
seed: 124
train-data-mix-weights: [0.725, 0.275]
train-data: ["TODO"]
train-num-samples: 25_000_000_000
wandb-project-name: "lm1"
workers: 4
logs: /opt/ml/checkpoints/
# Some important parameters, double-checked with Mitchell:
batch-size: 16
ffn-type: swiglu
# fsdp-amp: False
fsdp-pure-bf16: True
fsdp-backward-prefetch: True
lr: 3.e-4
lr-cooldown-end: 3.e-5
model-norm: "gain_only_lp_layer_norm"
qk-norm: True
warmup: 5000
wd: 0.1
z-loss-coefficient: 1.e-4
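
The keys above follow the hyphenated naming of open_lm-style command-line flags, so a config like this is typically flattened into an argv list for the trainer. The following is a minimal sketch of that conversion; the exact flag semantics are assumptions (booleans treated as store_true-style switches, lists expanded as nargs="+" values, and the filename sample_7b.yaml is hypothetical), not the repo's actual launcher.

# config_to_argv.py -- hypothetical helper, not part of the repo
import yaml  # pip install pyyaml

def config_to_argv(path: str) -> list[str]:
    """Flatten a YAML config into a flat list of CLI arguments."""
    with open(path) as f:
        cfg = yaml.safe_load(f)
    argv = []
    for key, value in cfg.items():
        flag = f"--{key}"
        if isinstance(value, bool):
            # Assumed store_true-style switch: emit the bare flag only if True.
            if value:
                argv.append(flag)
        elif isinstance(value, list):
            # Assumed nargs="+" style: the flag followed by each element.
            argv.append(flag)
            argv.extend(str(v) for v in value)
        else:
            argv.extend([flag, str(value)])
    return argv

if __name__ == "__main__":
    # e.g. prepend "torchrun -m open_lm.main" (or your trainer's entry point)
    print(" ".join(config_to_argv("sample_7b.yaml")))

Note that gradient accumulation (accum-freq: 4) multiplies the per-GPU batch-size of 16, and the comment in the config gives the token budget: 25B samples per epoch for 40 epochs, i.e. 1T tokens total.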