accum-freq: 4
beta1: 0.9
beta2: 0.95
data-key: "json"
dataset-resampled: True
# delete-previous-checkpoint: False
# Total 25B * 40 = 1T tokens
epochs: 40
fsdp: True
fsdp-limit-all-gathers: True
# grad-checkpointing: False
grad-clip-norm: 1
log-every-n-steps: 20
model: "open_lm_7b"
name: "sample_7b"
precision: "amp_bfloat16"
report-to: "wandb"
seed: 124
train-data-mix-weights: [0.725, 0.275]
train-data: ["TODO"]
train-num-samples: 25_000_000_000
wandb-project-name: "lm1"
workers: 4
logs: /opt/ml/checkpoints/
# Some important parameters, double-checked with Mitchell:
batch-size: 16
ffn-type: swiglu
# fsdp-amp: False
fsdp-pure-bf16: True
fsdp-backward-prefetch: True
lr: 3.e-4
lr-cooldown-end: 3.e-5
model-norm: "gain_only_lp_layer_norm"
qk-norm: True
warmup: 5000
wd: 0.1
z-loss-coefficient: 1.e-4
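
The keys above follow the hyphenated naming of open_lm-style command-line flags, so a config like this is typically flattened into an argv list for the trainer. The following is a minimal sketch of that conversion; the exact flag semantics are assumptions (booleans treated as store_true-style switches, lists expanded as nargs="+" values, and the filename sample_7b.yaml is hypothetical), not the repo's actual launcher.

# config_to_argv.py -- hypothetical helper, not part of the repo
import yaml  # pip install pyyaml

def config_to_argv(path: str) -> list[str]:
    """Flatten a YAML config into a flat list of CLI arguments."""
    with open(path) as f:
        cfg = yaml.safe_load(f)
    argv = []
    for key, value in cfg.items():
        flag = f"--{key}"
        if isinstance(value, bool):
            # Assumed store_true-style switch: emit the bare flag only if True.
            if value:
                argv.append(flag)
        elif isinstance(value, list):
            # Assumed nargs="+" style: the flag followed by each element.
            argv.append(flag)
            argv.extend(str(v) for v in value)
        else:
            argv.extend([flag, str(value)])
    return argv

if __name__ == "__main__":
    # e.g. prepend "torchrun -m open_lm.main" (or your trainer's entry point)
    print(" ".join(config_to_argv("sample_7b.yaml")))

Note that gradient accumulation (accum-freq: 4) multiplies the per-GPU batch-size of 16, and the comment in the config gives the token budget: 25B samples per epoch for 40 epochs, i.e. 1T tokens total.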