---
# Training configuration.
#
# NOTE(review): this file had been collapsed onto a single line, which is not
# parseable YAML (a plain scalar cannot contain ": "). The nesting below was
# reconstructed from the order of the keys and the section keys that carried
# no value of their own; all scalar values are unchanged. Confirm the layout
# against the code that consumes this config.

# Run-level settings.
mode: pt
device: gpu
precision: bf16
eval_only: false
predict_only: false
seed: 2137

tokenizer:
  name: BEE-spoke-data/slimpajama_tok-48128-BPE-forT5

model:
  klass: custom_seq2seq
  name: google/t5-v1_1-base
  overwrite: null
  add_config: null
  checkpoint_path: ''
  random_init: true
  compile: true

data:
  input_length: 1024
  mlm_probability: 0.15
  mean_noise_span_length: 3.0
  num_workers: 8

optim:
  name: adamwscale
  base_lr: 0.02
  batch_size: 64
  total_steps: 65536
  epochs: -1
  warmup_steps: 10000
  lr_scheduler: cosine
  weight_decay: 0.001
  grad_clip: 1.0
  grad_acc: 4
  final_cosine: 1.0e-05

eval:
  every_steps: 100000
  steps: 500

checkpoint:
  every_steps: 5000

logging:
  every_steps: 100
  grad_l2: true
  weights_l2: true
  use_wandb: true
  # NOTE(review): presumably these are the Weights & Biases run settings;
  # verify key names against the logging code.
  wandb_config:
    project: nano-custom-seq2seq
    entity: amazingvince
    tags:
      - nanoT5
      - my_tag
    # Kept under wandb_config: a second top-level `mode` would duplicate
    # the `mode: pt` key at the top of the file.
    mode: online