mode: pt
device: gpu
precision: bf16
eval_only: false
predict_only: false
seed: 93789
tokenizer:
  name: BEE-spoke-data/hf_slimpajama-6B-28672-BPE-forT5
working_dir: null
model:
  liger: true
  klass: local_t5
  name: pszemraj/tFINE-850m-24x24-1024ctx
  overwrite:
    dropout_rate: 0.0
    num_decoder_layers: 16
    num_key_value_heads: 4
    num_layers: 16
    use_gqa: true
  add_config:
    is_bf16: false
  checkpoint_path: ''
  random_init: true
  compile: true
data:
  multi_task: true
  NTP: 0.3
  input_length: 512
  max_seq_len: 512
  mlm_probability: 0.15
  mean_noise_span_length: 3.0
  num_workers: 0
optim:
  name: adamwscale
  base_lr: 0.001
  batch_size: 128
  total_steps: 65536
  epochs: -1
  warmup_steps: 5000
  lr_scheduler: cosine
  weight_decay: 0.01
  grad_clip: 1.0
  grad_acc: 16
  final_cosine: 2.0e-05
eval:
  every_steps: 500
  steps: 0
checkpoint:
  every_steps: 1500
logging:
  every_steps: 25
  grad_l2: true
  weights_l2: true
  use_wandb: true
  wandb_config:
    project: nanoT5
    entity: amazingvince
    tags:
      - gqa
      - large
      - e32-d16
      - 512 ctx
    mode: online