checkpointing:
  checkpoints_dir: checkpoints
  evaluation:
    eval_results_dir: eval_results
  fabric_checkpoint_dir: fabric_state
  fabric_checkpoint_filename: checkpoint.pt
  hf_checkpoint:
    collection_slug: null
    repo_id: pico-lm/pico-decoder-small
  learning_dynamics:
    batch_size: 128
    eval_data: pico-lm/pretokenized-paloma-tinsy
    layer_suffixes:
    - attention.v_proj
    - attention.o_proj
    - swiglu.w_2
    sequence_idx: -1
  learning_dynamics_dir: learning_dynamics
  logs_dir: logs
  run_name: pico-decoder-small-1
  runs_dir: runs
  save_every_n_steps: 1000
  save_to_hf: true
  training:
    auto_resume: true
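# Pretraining corpus, dataloader batch size, and tokenizer.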
data:
  dataloader:
    batch_size: 1024
  dataset:
    name: pico-lm/pretokenized-dolma
  tokenizer:
    name: allenai/OLMo-7B-0724-hf
    vocab_size: 50304
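# Periodic evaluation on the Paloma benchmark during training.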
evaluation:
  metrics:
  - paloma
  paloma:
    batch_size: 16
    dataset_name: pico-lm/pretokenized-paloma-tinsy
    dataset_split: val
    max_length: 2048
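# Decoder architecture hyperparameters.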
model:
  activation_hidden_dim: 1536
  attention_n_heads: 12
  attention_n_kv_heads: 4
  batch_size: 1024
  d_model: 384
  max_seq_len: 2048
  model_type: pico_decoder
  n_layers: 12
  norm_eps: 1.0e-06
  position_emb_theta: 10000.0
  vocab_size: 50304
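# Console logging and Weights & Biases tracking.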
monitoring:
  logging:
    log_every_n_steps: 100
    log_level: INFO
  save_to_wandb: true
  wandb:
    entity: pico-lm
    project: pico-decoder
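# Distributed launch settings (Lightning Fabric) and optimization hyperparameters.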
training:
  fabric:
    accelerator: cuda
    num_devices: 4
    num_nodes: 4
    precision: bf16-mixed
  max_steps: 200000
  optimization:
    gradient_accumulation_steps: 8
    lr: 0.0003
    lr_scheduler: linear_with_warmup
    lr_warmup_steps: 2500
    optimizer: adamw
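Because the `model` block fully specifies the architecture, the parameter budget can be sanity-checked directly from the config. The sketch below is not part of the pico-lm tooling: it assumes a LLaMA-style decoder block (grouped-query attention, a three-matrix SwiGLU MLP, RMSNorm) with untied embedding and LM-head weights, and the filename `pico-decoder-small.yaml` is hypothetical.

```python
import yaml

# Load the config shown above (hypothetical local filename).
with open("pico-decoder-small.yaml") as f:
    cfg = yaml.safe_load(f)

m = cfg["model"]
d, n_layers, vocab = m["d_model"], m["n_layers"], m["vocab_size"]
n_heads, n_kv_heads = m["attention_n_heads"], m["attention_n_kv_heads"]
d_ff = m["activation_hidden_dim"]

head_dim = d // n_heads
kv_dim = n_kv_heads * head_dim  # grouped-query attention: fewer K/V heads than Q heads

attn = d * d + 2 * d * kv_dim + d * d          # q_proj, k_proj, v_proj, o_proj
swiglu = 3 * d * d_ff                          # three SwiGLU projection matrices
norms = 2 * d                                  # two RMSNorm gains per block
per_layer = attn + swiglu + norms

embeddings = 2 * vocab * d                     # token embedding + untied LM head
total = n_layers * per_layer + embeddings + d  # plus the final norm

print(f"non-embedding parameters: {n_layers * per_layer / 1e6:.1f}M")
print(f"approximate total:        {total / 1e6:.1f}M")
```

Under these assumptions the estimate comes out to roughly 26M non-embedding parameters and about 65M in total.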