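# Training run configuration for the pico-decoder-tiny model
# (run: pico-decoder-tiny-dolma5M-v1).

# Checkpointing: output directories, save cadence, and Hugging Face Hub upload settings.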
checkpointing:
  checkpoints_dir: checkpoints
  evaluation:
    eval_results_dir: eval_results
  fabric_checkpoint_dir: fabric_state
  fabric_checkpoint_filename: checkpoint.pt
  hf_checkpoint:
    collection_slug: null
    repo_id: ThomasTheMaker/pico-decoder-tiny
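  # Learning dynamics: capture model states from the layer suffixes listed below
  # at each checkpoint (assumed behaviour of pico's learning-dynamics tooling).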
  learning_dynamics:
    batch_size: 1
    eval_data: null
    layer_suffixes:
    - attention.v_proj
    - attention.o_proj
    - swiglu.w_2
    sequence_idx: -1
  learning_dynamics_dir: learning_dynamics
  logs_dir: logs
  run_name: pico-decoder-tiny-dolma5M-v1
  runs_dir: runs
  save_every_n_steps: 500
  save_to_hf: true
  training:
    auto_resume: true
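# Data: pretokenized training dataset, dataloader batch size, and tokenizer.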
data:
  dataloader:
    batch_size: 4
  dataset:
    name: ThomasTheMaker/pretokenized-dolma-5M
  tokenizer:
    name: allenai/OLMo-7B-0724-hf
    vocab_size: 50304
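# Evaluation: metrics computed during training (Paloma perplexity evaluation).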
evaluation:
  metrics:
  - paloma
  paloma:
    batch_size: 1
    dataset_name: pico-lm/pretokenized-paloma-tinsy
    dataset_split: val
    max_length: 2048
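# Model: pico_decoder architecture hyperparameters.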
model:
  activation_hidden_dim: 384
  attention_n_heads: 12
  attention_n_kv_heads: 4
  batch_size: 1024
  d_model: 96
  max_seq_len: 2048
  model_type: pico_decoder
  n_layers: 12
  norm_eps: 1.0e-06
  position_emb_theta: 10000.0
  vocab_size: 50304
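# Monitoring: logging cadence and Weights & Biases settings (wandb upload disabled here).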
monitoring:
  logging:
    log_every_n_steps: 25
    log_level: INFO
  save_to_wandb: false
  wandb:
    entity: boymyc
    project: pico-decoder-tiny
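# Training: Fabric runtime (device/precision), total step budget, and optimizer / LR schedule.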
training:
  fabric:
    accelerator: cuda
    num_devices: 1
    num_nodes: 1
    precision: bf16-mixed
  max_steps: 20000
  optimization:
    gradient_accumulation_steps: 4
    lr: 5.0e-05
    lr_scheduler: cosine
    lr_warmup_steps: 8000
    optimizer: adamw