```yaml
model_name: tangled-alpha-0.9-core
model_config:
  name: tangled-alpha-0.9-core
  hf_config: {}
  block_size: 131072
  n_layer: 32
  n_embd: 512
  vocab_size: 131072
  padding_multiple: 512
  padded_vocab_size: 131072
  norm_class_name: RMSNorm
  norm_eps: 1.0e-05
  norm_qk: false
  post_attention_norm: false
  post_mlp_norm: false
  parallel_residual: false
  shared_attention_norm: false
  n_head: 8
  head_size: 128
  n_query_groups: 8
  attn_bias: false
  rope_base: 84000
  rotary_percentage: 1.0
  rope_condense_ratio: 1
  intermediate_size: 2048
  bias: false
  mlp_class_name: LLaMAMLP
  gelu_approximate: none
  n_expert: 0
  n_expert_per_token: 0
  scale_embeddings: false
  lm_head_bias: false
```
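As a quick sanity check, these hyperparameters imply roughly a 235M-parameter model. A minimal back-of-the-envelope sketch, assuming litgpt's LLaMA-style weight layout (fused QKV with grouped-query heads, gated LLaMAMLP, two RMSNorm weights per block, and a tied embedding/LM head per `tie_embeddings: true` below); not an official figure:

```python
# Parameter count implied by model_config above (sketch, not authoritative).
n_layer, n_embd, vocab_size = 32, 512, 131072
n_head = n_query_groups = 8
head_size = 128
intermediate_size = 2048

embedding = vocab_size * n_embd                                 # shared with lm_head
attn_qkv  = n_embd * (n_head + 2 * n_query_groups) * head_size  # fused Q/K/V projection
attn_proj = n_head * head_size * n_embd                         # attention output projection
mlp       = 3 * n_embd * intermediate_size                      # LLaMAMLP: fc_1, fc_2, proj (no bias)
norms     = 2 * n_embd                                          # attention + MLP RMSNorm weights

per_block = attn_qkv + attn_proj + mlp + norms
total = embedding + n_layer * per_block + n_embd                # + final RMSNorm
print(f"~{total / 1e6:.1f}M parameters")                        # ~234.9M
```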
```yaml
out_dir: ../out/pretrain-core-3
precision: bf16-true
initial_checkpoint_dir: ../out/pretrain-core-2/checkpoint
data:
  class_path: litgpt.data.LitData
  init_args:
    data_path: ../core-data-3-4097-8193-8193-2000/
    seed: 42
    num_workers: 32
```
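The `data` section maps one-to-one onto litgpt's `LitData` datamodule. A sketch of the equivalent programmatic construction, assuming litgpt is installed with its streaming-data dependencies:

```python
from litgpt.data import LitData

# Same values as the init_args above; data_path points at the
# pre-tokenized streaming dataset for this run.
data = LitData(
    data_path="../core-data-3-4097-8193-8193-2000/",
    seed=42,
    num_workers=32,
)
```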
```yaml
train:
  save_interval: 10
  log_interval: 1
  global_batch_size: 512
  micro_batch_size: 1
  lr_warmup_steps: 0
  max_tokens: 464649609
  max_seq_length: 8192
  tie_embeddings: true
  max_norm: 1.0
  min_lr: 5.0e-05
```
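The `train` section fixes the effective batch geometry. A short sketch of the implied numbers, assuming each sample is packed to the full `max_seq_length` and that gradient accumulation is derived from the global vs. per-device micro batch size:

```python
global_batch_size = 512
micro_batch_size = 1
max_seq_length = 8192
max_tokens = 464_649_609
num_devices = 1  # placeholder; `devices: auto` resolves this at runtime

# Micro-batches accumulated before each optimizer step.
grad_accum_iters = global_batch_size // (micro_batch_size * num_devices)  # 512

# Tokens consumed per optimizer step, and total optimizer steps in the run.
tokens_per_step = global_batch_size * max_seq_length  # 4,194,304
total_steps = max_tokens // tokens_per_step           # ~110
print(grad_accum_iters, tokens_per_step, total_steps)
```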
```yaml
eval:
  interval: 10
  max_iters: 100
  initial_validation: true
  final_validation: true
  evaluate_example: first
optimizer:
  class_path: sophia_opt.SophiaG
  init_args:
    lr: 0.0001
    betas:
      - 0.9
      - 0.95
    rho: 0.05
    weight_decay: 0.1
```
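This run uses Sophia rather than AdamW. A standalone instantiation sketch, assuming the `sophia_opt` module exposes the reference `SophiaG` implementation (Liu et al.) with these keyword arguments:

```python
import torch
from sophia_opt import SophiaG  # assumed to mirror the reference Sophia implementation

model = torch.nn.Linear(512, 512)  # stand-in module for illustration
optimizer = SophiaG(
    model.parameters(),
    lr=1e-4,
    betas=(0.9, 0.95),
    rho=0.05,
    weight_decay=0.1,
)
```

Note that Sophia also expects a periodic Hessian-estimate refresh in the training loop (`update_hessian()` in the reference implementation), which the training script must supply.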
```yaml
devices: auto
num_nodes: 1
tokenizer_dir: ../tokenizer
logger_name: wandb
seed: 23
```
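A config in this form is normally consumed by litgpt's pretraining entry point, e.g. `litgpt pretrain --config <this-file>.yaml`, assuming the custom `tangled-alpha-0.9-core` model definition and the `sophia_opt` optimizer are importable in the environment.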