Delta-Vector committed on
Commit
2d38ae8
·
verified ·
1 Parent(s): a1e45db

Upload folder using huggingface_hub

Browse files
accelerate.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ distributed_type: MULTI_GPU
3
+ mixed_precision: bf16
4
+ num_processes: 8
5
+ num_machines: 1
6
+ machine_rank: 0
7
+ gpu_ids: all
8
+ rdzv_backend: static
9
+ same_network: true
10
+ tpu_use_cluster: false
11
+ tpu_use_sudo: false
12
+ use_cpu: false
13
+ debug: false
14
+ enable_cpu_affinity: false
15
+ main_training_function: main
16
+ downcast_bf16: 'no'
base.toml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Base distillation config (smoketest variant).
2
+ # Every value the script reads must live in this file - no defaults in code.
3
+
4
+ [model]
5
+ teacher = "Qwen/Qwen3.5-35B-A3B"
6
+ student = "Troiaaa/m-6a3lnzvb"
7
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 640
14
+ kl_start_pos = 128
15
+ seed = 42
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 42
20
+ lr = 5.0e-7
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.95]
26
+ eps = 1.0e-8
27
+ samples_per_step = 4
28
+ micro_batch_size = 4
29
+ max_steps = 5
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ mixed_precision = "bf16"
35
+ kl_chunk_size = 0
36
+ new_layer_lr_mul = 1.0
37
+
38
+ [eval]
39
+ every_steps = 5
40
+ samples = 16
41
+ seed = 1234
42
+
43
+ [log]
44
+ wandb = true
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "smoketest"
47
+ log_every = 1
48
+ output_dir = "./out/smoketest"
49
+
50
+ [init]
51
+ zero_layers = []
52
+ target_num_layers = 32
grow40_simple.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Grow student to 40 layers with the current (bf16, seq=640) hparams.
2
+ # Tests the architectural change in isolation without the winning hparams,
3
+ # so we can attribute any improvement.
4
+
5
+ [model]
6
+ teacher = "Qwen/Qwen3.5-35B-A3B"
7
+ student = "Troiaaa/m-6a3lnzvb"
8
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
9
+
10
+ [data]
11
+ dataset = "karpathy/climbmix-400b-shuffle"
12
+ text_field = "text"
13
+ min_chars = 2560
14
+ max_seq_len = 640
15
+ kl_start_pos = 128
16
+ seed = 42
17
+ shuffle_buffer = 10000
18
+
19
+ [train]
20
+ seed = 42
21
+ lr = 5.0e-7
22
+ schedule = "constant"
23
+ warmup_steps = 0
24
+ weight_decay = 0.0
25
+ grad_clip = 1.0
26
+ betas = [0.9, 0.95]
27
+ eps = 1.0e-8
28
+ samples_per_step = 8
29
+ micro_batch_size = 8
30
+ max_steps = 2000
31
+ grad_checkpointing = true
32
+ attn_implementation = "flash_attention_2"
33
+ student_dtype = "bfloat16"
34
+ teacher_dtype = "bfloat16"
35
+ mixed_precision = "bf16"
36
+ kl_chunk_size = 0
37
+ new_layer_lr_mul = 1.0
38
+
39
+ [eval]
40
+ every_steps = 50
41
+ samples = 64
42
+ seed = 1234
43
+
44
+ [log]
45
+ wandb = true
46
+ wandb_project = "distil-subnet97"
47
+ wandb_run = "grow40_simple"
48
+ log_every = 1
49
+ output_dir = "./out/grow40_simple"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 40
grow40_winning.toml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Grow student to 40 layers AND apply the winning hparams from zero4_long.
2
+ # Note: student is bf16 (not fp32 as in the original winning run) because the
3
+ # fp32 master weights + 40 layers + Adam state + bf16 teacher OOMs on B200
4
+ # without sharding. Everything else matches the winning recipe.
5
+
6
+ [model]
7
+ teacher = "Qwen/Qwen3.5-35B-A3B"
8
+ student = "Troiaaa/m-6a3lnzvb"
9
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
10
+
11
+ [data]
12
+ dataset = "karpathy/climbmix-400b-shuffle"
13
+ text_field = "text"
14
+ min_chars = 2560
15
+ max_seq_len = 2048
16
+ kl_start_pos = 128
17
+ seed = 6767
18
+ shuffle_buffer = 10000
19
+
20
+ [train]
21
+ seed = 6767
22
+ lr = 5.0e-7
23
+ schedule = "cosine"
24
+ warmup_steps = 100
25
+ weight_decay = 0.0
26
+ grad_clip = 1.0
27
+ betas = [0.9, 0.999]
28
+ eps = 1.0e-3
29
+ samples_per_step = 4
30
+ micro_batch_size = 4
31
+ max_steps = 2000
32
+ grad_checkpointing = true
33
+ attn_implementation = "flash_attention_2"
34
+ student_dtype = "bfloat16"
35
+ teacher_dtype = "bfloat16"
36
+ mixed_precision = "bf16"
37
+ kl_chunk_size = 256
38
+ new_layer_lr_mul = 1.0
39
+
40
+ [eval]
41
+ every_steps = 50
42
+ samples = 500
43
+ seed = 4242
44
+
45
+ [log]
46
+ wandb = true
47
+ wandb_project = "distil-subnet97"
48
+ wandb_run = "grow40_winning"
49
+ log_every = 1
50
+ output_dir = "./out/grow40_winning"
51
+
52
+ [init]
53
+ zero_layers = []
54
+ target_num_layers = 40
grow40_winning_v2.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # grow40_winning v2: same hparams, but the scheduler bug is fixed in distill.py
2
+ # (we no longer prepare the scheduler with accelerate, so cosine reaches its
3
+ # minimum at step max_steps instead of step max_steps / num_processes).
4
+
5
+ [model]
6
+ teacher = "Qwen/Qwen3.5-35B-A3B"
7
+ student = "Troiaaa/m-6a3lnzvb"
8
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
9
+
10
+ [data]
11
+ dataset = "karpathy/climbmix-400b-shuffle"
12
+ text_field = "text"
13
+ min_chars = 2560
14
+ max_seq_len = 2048
15
+ kl_start_pos = 128
16
+ seed = 6767
17
+ shuffle_buffer = 10000
18
+
19
+ [train]
20
+ seed = 6767
21
+ lr = 5.0e-7
22
+ schedule = "cosine"
23
+ warmup_steps = 100
24
+ weight_decay = 0.0
25
+ grad_clip = 1.0
26
+ betas = [0.9, 0.999]
27
+ eps = 1.0e-3
28
+ samples_per_step = 4
29
+ micro_batch_size = 4
30
+ max_steps = 2000
31
+ grad_checkpointing = true
32
+ attn_implementation = "flash_attention_2"
33
+ student_dtype = "bfloat16"
34
+ teacher_dtype = "bfloat16"
35
+ mixed_precision = "bf16"
36
+ kl_chunk_size = 256
37
+ new_layer_lr_mul = 1.0
38
+
39
+ [eval]
40
+ every_steps = 50
41
+ samples = 500
42
+ seed = 4242
43
+
44
+ [log]
45
+ wandb = true
46
+ wandb_project = "distil-subnet97"
47
+ wandb_run = "grow40_winning_v2"
48
+ log_every = 1
49
+ output_dir = "./out/grow40_winning_v2"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 40
replicate_zero4.toml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Replicates wandb run "zero4_long" (mepqfry1, eval kl 0.275).
2
+ # Same hparams as that run; same 4-layer zero (14-17). 32-layer student.
3
+
4
+ [model]
5
+ teacher = "Qwen/Qwen3.5-35B-A3B"
6
+ student = "Troiaaa/m-6a3lnzvb"
7
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 5.0e-7
21
+ schedule = "cosine"
22
+ warmup_steps = 100
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.999]
26
+ eps = 1.0e-3
27
+ samples_per_step = 4
28
+ micro_batch_size = 1
29
+ max_steps = 2000
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "float32"
33
+ teacher_dtype = "bfloat16"
34
+ mixed_precision = "bf16"
35
+ kl_chunk_size = 256
36
+ new_layer_lr_mul = 1.0
37
+
38
+ [eval]
39
+ every_steps = 50
40
+ samples = 500
41
+ seed = 4242
42
+
43
+ [log]
44
+ wandb = true
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "replicate_zero4"
47
+ log_every = 1
48
+ output_dir = "./out/replicate_zero4"
49
+
50
+ [init]
51
+ zero_layers = [14, 15, 16, 17]
52
+ target_num_layers = 32
sn97_eval_long.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "../models/sn97-text"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 1.0e-7
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.999]
26
+ eps = 1.0e-3
27
+ samples_per_step = 4
28
+ micro_batch_size = 1
29
+ max_steps = 1
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 1
39
+ samples = 16
40
+ seed = 4242
41
+ cache_path = "./cache/sn97_eval_long_16.pt"
42
+
43
+ [log]
44
+ wandb = false
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "sn97_eval_long"
47
+ log_every = 1
48
+ output_dir = "./out/sn97_eval_long"
49
+ experiment_log = "./out/experiments.jsonl"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 32
sn97_eval_long_alt.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "../models/sn97-text"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 1.0e-7
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.999]
26
+ eps = 1.0e-3
27
+ samples_per_step = 4
28
+ micro_batch_size = 1
29
+ max_steps = 1
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 1
39
+ samples = 32
40
+ seed = 9917
41
+ cache_path = "./cache/sn97_eval_long_alt_32.pt"
42
+
43
+ [log]
44
+ wandb = false
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "sn97_eval_long_alt"
47
+ log_every = 1
48
+ output_dir = "./out/sn97_eval_long_alt"
49
+ experiment_log = "./out/experiments.jsonl"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 32
sn97_mm_refine.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "../models/sn97-xxxn"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 768
13
+ max_seq_len = 768
14
+ kl_start_pos = 64
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 5.0e-7
21
+ schedule = "cosine"
22
+ warmup_steps = 20
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.98]
26
+ eps = 1.0e-6
27
+ samples_per_step = 2
28
+ micro_batch_size = 1
29
+ max_steps = 20
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 128
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 5
39
+ samples = 32
40
+ seed = 4242
41
+ cache_path = "./cache/sn97_eval_32.pt"
42
+
43
+ [log]
44
+ wandb = false
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "sn97_mm_refine"
47
+ log_every = 1
48
+ output_dir = "./out/sn97_mm_refine"
49
+ experiment_log = "./out/experiments.jsonl"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 32
sn97_mm_top8.toml ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "../models/sn97-xxxn"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 768
13
+ max_seq_len = 768
14
+ kl_start_pos = 64
15
+ seed = 7777
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 7777
20
+ lr = 3.0e-7
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.99]
26
+ eps = 1.0e-3
27
+ samples_per_step = 2
28
+ micro_batch_size = 1
29
+ max_steps = 20
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 128
35
+ new_layer_lr_mul = 1.0
36
+ trainable_patterns = [
37
+ "^language_model\\.model\\.layers\\.(24|25|26|27|28|29|30|31)\\.",
38
+ "^language_model\\.lm_head",
39
+ "^language_model\\.model\\.norm"
40
+ ]
41
+
42
+ [eval]
43
+ every_steps = 5
44
+ samples = 32
45
+ seed = 4242
46
+ cache_path = "./cache/sn97_eval_32.pt"
47
+
48
+ [log]
49
+ wandb = false
50
+ wandb_project = "distil-subnet97"
51
+ wandb_run = "sn97_mm_top8"
52
+ log_every = 1
53
+ output_dir = "./out/sn97_mm_top8"
54
+ experiment_log = "./out/experiments.jsonl"
55
+
56
+ [init]
57
+ zero_layers = []
58
+ target_num_layers = 32
sn97_smoke.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "../models/sn97-xxxn"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 512
13
+ max_seq_len = 512
14
+ kl_start_pos = 64
15
+ seed = 1234
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 1234
20
+ lr = 1.0e-7
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.95]
26
+ eps = 1.0e-8
27
+ samples_per_step = 1
28
+ micro_batch_size = 1
29
+ max_steps = 2
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 128
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 1
39
+ samples = 4
40
+ seed = 4242
41
+ cache_path = "./cache/sn97_smoke_eval.pt"
42
+
43
+ [log]
44
+ wandb = false
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "sn97_smoke"
47
+ log_every = 1
48
+ output_dir = "./out/sn97_smoke"
49
+ experiment_log = "./out/experiments.jsonl"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 32
sn97_text32_long_phase1.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "../models/sn97-text"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 1.0e-7
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.999]
26
+ eps = 1.0e-3
27
+ samples_per_step = 4
28
+ micro_batch_size = 1
29
+ max_steps = 30
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 10
39
+ samples = 16
40
+ seed = 4242
41
+ cache_path = "./cache/sn97_eval_long_16.pt"
42
+
43
+ [log]
44
+ wandb = false
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "sn97_text32_long_phase1"
47
+ log_every = 1
48
+ output_dir = "./out/sn97_text32_long_phase1"
49
+ experiment_log = "./out/experiments.jsonl"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 32
sn97_text32_long_phase2.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "../models/sn97-text"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 2.0e-8
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.99]
26
+ eps = 1.0e-3
27
+ samples_per_step = 16
28
+ micro_batch_size = 1
29
+ max_steps = 40
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 10
39
+ samples = 16
40
+ seed = 4242
41
+ cache_path = "./cache/sn97_eval_long_16.pt"
42
+
43
+ [log]
44
+ wandb = false
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "sn97_text32_long_phase2"
47
+ log_every = 1
48
+ output_dir = "./out/sn97_text32_long_phase2"
49
+ experiment_log = "./out/experiments.jsonl"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 32
sn97_text32_long_phase2b.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "./out/sn97_text32_long_phase2/best"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 2.0e-8
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.99]
26
+ eps = 1.0e-3
27
+ samples_per_step = 16
28
+ micro_batch_size = 1
29
+ max_steps = 40
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 10
39
+ samples = 16
40
+ seed = 4242
41
+ cache_path = "./cache/sn97_eval_long_16.pt"
42
+
43
+ [log]
44
+ wandb = false
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "sn97_text32_long_phase2b"
47
+ log_every = 1
48
+ output_dir = "./out/sn97_text32_long_phase2b"
49
+ experiment_log = "./out/experiments.jsonl"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 32
sn97_text32_long_phase3.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "./out/sn97_text32_long_phase2/best"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 5.0e-9
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.99]
26
+ eps = 1.0e-2
27
+ samples_per_step = 16
28
+ micro_batch_size = 1
29
+ max_steps = 40
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 10
39
+ samples = 16
40
+ seed = 4242
41
+ cache_path = "./cache/sn97_eval_long_16.pt"
42
+
43
+ [log]
44
+ wandb = false
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "sn97_text32_long_phase3"
47
+ log_every = 1
48
+ output_dir = "./out/sn97_text32_long_phase3"
49
+ experiment_log = "./out/experiments.jsonl"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 32
sn97_text32_long_phase4.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "./out/sn97_text32_long_phase3/best"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 5.0e-9
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.99]
26
+ eps = 1.0e-2
27
+ samples_per_step = 16
28
+ micro_batch_size = 1
29
+ max_steps = 20
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 10
39
+ samples = 16
40
+ seed = 4242
41
+ cache_path = "./cache/sn97_eval_long_16.pt"
42
+
43
+ [log]
44
+ wandb = false
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "sn97_text32_long_phase4"
47
+ log_every = 1
48
+ output_dir = "./out/sn97_text32_long_phase4"
49
+ experiment_log = "./out/experiments.jsonl"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 32
sn97_text32_long_phase5.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "./out/sn97_text32_long_phase3/best"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 2.0e-9
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.99]
26
+ eps = 1.0e-2
27
+ samples_per_step = 16
28
+ micro_batch_size = 1
29
+ max_steps = 20
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 10
39
+ samples = 16
40
+ seed = 4242
41
+ cache_path = "./cache/sn97_eval_long_16.pt"
42
+
43
+ [log]
44
+ wandb = false
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "sn97_text32_long_phase5"
47
+ log_every = 1
48
+ output_dir = "./out/sn97_text32_long_phase5"
49
+ experiment_log = "./out/experiments.jsonl"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 32
sn97_text35_full.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "./out/sn97_text35_warm/best"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 768
13
+ max_seq_len = 1024
14
+ kl_start_pos = 32
15
+ seed = 6868
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6868
20
+ lr = 2.0e-8
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.99]
26
+ eps = 1.0e-3
27
+ samples_per_step = 4
28
+ micro_batch_size = 1
29
+ max_steps = 80
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 128
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 10
39
+ samples = 32
40
+ seed = 4242
41
+ cache_path = "./cache/sn97_eval_32.pt"
42
+
43
+ [log]
44
+ wandb = false
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "sn97_text35_full"
47
+ log_every = 1
48
+ output_dir = "./out/sn97_text35_full"
49
+ experiment_log = "./out/experiments.jsonl"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 35
sn97_text35_warm.toml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "../models/sn97-text"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 768
13
+ max_seq_len = 768
14
+ kl_start_pos = 64
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 1.0e-7
21
+ schedule = "cosine"
22
+ warmup_steps = 20
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.999]
26
+ eps = 1.0e-3
27
+ samples_per_step = 2
28
+ micro_batch_size = 1
29
+ max_steps = 20
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 128
35
+ new_layer_lr_mul = 5.0
36
+ trainable_patterns = ["^model\\.layers\\.(32|33|34)\\.", "^lm_head", "^model\\.norm"]
37
+
38
+ [eval]
39
+ every_steps = 5
40
+ samples = 32
41
+ seed = 4242
42
+ cache_path = "./cache/sn97_eval_32.pt"
43
+
44
+ [log]
45
+ wandb = false
46
+ wandb_project = "distil-subnet97"
47
+ wandb_run = "sn97_text35_warm"
48
+ log_every = 1
49
+ output_dir = "./out/sn97_text35_warm"
50
+ experiment_log = "./out/experiments.jsonl"
51
+
52
+ [init]
53
+ zero_layers = []
54
+ target_num_layers = 35
sweep/A_resume_lr1e7_cos.toml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Resume from grow40_winning best (eval kl 0.2219). Lower peak LR to avoid the
2
+ # overshoot we saw at 5e-7. Cosine warmup 100, 1500 steps.
3
+
4
+ [model]
5
+ teacher = "Qwen/Qwen3.5-35B-A3B"
6
+ student = "./out/grow40_winning/best"
7
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 1.0e-7
21
+ schedule = "cosine"
22
+ warmup_steps = 100
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.999]
26
+ eps = 1.0e-3
27
+ samples_per_step = 4
28
+ micro_batch_size = 4
29
+ max_steps = 1500
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ mixed_precision = "bf16"
35
+ kl_chunk_size = 256
36
+ new_layer_lr_mul = 1.0
37
+
38
+ [eval]
39
+ every_steps = 50
40
+ samples = 500
41
+ seed = 4242
42
+
43
+ [log]
44
+ wandb = true
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "A_resume_lr1e7_cos"
47
+ log_every = 1
48
+ output_dir = "./out/sweep/A_resume_lr1e7_cos"
49
+
50
+ [init]
51
+ zero_layers = []
52
+ target_num_layers = 40
sweep/B_resume_lr5e8_cos.toml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Resume from grow40_winning best (eval kl 0.2219). Even lower peak LR.
2
+
3
+ [model]
4
+ teacher = "Qwen/Qwen3.5-35B-A3B"
5
+ student = "./out/grow40_winning/best"
6
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
7
+
8
+ [data]
9
+ dataset = "karpathy/climbmix-400b-shuffle"
10
+ text_field = "text"
11
+ min_chars = 2560
12
+ max_seq_len = 2048
13
+ kl_start_pos = 128
14
+ seed = 6767
15
+ shuffle_buffer = 10000
16
+
17
+ [train]
18
+ seed = 6767
19
+ lr = 5.0e-8
20
+ schedule = "cosine"
21
+ warmup_steps = 100
22
+ weight_decay = 0.0
23
+ grad_clip = 1.0
24
+ betas = [0.9, 0.999]
25
+ eps = 1.0e-3
26
+ samples_per_step = 4
27
+ micro_batch_size = 4
28
+ max_steps = 1500
29
+ grad_checkpointing = true
30
+ attn_implementation = "flash_attention_2"
31
+ student_dtype = "bfloat16"
32
+ teacher_dtype = "bfloat16"
33
+ mixed_precision = "bf16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 50
39
+ samples = 500
40
+ seed = 4242
41
+
42
+ [log]
43
+ wandb = true
44
+ wandb_project = "distil-subnet97"
45
+ wandb_run = "B_resume_lr5e8_cos"
46
+ log_every = 1
47
+ output_dir = "./out/sweep/B_resume_lr5e8_cos"
48
+
49
+ [init]
50
+ zero_layers = []
51
+ target_num_layers = 40
sweep/C_resume_lr2e8_cos.toml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Resume from grow40_winning best. Very small LR - basically a fine-tune.
2
+
3
+ [model]
4
+ teacher = "Qwen/Qwen3.5-35B-A3B"
5
+ student = "./out/grow40_winning/best"
6
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
7
+
8
+ [data]
9
+ dataset = "karpathy/climbmix-400b-shuffle"
10
+ text_field = "text"
11
+ min_chars = 2560
12
+ max_seq_len = 2048
13
+ kl_start_pos = 128
14
+ seed = 6767
15
+ shuffle_buffer = 10000
16
+
17
+ [train]
18
+ seed = 6767
19
+ lr = 2.0e-8
20
+ schedule = "cosine"
21
+ warmup_steps = 100
22
+ weight_decay = 0.0
23
+ grad_clip = 1.0
24
+ betas = [0.9, 0.999]
25
+ eps = 1.0e-3
26
+ samples_per_step = 4
27
+ micro_batch_size = 4
28
+ max_steps = 1500
29
+ grad_checkpointing = true
30
+ attn_implementation = "flash_attention_2"
31
+ student_dtype = "bfloat16"
32
+ teacher_dtype = "bfloat16"
33
+ mixed_precision = "bf16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 50
39
+ samples = 500
40
+ seed = 4242
41
+
42
+ [log]
43
+ wandb = true
44
+ wandb_project = "distil-subnet97"
45
+ wandb_run = "C_resume_lr2e8_cos"
46
+ log_every = 1
47
+ output_dir = "./out/sweep/C_resume_lr2e8_cos"
48
+
49
+ [init]
50
+ zero_layers = []
51
+ target_num_layers = 40
sweep/D_resume_lr1e7_const.toml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Resume from grow40_winning best. Constant LR (no schedule overshoot at all).
2
+
3
+ [model]
4
+ teacher = "Qwen/Qwen3.5-35B-A3B"
5
+ student = "./out/grow40_winning/best"
6
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
7
+
8
+ [data]
9
+ dataset = "karpathy/climbmix-400b-shuffle"
10
+ text_field = "text"
11
+ min_chars = 2560
12
+ max_seq_len = 2048
13
+ kl_start_pos = 128
14
+ seed = 6767
15
+ shuffle_buffer = 10000
16
+
17
+ [train]
18
+ seed = 6767
19
+ lr = 1.0e-7
20
+ schedule = "constant"
21
+ warmup_steps = 0
22
+ weight_decay = 0.0
23
+ grad_clip = 1.0
24
+ betas = [0.9, 0.999]
25
+ eps = 1.0e-3
26
+ samples_per_step = 4
27
+ micro_batch_size = 4
28
+ max_steps = 1500
29
+ grad_checkpointing = true
30
+ attn_implementation = "flash_attention_2"
31
+ student_dtype = "bfloat16"
32
+ teacher_dtype = "bfloat16"
33
+ mixed_precision = "bf16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 50
39
+ samples = 500
40
+ seed = 4242
41
+
42
+ [log]
43
+ wandb = true
44
+ wandb_project = "distil-subnet97"
45
+ wandb_run = "D_resume_lr1e7_const"
46
+ log_every = 1
47
+ output_dir = "./out/sweep/D_resume_lr1e7_const"
48
+
49
+ [init]
50
+ zero_layers = []
51
+ target_num_layers = 40
sweep/E_resume_lr5e8_b95.toml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Resume from grow40_winning best. Smaller second-moment memory (beta2=0.95)
2
+ # so Adam stabilizes faster. Same low LR.
3
+
4
+ [model]
5
+ teacher = "Qwen/Qwen3.5-35B-A3B"
6
+ student = "./out/grow40_winning/best"
7
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 5.0e-8
21
+ schedule = "cosine"
22
+ warmup_steps = 100
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.95]
26
+ eps = 1.0e-8
27
+ samples_per_step = 4
28
+ micro_batch_size = 4
29
+ max_steps = 1500
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ mixed_precision = "bf16"
35
+ kl_chunk_size = 256
36
+ new_layer_lr_mul = 1.0
37
+
38
+ [eval]
39
+ every_steps = 50
40
+ samples = 500
41
+ seed = 4242
42
+
43
+ [log]
44
+ wandb = true
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "E_resume_lr5e8_b95"
47
+ log_every = 1
48
+ output_dir = "./out/sweep/E_resume_lr5e8_b95"
49
+
50
+ [init]
51
+ zero_layers = []
52
+ target_num_layers = 40
sweep/F_cold_lr1e7_grow40.toml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Cold start, 40 layers, lower peak LR than the original winning recipe.
2
+
3
+ [model]
4
+ teacher = "Qwen/Qwen3.5-35B-A3B"
5
+ student = "Troiaaa/m-6a3lnzvb"
6
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
7
+
8
+ [data]
9
+ dataset = "karpathy/climbmix-400b-shuffle"
10
+ text_field = "text"
11
+ min_chars = 2560
12
+ max_seq_len = 2048
13
+ kl_start_pos = 128
14
+ seed = 6767
15
+ shuffle_buffer = 10000
16
+
17
+ [train]
18
+ seed = 6767
19
+ lr = 1.0e-7
20
+ schedule = "cosine"
21
+ warmup_steps = 100
22
+ weight_decay = 0.0
23
+ grad_clip = 1.0
24
+ betas = [0.9, 0.999]
25
+ eps = 1.0e-3
26
+ samples_per_step = 4
27
+ micro_batch_size = 4
28
+ max_steps = 2000
29
+ grad_checkpointing = true
30
+ attn_implementation = "flash_attention_2"
31
+ student_dtype = "bfloat16"
32
+ teacher_dtype = "bfloat16"
33
+ mixed_precision = "bf16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 50
39
+ samples = 500
40
+ seed = 4242
41
+
42
+ [log]
43
+ wandb = true
44
+ wandb_project = "distil-subnet97"
45
+ wandb_run = "F_cold_lr1e7_grow40"
46
+ log_every = 1
47
+ output_dir = "./out/sweep/F_cold_lr1e7_grow40"
48
+
49
+ [init]
50
+ zero_layers = []
51
+ target_num_layers = 40
sweep/G_cold_lr2e7_grow40.toml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Cold start, 40 layers, lr=2e-7 (between 1e-7 and the failing 5e-7).
2
+
3
+ [model]
4
+ teacher = "Qwen/Qwen3.5-35B-A3B"
5
+ student = "Troiaaa/m-6a3lnzvb"
6
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
7
+
8
+ [data]
9
+ dataset = "karpathy/climbmix-400b-shuffle"
10
+ text_field = "text"
11
+ min_chars = 2560
12
+ max_seq_len = 2048
13
+ kl_start_pos = 128
14
+ seed = 6767
15
+ shuffle_buffer = 10000
16
+
17
+ [train]
18
+ seed = 6767
19
+ lr = 2.0e-7
20
+ schedule = "cosine"
21
+ warmup_steps = 100
22
+ weight_decay = 0.0
23
+ grad_clip = 1.0
24
+ betas = [0.9, 0.999]
25
+ eps = 1.0e-3
26
+ samples_per_step = 4
27
+ micro_batch_size = 4
28
+ max_steps = 2000
29
+ grad_checkpointing = true
30
+ attn_implementation = "flash_attention_2"
31
+ student_dtype = "bfloat16"
32
+ teacher_dtype = "bfloat16"
33
+ mixed_precision = "bf16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 50
39
+ samples = 500
40
+ seed = 4242
41
+
42
+ [log]
43
+ wandb = true
44
+ wandb_project = "distil-subnet97"
45
+ wandb_run = "G_cold_lr2e7_grow40"
46
+ log_every = 1
47
+ output_dir = "./out/sweep/G_cold_lr2e7_grow40"
48
+
49
+ [init]
50
+ zero_layers = []
51
+ target_num_layers = 40
sweep/H_cold_lr1e7_32L.toml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Cold start, 32 layers (NO grow), lower LR. Tests whether the +8 layers were
2
+ # helping at all once we use the right LR.
3
+
4
+ [model]
5
+ teacher = "Qwen/Qwen3.5-35B-A3B"
6
+ student = "Troiaaa/m-6a3lnzvb"
7
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 1.0e-7
21
+ schedule = "cosine"
22
+ warmup_steps = 100
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.999]
26
+ eps = 1.0e-3
27
+ samples_per_step = 4
28
+ micro_batch_size = 4
29
+ max_steps = 2000
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ mixed_precision = "bf16"
35
+ kl_chunk_size = 256
36
+ new_layer_lr_mul = 1.0
37
+
38
+ [eval]
39
+ every_steps = 50
40
+ samples = 500
41
+ seed = 4242
42
+
43
+ [log]
44
+ wandb = true
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "H_cold_lr1e7_32L"
47
+ log_every = 1
48
+ output_dir = "./out/sweep/H_cold_lr1e7_32L"
49
+
50
+ [init]
51
+ zero_layers = []
52
+ target_num_layers = 32
sweep/I_cold_paramgroups_grow40.toml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Cold start, 40 layers, low LR for original layers + 5x for the new ones.
2
+ # Lets the new layers wake up faster without disturbing the trained layers.
3
+
4
+ [model]
5
+ teacher = "Qwen/Qwen3.5-35B-A3B"
6
+ student = "Troiaaa/m-6a3lnzvb"
7
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 1.0e-7
21
+ schedule = "cosine"
22
+ warmup_steps = 100
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.999]
26
+ eps = 1.0e-3
27
+ samples_per_step = 4
28
+ micro_batch_size = 4
29
+ max_steps = 2000
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ mixed_precision = "bf16"
35
+ kl_chunk_size = 256
36
+ new_layer_lr_mul = 5.0
37
+
38
+ [eval]
39
+ every_steps = 50
40
+ samples = 500
41
+ seed = 4242
42
+
43
+ [log]
44
+ wandb = true
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "I_cold_paramgroups_grow40"
47
+ log_every = 1
48
+ output_dir = "./out/sweep/I_cold_paramgroups_grow40"
49
+
50
+ [init]
51
+ zero_layers = []
52
+ target_num_layers = 40
sweep/J_phase2_lr5e9_const.toml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Phase 2: ultra-conservative resume from phase 1 best.
2
+ # Tiny LR, constant, zero warmup, very high beta2 for max smoothing.
3
+
4
+ [model]
5
+ teacher = "Qwen/Qwen3.5-35B-A3B"
6
+ student = "./out/phase1_best"
7
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 5.0e-9
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.99]
26
+ eps = 1.0e-2
27
+ samples_per_step = 4
28
+ micro_batch_size = 4
29
+ max_steps = 3000
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ mixed_precision = "bf16"
35
+ kl_chunk_size = 256
36
+ new_layer_lr_mul = 1.0
37
+
38
+ [eval]
39
+ every_steps = 50
40
+ samples = 500
41
+ seed = 4242
42
+
43
+ [log]
44
+ wandb = true
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "J_phase2_lr5e9_const"
47
+ log_every = 1
48
+ output_dir = "./out/sweep/J_phase2_lr5e9_const"
49
+
50
+ [init]
51
+ zero_layers = []
52
+ target_num_layers = 40
sweep/K_phase2_lr2e8_const.toml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Phase 2: still conservative but a bit more LR than J.
2
+
3
+ [model]
4
+ teacher = "Qwen/Qwen3.5-35B-A3B"
5
+ student = "./out/phase1_best"
6
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
7
+
8
+ [data]
9
+ dataset = "karpathy/climbmix-400b-shuffle"
10
+ text_field = "text"
11
+ min_chars = 2560
12
+ max_seq_len = 2048
13
+ kl_start_pos = 128
14
+ seed = 6767
15
+ shuffle_buffer = 10000
16
+
17
+ [train]
18
+ seed = 6767
19
+ lr = 2.0e-8
20
+ schedule = "constant"
21
+ warmup_steps = 0
22
+ weight_decay = 0.0
23
+ grad_clip = 1.0
24
+ betas = [0.9, 0.99]
25
+ eps = 1.0e-3
26
+ samples_per_step = 4
27
+ micro_batch_size = 4
28
+ max_steps = 3000
29
+ grad_checkpointing = true
30
+ attn_implementation = "flash_attention_2"
31
+ student_dtype = "bfloat16"
32
+ teacher_dtype = "bfloat16"
33
+ mixed_precision = "bf16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 50
39
+ samples = 500
40
+ seed = 4242
41
+
42
+ [log]
43
+ wandb = true
44
+ wandb_project = "distil-subnet97"
45
+ wandb_run = "K_phase2_lr2e8_const"
46
+ log_every = 1
47
+ output_dir = "./out/sweep/K_phase2_lr2e8_const"
48
+
49
+ [init]
50
+ zero_layers = []
51
+ target_num_layers = 40
sweep/L_phase2_lr1e8_warmup500.toml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Phase 2: very gentle cosine warmup over 500 steps to avoid any LR shock.
2
+
3
+ [model]
4
+ teacher = "Qwen/Qwen3.5-35B-A3B"
5
+ student = "./out/phase1_best"
6
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
7
+
8
+ [data]
9
+ dataset = "karpathy/climbmix-400b-shuffle"
10
+ text_field = "text"
11
+ min_chars = 2560
12
+ max_seq_len = 2048
13
+ kl_start_pos = 128
14
+ seed = 6767
15
+ shuffle_buffer = 10000
16
+
17
+ [train]
18
+ seed = 6767
19
+ lr = 1.0e-8
20
+ schedule = "cosine"
21
+ warmup_steps = 500
22
+ weight_decay = 0.0
23
+ grad_clip = 1.0
24
+ betas = [0.9, 0.99]
25
+ eps = 1.0e-3
26
+ samples_per_step = 4
27
+ micro_batch_size = 4
28
+ max_steps = 3000
29
+ grad_checkpointing = true
30
+ attn_implementation = "flash_attention_2"
31
+ student_dtype = "bfloat16"
32
+ teacher_dtype = "bfloat16"
33
+ mixed_precision = "bf16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 50
39
+ samples = 500
40
+ seed = 4242
41
+
42
+ [log]
43
+ wandb = true
44
+ wandb_project = "distil-subnet97"
45
+ wandb_run = "L_phase2_lr1e8_warmup500"
46
+ log_every = 1
47
+ output_dir = "./out/sweep/L_phase2_lr1e8_warmup500"
48
+
49
+ [init]
50
+ zero_layers = []
51
+ target_num_layers = 40
sweep/M_phase2_lr2e8_largebatch.toml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Phase 2: same tiny LR but larger inner batch (16/rank → effective 128) so the
2
+ # gradients are much smoother. Should give the smoothest descent of all.
3
+
4
+ [model]
5
+ teacher = "Qwen/Qwen3.5-35B-A3B"
6
+ student = "./out/phase1_best"
7
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 2.0e-8
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.99]
26
+ eps = 1.0e-3
27
+ samples_per_step = 16
28
+ micro_batch_size = 1
29
+ max_steps = 2000
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ mixed_precision = "bf16"
35
+ kl_chunk_size = 256
36
+ new_layer_lr_mul = 1.0
37
+
38
+ [eval]
39
+ every_steps = 50
40
+ samples = 500
41
+ seed = 4242
42
+
43
+ [log]
44
+ wandb = true
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "M_phase2_lr2e8_largebatch"
47
+ log_every = 1
48
+ output_dir = "./out/sweep/M_phase2_lr2e8_largebatch"
49
+
50
+ [init]
51
+ zero_layers = []
52
+ target_num_layers = 40
zero_14_17.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Layer-zero distillation: zero student layers 14-17 at init,
2
+ # constant LR 5e-7, 2000 steps. Aim: lower KL than the prior checkpoint
3
+ # despite the surgery.
4
+
5
+ [model]
6
+ teacher = "Qwen/Qwen3.5-35B-A3B"
7
+ student = "Troiaaa/m-6a3lnzvb"
8
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
9
+
10
+ [data]
11
+ dataset = "karpathy/climbmix-400b-shuffle"
12
+ text_field = "text"
13
+ min_chars = 2560
14
+ max_seq_len = 640
15
+ kl_start_pos = 128
16
+ seed = 42
17
+ shuffle_buffer = 10000
18
+
19
+ [train]
20
+ seed = 42
21
+ lr = 5.0e-7
22
+ schedule = "constant"
23
+ warmup_steps = 0
24
+ weight_decay = 0.0
25
+ grad_clip = 1.0
26
+ betas = [0.9, 0.95]
27
+ eps = 1.0e-8
28
+ samples_per_step = 8
29
+ micro_batch_size = 8
30
+ max_steps = 2000
31
+ grad_checkpointing = true
32
+ attn_implementation = "flash_attention_2"
33
+ student_dtype = "bfloat16"
34
+ teacher_dtype = "bfloat16"
35
+ mixed_precision = "bf16"
36
+ kl_chunk_size = 0
37
+ new_layer_lr_mul = 1.0
38
+
39
+ [eval]
40
+ every_steps = 50
41
+ samples = 64
42
+ seed = 1234
43
+
44
+ [log]
45
+ wandb = true
46
+ wandb_project = "distil-subnet97"
47
+ wandb_run = "m-6a3lnzvb-zero14_17"
48
+ log_every = 1
49
+ output_dir = "./out/zero_14_17"
50
+
51
+ [init]
52
+ zero_layers = [14, 15, 16, 17]
53
+ target_num_layers = 32