Delta-Vector committed on
Commit
2d38ae8
·
verified ·
1 Parent(s): a1e45db

Upload folder using huggingface_hub

Browse files
accelerate.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ distributed_type: MULTI_GPU
3
+ mixed_precision: bf16
4
+ num_processes: 8
5
+ num_machines: 1
6
+ machine_rank: 0
7
+ gpu_ids: all
8
+ rdzv_backend: static
9
+ same_network: true
10
+ tpu_use_cluster: false
11
+ tpu_use_sudo: false
12
+ use_cpu: false
13
+ debug: false
14
+ enable_cpu_affinity: false
15
+ main_training_function: main
16
+ downcast_bf16: 'no'
base.toml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Base distillation config (smoketest variant).
2
+ # Every value the script reads must live in this file - no defaults in code.
3
+
4
+ [model]
5
+ teacher = "Qwen/Qwen3.5-35B-A3B"
6
+ student = "Troiaaa/m-6a3lnzvb"
7
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 640
14
+ kl_start_pos = 128
15
+ seed = 42
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 42
20
+ lr = 5.0e-7
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.95]
26
+ eps = 1.0e-8
27
+ samples_per_step = 4
28
+ micro_batch_size = 4
29
+ max_steps = 5
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ mixed_precision = "bf16"
35
+ kl_chunk_size = 0
36
+ new_layer_lr_mul = 1.0
37
+
38
+ [eval]
39
+ every_steps = 5
40
+ samples = 16
41
+ seed = 1234
42
+
43
+ [log]
44
+ wandb = true
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "smoketest"
47
+ log_every = 1
48
+ output_dir = "./out/smoketest"
49
+
50
+ [init]
51
+ zero_layers = []
52
+ target_num_layers = 32
grow40_simple.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Grow student to 40 layers with the current (bf16, seq=640) hparams.
2
+ # Tests the architectural change in isolation without the winning hparams,
3
+ # so we can attribute any improvement.
4
+
5
+ [model]
6
+ teacher = "Qwen/Qwen3.5-35B-A3B"
7
+ student = "Troiaaa/m-6a3lnzvb"
8
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
9
+
10
+ [data]
11
+ dataset = "karpathy/climbmix-400b-shuffle"
12
+ text_field = "text"
13
+ min_chars = 2560
14
+ max_seq_len = 640
15
+ kl_start_pos = 128
16
+ seed = 42
17
+ shuffle_buffer = 10000
18
+
19
+ [train]
20
+ seed = 42
21
+ lr = 5.0e-7
22
+ schedule = "constant"
23
+ warmup_steps = 0
24
+ weight_decay = 0.0
25
+ grad_clip = 1.0
26
+ betas = [0.9, 0.95]
27
+ eps = 1.0e-8
28
+ samples_per_step = 8
29
+ micro_batch_size = 8
30
+ max_steps = 2000
31
+ grad_checkpointing = true
32
+ attn_implementation = "flash_attention_2"
33
+ student_dtype = "bfloat16"
34
+ teacher_dtype = "bfloat16"
35
+ mixed_precision = "bf16"
36
+ kl_chunk_size = 0
37
+ new_layer_lr_mul = 1.0
38
+
39
+ [eval]
40
+ every_steps = 50
41
+ samples = 64
42
+ seed = 1234
43
+
44
+ [log]
45
+ wandb = true
46
+ wandb_project = "distil-subnet97"
47
+ wandb_run = "grow40_simple"
48
+ log_every = 1
49
+ output_dir = "./out/grow40_simple"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 40
grow40_winning.toml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Grow student to 40 layers AND apply the winning hparams from zero4_long.
2
+ # Note: student is bf16 (not fp32 as in the original winning run) because the
3
+ # fp32 master weights + 40 layers + Adam state + bf16 teacher OOMs on B200
4
+ # without sharding. Everything else matches the winning recipe.
5
+
6
+ [model]
7
+ teacher = "Qwen/Qwen3.5-35B-A3B"
8
+ student = "Troiaaa/m-6a3lnzvb"
9
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
10
+
11
+ [data]
12
+ dataset = "karpathy/climbmix-400b-shuffle"
13
+ text_field = "text"
14
+ min_chars = 2560
15
+ max_seq_len = 2048
16
+ kl_start_pos = 128
17
+ seed = 6767
18
+ shuffle_buffer = 10000
19
+
20
+ [train]
21
+ seed = 6767
22
+ lr = 5.0e-7
23
+ schedule = "cosine"
24
+ warmup_steps = 100
25
+ weight_decay = 0.0
26
+ grad_clip = 1.0
27
+ betas = [0.9, 0.999]
28
+ eps = 1.0e-3
29
+ samples_per_step = 4
30
+ micro_batch_size = 4
31
+ max_steps = 2000
32
+ grad_checkpointing = true
33
+ attn_implementation = "flash_attention_2"
34
+ student_dtype = "bfloat16"
35
+ teacher_dtype = "bfloat16"
36
+ mixed_precision = "bf16"
37
+ kl_chunk_size = 256
38
+ new_layer_lr_mul = 1.0
39
+
40
+ [eval]
41
+ every_steps = 50
42
+ samples = 500
43
+ seed = 4242
44
+
45
+ [log]
46
+ wandb = true
47
+ wandb_project = "distil-subnet97"
48
+ wandb_run = "grow40_winning"
49
+ log_every = 1
50
+ output_dir = "./out/grow40_winning"
51
+
52
+ [init]
53
+ zero_layers = []
54
+ target_num_layers = 40
grow40_winning_v2.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # grow40_winning v2: same hparams, but the scheduler bug is fixed in distill.py
2
+ # (we no longer prepare the scheduler with accelerate, so cosine reaches its
3
+ # minimum at step max_steps instead of step max_steps / num_processes).
4
+
5
+ [model]
6
+ teacher = "Qwen/Qwen3.5-35B-A3B"
7
+ student = "Troiaaa/m-6a3lnzvb"
8
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
9
+
10
+ [data]
11
+ dataset = "karpathy/climbmix-400b-shuffle"
12
+ text_field = "text"
13
+ min_chars = 2560
14
+ max_seq_len = 2048
15
+ kl_start_pos = 128
16
+ seed = 6767
17
+ shuffle_buffer = 10000
18
+
19
+ [train]
20
+ seed = 6767
21
+ lr = 5.0e-7
22
+ schedule = "cosine"
23
+ warmup_steps = 100
24
+ weight_decay = 0.0
25
+ grad_clip = 1.0
26
+ betas = [0.9, 0.999]
27
+ eps = 1.0e-3
28
+ samples_per_step = 4
29
+ micro_batch_size = 4
30
+ max_steps = 2000
31
+ grad_checkpointing = true
32
+ attn_implementation = "flash_attention_2"
33
+ student_dtype = "bfloat16"
34
+ teacher_dtype = "bfloat16"
35
+ mixed_precision = "bf16"
36
+ kl_chunk_size = 256
37
+ new_layer_lr_mul = 1.0
38
+
39
+ [eval]
40
+ every_steps = 50
41
+ samples = 500
42
+ seed = 4242
43
+
44
+ [log]
45
+ wandb = true
46
+ wandb_project = "distil-subnet97"
47
+ wandb_run = "grow40_winning_v2"
48
+ log_every = 1
49
+ output_dir = "./out/grow40_winning_v2"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 40
replicate_zero4.toml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Replicates wandb run "zero4_long" (mepqfry1, eval kl 0.275).
2
+ # Same hparams as that run; same 4-layer zero (14-17). 32-layer student.
3
+
4
+ [model]
5
+ teacher = "Qwen/Qwen3.5-35B-A3B"
6
+ student = "Troiaaa/m-6a3lnzvb"
7
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 5.0e-7
21
+ schedule = "cosine"
22
+ warmup_steps = 100
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.999]
26
+ eps = 1.0e-3
27
+ samples_per_step = 4
28
+ micro_batch_size = 1
29
+ max_steps = 2000
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "float32"
33
+ teacher_dtype = "bfloat16"
34
+ mixed_precision = "bf16"
35
+ kl_chunk_size = 256
36
+ new_layer_lr_mul = 1.0
37
+
38
+ [eval]
39
+ every_steps = 50
40
+ samples = 500
41
+ seed = 4242
42
+
43
+ [log]
44
+ wandb = true
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "replicate_zero4"
47
+ log_every = 1
48
+ output_dir = "./out/replicate_zero4"
49
+
50
+ [init]
51
+ zero_layers = [14, 15, 16, 17]
52
+ target_num_layers = 32
sn97_eval_long.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "../models/sn97-text"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 1.0e-7
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.999]
26
+ eps = 1.0e-3
27
+ samples_per_step = 4
28
+ micro_batch_size = 1
29
+ max_steps = 1
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 1
39
+ samples = 16
40
+ seed = 4242
41
+ cache_path = "./cache/sn97_eval_long_16.pt"
42
+
43
+ [log]
44
+ wandb = false
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "sn97_eval_long"
47
+ log_every = 1
48
+ output_dir = "./out/sn97_eval_long"
49
+ experiment_log = "./out/experiments.jsonl"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 32
sn97_eval_long_alt.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "../models/sn97-text"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 1.0e-7
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.999]
26
+ eps = 1.0e-3
27
+ samples_per_step = 4
28
+ micro_batch_size = 1
29
+ max_steps = 1
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 1
39
+ samples = 32
40
+ seed = 9917
41
+ cache_path = "./cache/sn97_eval_long_alt_32.pt"
42
+
43
+ [log]
44
+ wandb = false
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "sn97_eval_long_alt"
47
+ log_every = 1
48
+ output_dir = "./out/sn97_eval_long_alt"
49
+ experiment_log = "./out/experiments.jsonl"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 32
sn97_mm_refine.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "../models/sn97-xxxn"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 768
13
+ max_seq_len = 768
14
+ kl_start_pos = 64
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 5.0e-7
21
+ schedule = "cosine"
22
+ warmup_steps = 20
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.98]
26
+ eps = 1.0e-6
27
+ samples_per_step = 2
28
+ micro_batch_size = 1
29
+ max_steps = 20
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 128
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 5
39
+ samples = 32
40
+ seed = 4242
41
+ cache_path = "./cache/sn97_eval_32.pt"
42
+
43
+ [log]
44
+ wandb = false
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "sn97_mm_refine"
47
+ log_every = 1
48
+ output_dir = "./out/sn97_mm_refine"
49
+ experiment_log = "./out/experiments.jsonl"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 32
sn97_mm_top8.toml ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "../models/sn97-xxxn"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 768
13
+ max_seq_len = 768
14
+ kl_start_pos = 64
15
+ seed = 7777
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 7777
20
+ lr = 3.0e-7
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.99]
26
+ eps = 1.0e-3
27
+ samples_per_step = 2
28
+ micro_batch_size = 1
29
+ max_steps = 20
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 128
35
+ new_layer_lr_mul = 1.0
36
+ trainable_patterns = [
37
+ "^language_model\\.model\\.layers\\.(24|25|26|27|28|29|30|31)\\.",
38
+ "^language_model\\.lm_head",
39
+ "^language_model\\.model\\.norm"
40
+ ]
41
+
42
+ [eval]
43
+ every_steps = 5
44
+ samples = 32
45
+ seed = 4242
46
+ cache_path = "./cache/sn97_eval_32.pt"
47
+
48
+ [log]
49
+ wandb = false
50
+ wandb_project = "distil-subnet97"
51
+ wandb_run = "sn97_mm_top8"
52
+ log_every = 1
53
+ output_dir = "./out/sn97_mm_top8"
54
+ experiment_log = "./out/experiments.jsonl"
55
+
56
+ [init]
57
+ zero_layers = []
58
+ target_num_layers = 32
sn97_smoke.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "../models/sn97-xxxn"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 512
13
+ max_seq_len = 512
14
+ kl_start_pos = 64
15
+ seed = 1234
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 1234
20
+ lr = 1.0e-7
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.95]
26
+ eps = 1.0e-8
27
+ samples_per_step = 1
28
+ micro_batch_size = 1
29
+ max_steps = 2
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 128
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 1
39
+ samples = 4
40
+ seed = 4242
41
+ cache_path = "./cache/sn97_smoke_eval.pt"
42
+
43
+ [log]
44
+ wandb = false
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "sn97_smoke"
47
+ log_every = 1
48
+ output_dir = "./out/sn97_smoke"
49
+ experiment_log = "./out/experiments.jsonl"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 32
sn97_text32_long_phase1.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "../models/sn97-text"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 1.0e-7
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.999]
26
+ eps = 1.0e-3
27
+ samples_per_step = 4
28
+ micro_batch_size = 1
29
+ max_steps = 30
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 10
39
+ samples = 16
40
+ seed = 4242
41
+ cache_path = "./cache/sn97_eval_long_16.pt"
42
+
43
+ [log]
44
+ wandb = false
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "sn97_text32_long_phase1"
47
+ log_every = 1
48
+ output_dir = "./out/sn97_text32_long_phase1"
49
+ experiment_log = "./out/experiments.jsonl"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 32
sn97_text32_long_phase2.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "../models/sn97-text"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 2.0e-8
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.99]
26
+ eps = 1.0e-3
27
+ samples_per_step = 16
28
+ micro_batch_size = 1
29
+ max_steps = 40
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 10
39
+ samples = 16
40
+ seed = 4242
41
+ cache_path = "./cache/sn97_eval_long_16.pt"
42
+
43
+ [log]
44
+ wandb = false
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "sn97_text32_long_phase2"
47
+ log_every = 1
48
+ output_dir = "./out/sn97_text32_long_phase2"
49
+ experiment_log = "./out/experiments.jsonl"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 32
sn97_text32_long_phase2b.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "./out/sn97_text32_long_phase2/best"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 2.0e-8
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.99]
26
+ eps = 1.0e-3
27
+ samples_per_step = 16
28
+ micro_batch_size = 1
29
+ max_steps = 40
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 10
39
+ samples = 16
40
+ seed = 4242
41
+ cache_path = "./cache/sn97_eval_long_16.pt"
42
+
43
+ [log]
44
+ wandb = false
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "sn97_text32_long_phase2b"
47
+ log_every = 1
48
+ output_dir = "./out/sn97_text32_long_phase2b"
49
+ experiment_log = "./out/experiments.jsonl"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 32
sn97_text32_long_phase3.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "./out/sn97_text32_long_phase2/best"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 5.0e-9
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.99]
26
+ eps = 1.0e-2
27
+ samples_per_step = 16
28
+ micro_batch_size = 1
29
+ max_steps = 40
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 10
39
+ samples = 16
40
+ seed = 4242
41
+ cache_path = "./cache/sn97_eval_long_16.pt"
42
+
43
+ [log]
44
+ wandb = false
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "sn97_text32_long_phase3"
47
+ log_every = 1
48
+ output_dir = "./out/sn97_text32_long_phase3"
49
+ experiment_log = "./out/experiments.jsonl"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 32
sn97_text32_long_phase4.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "./out/sn97_text32_long_phase3/best"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 5.0e-9
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.99]
26
+ eps = 1.0e-2
27
+ samples_per_step = 16
28
+ micro_batch_size = 1
29
+ max_steps = 20
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 10
39
+ samples = 16
40
+ seed = 4242
41
+ cache_path = "./cache/sn97_eval_long_16.pt"
42
+
43
+ [log]
44
+ wandb = false
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "sn97_text32_long_phase4"
47
+ log_every = 1
48
+ output_dir = "./out/sn97_text32_long_phase4"
49
+ experiment_log = "./out/experiments.jsonl"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 32
sn97_text32_long_phase5.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "./out/sn97_text32_long_phase3/best"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 2.0e-9
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.99]
26
+ eps = 1.0e-2
27
+ samples_per_step = 16
28
+ micro_batch_size = 1
29
+ max_steps = 20
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 10
39
+ samples = 16
40
+ seed = 4242
41
+ cache_path = "./cache/sn97_eval_long_16.pt"
42
+
43
+ [log]
44
+ wandb = false
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "sn97_text32_long_phase5"
47
+ log_every = 1
48
+ output_dir = "./out/sn97_text32_long_phase5"
49
+ experiment_log = "./out/experiments.jsonl"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 32
sn97_text35_full.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "./out/sn97_text35_warm/best"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 768
13
+ max_seq_len = 1024
14
+ kl_start_pos = 32
15
+ seed = 6868
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6868
20
+ lr = 2.0e-8
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.99]
26
+ eps = 1.0e-3
27
+ samples_per_step = 4
28
+ micro_batch_size = 1
29
+ max_steps = 80
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 128
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 10
39
+ samples = 32
40
+ seed = 4242
41
+ cache_path = "./cache/sn97_eval_32.pt"
42
+
43
+ [log]
44
+ wandb = false
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "sn97_text35_full"
47
+ log_every = 1
48
+ output_dir = "./out/sn97_text35_full"
49
+ experiment_log = "./out/experiments.jsonl"
50
+
51
+ [init]
52
+ zero_layers = []
53
+ target_num_layers = 35
sn97_text35_warm.toml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ teacher = "../models/Qwen3.5-35B-A3B"
3
+ student = "../models/sn97-text"
4
+ tokenizer = "../models/Qwen3.5-35B-A3B"
5
+ student_device = "cuda:7"
6
+ teacher_devices = [0, 1, 2, 3, 4, 5]
7
+ teacher_max_memory_gb = 70
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 768
13
+ max_seq_len = 768
14
+ kl_start_pos = 64
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 1.0e-7
21
+ schedule = "cosine"
22
+ warmup_steps = 20
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.999]
26
+ eps = 1.0e-3
27
+ samples_per_step = 2
28
+ micro_batch_size = 1
29
+ max_steps = 20
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ kl_chunk_size = 128
35
+ new_layer_lr_mul = 5.0
36
+ trainable_patterns = ["^model\\.layers\\.(32|33|34)\\.", "^lm_head", "^model\\.norm"]
37
+
38
+ [eval]
39
+ every_steps = 5
40
+ samples = 32
41
+ seed = 4242
42
+ cache_path = "./cache/sn97_eval_32.pt"
43
+
44
+ [log]
45
+ wandb = false
46
+ wandb_project = "distil-subnet97"
47
+ wandb_run = "sn97_text35_warm"
48
+ log_every = 1
49
+ output_dir = "./out/sn97_text35_warm"
50
+ experiment_log = "./out/experiments.jsonl"
51
+
52
+ [init]
53
+ zero_layers = []
54
+ target_num_layers = 35
sweep/A_resume_lr1e7_cos.toml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Resume from grow40_winning best (eval kl 0.2219). Lower peak LR to avoid the
2
+ # overshoot we saw at 5e-7. Cosine warmup 100, 1500 steps.
3
+
4
+ [model]
5
+ teacher = "Qwen/Qwen3.5-35B-A3B"
6
+ student = "./out/grow40_winning/best"
7
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 1.0e-7
21
+ schedule = "cosine"
22
+ warmup_steps = 100
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.999]
26
+ eps = 1.0e-3
27
+ samples_per_step = 4
28
+ micro_batch_size = 4
29
+ max_steps = 1500
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ mixed_precision = "bf16"
35
+ kl_chunk_size = 256
36
+ new_layer_lr_mul = 1.0
37
+
38
+ [eval]
39
+ every_steps = 50
40
+ samples = 500
41
+ seed = 4242
42
+
43
+ [log]
44
+ wandb = true
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "A_resume_lr1e7_cos"
47
+ log_every = 1
48
+ output_dir = "./out/sweep/A_resume_lr1e7_cos"
49
+
50
+ [init]
51
+ zero_layers = []
52
+ target_num_layers = 40
sweep/B_resume_lr5e8_cos.toml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Resume from grow40_winning best (eval kl 0.2219). Even lower peak LR.
2
+
3
+ [model]
4
+ teacher = "Qwen/Qwen3.5-35B-A3B"
5
+ student = "./out/grow40_winning/best"
6
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
7
+
8
+ [data]
9
+ dataset = "karpathy/climbmix-400b-shuffle"
10
+ text_field = "text"
11
+ min_chars = 2560
12
+ max_seq_len = 2048
13
+ kl_start_pos = 128
14
+ seed = 6767
15
+ shuffle_buffer = 10000
16
+
17
+ [train]
18
+ seed = 6767
19
+ lr = 5.0e-8
20
+ schedule = "cosine"
21
+ warmup_steps = 100
22
+ weight_decay = 0.0
23
+ grad_clip = 1.0
24
+ betas = [0.9, 0.999]
25
+ eps = 1.0e-3
26
+ samples_per_step = 4
27
+ micro_batch_size = 4
28
+ max_steps = 1500
29
+ grad_checkpointing = true
30
+ attn_implementation = "flash_attention_2"
31
+ student_dtype = "bfloat16"
32
+ teacher_dtype = "bfloat16"
33
+ mixed_precision = "bf16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 50
39
+ samples = 500
40
+ seed = 4242
41
+
42
+ [log]
43
+ wandb = true
44
+ wandb_project = "distil-subnet97"
45
+ wandb_run = "B_resume_lr5e8_cos"
46
+ log_every = 1
47
+ output_dir = "./out/sweep/B_resume_lr5e8_cos"
48
+
49
+ [init]
50
+ zero_layers = []
51
+ target_num_layers = 40
sweep/C_resume_lr2e8_cos.toml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Resume from grow40_winning best. Very small LR - basically a fine-tune.
2
+
3
+ [model]
4
+ teacher = "Qwen/Qwen3.5-35B-A3B"
5
+ student = "./out/grow40_winning/best"
6
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
7
+
8
+ [data]
9
+ dataset = "karpathy/climbmix-400b-shuffle"
10
+ text_field = "text"
11
+ min_chars = 2560
12
+ max_seq_len = 2048
13
+ kl_start_pos = 128
14
+ seed = 6767
15
+ shuffle_buffer = 10000
16
+
17
+ [train]
18
+ seed = 6767
19
+ lr = 2.0e-8
20
+ schedule = "cosine"
21
+ warmup_steps = 100
22
+ weight_decay = 0.0
23
+ grad_clip = 1.0
24
+ betas = [0.9, 0.999]
25
+ eps = 1.0e-3
26
+ samples_per_step = 4
27
+ micro_batch_size = 4
28
+ max_steps = 1500
29
+ grad_checkpointing = true
30
+ attn_implementation = "flash_attention_2"
31
+ student_dtype = "bfloat16"
32
+ teacher_dtype = "bfloat16"
33
+ mixed_precision = "bf16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 50
39
+ samples = 500
40
+ seed = 4242
41
+
42
+ [log]
43
+ wandb = true
44
+ wandb_project = "distil-subnet97"
45
+ wandb_run = "C_resume_lr2e8_cos"
46
+ log_every = 1
47
+ output_dir = "./out/sweep/C_resume_lr2e8_cos"
48
+
49
+ [init]
50
+ zero_layers = []
51
+ target_num_layers = 40
sweep/D_resume_lr1e7_const.toml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Resume from grow40_winning best. Constant LR (no schedule overshoot at all).
2
+
3
+ [model]
4
+ teacher = "Qwen/Qwen3.5-35B-A3B"
5
+ student = "./out/grow40_winning/best"
6
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
7
+
8
+ [data]
9
+ dataset = "karpathy/climbmix-400b-shuffle"
10
+ text_field = "text"
11
+ min_chars = 2560
12
+ max_seq_len = 2048
13
+ kl_start_pos = 128
14
+ seed = 6767
15
+ shuffle_buffer = 10000
16
+
17
+ [train]
18
+ seed = 6767
19
+ lr = 1.0e-7
20
+ schedule = "constant"
21
+ warmup_steps = 0
22
+ weight_decay = 0.0
23
+ grad_clip = 1.0
24
+ betas = [0.9, 0.999]
25
+ eps = 1.0e-3
26
+ samples_per_step = 4
27
+ micro_batch_size = 4
28
+ max_steps = 1500
29
+ grad_checkpointing = true
30
+ attn_implementation = "flash_attention_2"
31
+ student_dtype = "bfloat16"
32
+ teacher_dtype = "bfloat16"
33
+ mixed_precision = "bf16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 50
39
+ samples = 500
40
+ seed = 4242
41
+
42
+ [log]
43
+ wandb = true
44
+ wandb_project = "distil-subnet97"
45
+ wandb_run = "D_resume_lr1e7_const"
46
+ log_every = 1
47
+ output_dir = "./out/sweep/D_resume_lr1e7_const"
48
+
49
+ [init]
50
+ zero_layers = []
51
+ target_num_layers = 40
sweep/E_resume_lr5e8_b95.toml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Resume from grow40_winning best. Smaller second-moment memory (beta2=0.95)
2
+ # so Adam stabilizes faster. Same low LR.
3
+
4
+ [model]
5
+ teacher = "Qwen/Qwen3.5-35B-A3B"
6
+ student = "./out/grow40_winning/best"
7
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 5.0e-8
21
+ schedule = "cosine"
22
+ warmup_steps = 100
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.95]
26
+ eps = 1.0e-8
27
+ samples_per_step = 4
28
+ micro_batch_size = 4
29
+ max_steps = 1500
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ mixed_precision = "bf16"
35
+ kl_chunk_size = 256
36
+ new_layer_lr_mul = 1.0
37
+
38
+ [eval]
39
+ every_steps = 50
40
+ samples = 500
41
+ seed = 4242
42
+
43
+ [log]
44
+ wandb = true
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "E_resume_lr5e8_b95"
47
+ log_every = 1
48
+ output_dir = "./out/sweep/E_resume_lr5e8_b95"
49
+
50
+ [init]
51
+ zero_layers = []
52
+ target_num_layers = 40
sweep/F_cold_lr1e7_grow40.toml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Cold start, 40 layers, lower peak LR than the original winning recipe.
2
+
3
+ [model]
4
+ teacher = "Qwen/Qwen3.5-35B-A3B"
5
+ student = "Troiaaa/m-6a3lnzvb"
6
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
7
+
8
+ [data]
9
+ dataset = "karpathy/climbmix-400b-shuffle"
10
+ text_field = "text"
11
+ min_chars = 2560
12
+ max_seq_len = 2048
13
+ kl_start_pos = 128
14
+ seed = 6767
15
+ shuffle_buffer = 10000
16
+
17
+ [train]
18
+ seed = 6767
19
+ lr = 1.0e-7
20
+ schedule = "cosine"
21
+ warmup_steps = 100
22
+ weight_decay = 0.0
23
+ grad_clip = 1.0
24
+ betas = [0.9, 0.999]
25
+ eps = 1.0e-3
26
+ samples_per_step = 4
27
+ micro_batch_size = 4
28
+ max_steps = 2000
29
+ grad_checkpointing = true
30
+ attn_implementation = "flash_attention_2"
31
+ student_dtype = "bfloat16"
32
+ teacher_dtype = "bfloat16"
33
+ mixed_precision = "bf16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 50
39
+ samples = 500
40
+ seed = 4242
41
+
42
+ [log]
43
+ wandb = true
44
+ wandb_project = "distil-subnet97"
45
+ wandb_run = "F_cold_lr1e7_grow40"
46
+ log_every = 1
47
+ output_dir = "./out/sweep/F_cold_lr1e7_grow40"
48
+
49
+ [init]
50
+ zero_layers = []
51
+ target_num_layers = 40
sweep/G_cold_lr2e7_grow40.toml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Cold start, 40 layers, lr=2e-7 (between 1e-7 and the failing 5e-7).
2
+
3
+ [model]
4
+ teacher = "Qwen/Qwen3.5-35B-A3B"
5
+ student = "Troiaaa/m-6a3lnzvb"
6
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
7
+
8
+ [data]
9
+ dataset = "karpathy/climbmix-400b-shuffle"
10
+ text_field = "text"
11
+ min_chars = 2560
12
+ max_seq_len = 2048
13
+ kl_start_pos = 128
14
+ seed = 6767
15
+ shuffle_buffer = 10000
16
+
17
+ [train]
18
+ seed = 6767
19
+ lr = 2.0e-7
20
+ schedule = "cosine"
21
+ warmup_steps = 100
22
+ weight_decay = 0.0
23
+ grad_clip = 1.0
24
+ betas = [0.9, 0.999]
25
+ eps = 1.0e-3
26
+ samples_per_step = 4
27
+ micro_batch_size = 4
28
+ max_steps = 2000
29
+ grad_checkpointing = true
30
+ attn_implementation = "flash_attention_2"
31
+ student_dtype = "bfloat16"
32
+ teacher_dtype = "bfloat16"
33
+ mixed_precision = "bf16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 50
39
+ samples = 500
40
+ seed = 4242
41
+
42
+ [log]
43
+ wandb = true
44
+ wandb_project = "distil-subnet97"
45
+ wandb_run = "G_cold_lr2e7_grow40"
46
+ log_every = 1
47
+ output_dir = "./out/sweep/G_cold_lr2e7_grow40"
48
+
49
+ [init]
50
+ zero_layers = []
51
+ target_num_layers = 40
sweep/H_cold_lr1e7_32L.toml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Cold start, 32 layers (NO grow), lower LR. Tests whether the +8 layers were
2
+ # helping at all once we use the right LR.
3
+
4
+ [model]
5
+ teacher = "Qwen/Qwen3.5-35B-A3B"
6
+ student = "Troiaaa/m-6a3lnzvb"
7
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 1.0e-7
21
+ schedule = "cosine"
22
+ warmup_steps = 100
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.999]
26
+ eps = 1.0e-3
27
+ samples_per_step = 4
28
+ micro_batch_size = 4
29
+ max_steps = 2000
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ mixed_precision = "bf16"
35
+ kl_chunk_size = 256
36
+ new_layer_lr_mul = 1.0
37
+
38
+ [eval]
39
+ every_steps = 50
40
+ samples = 500
41
+ seed = 4242
42
+
43
+ [log]
44
+ wandb = true
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "H_cold_lr1e7_32L"
47
+ log_every = 1
48
+ output_dir = "./out/sweep/H_cold_lr1e7_32L"
49
+
50
+ [init]
51
+ zero_layers = []
52
+ target_num_layers = 32
sweep/I_cold_paramgroups_grow40.toml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Cold start, 40 layers, low LR for original layers + 5x for the new ones.
2
+ # Lets the new layers wake up faster without disturbing the trained layers.
3
+
4
+ [model]
5
+ teacher = "Qwen/Qwen3.5-35B-A3B"
6
+ student = "Troiaaa/m-6a3lnzvb"
7
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 1.0e-7
21
+ schedule = "cosine"
22
+ warmup_steps = 100
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.999]
26
+ eps = 1.0e-3
27
+ samples_per_step = 4
28
+ micro_batch_size = 4
29
+ max_steps = 2000
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ mixed_precision = "bf16"
35
+ kl_chunk_size = 256
36
+ new_layer_lr_mul = 5.0
37
+
38
+ [eval]
39
+ every_steps = 50
40
+ samples = 500
41
+ seed = 4242
42
+
43
+ [log]
44
+ wandb = true
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "I_cold_paramgroups_grow40"
47
+ log_every = 1
48
+ output_dir = "./out/sweep/I_cold_paramgroups_grow40"
49
+
50
+ [init]
51
+ zero_layers = []
52
+ target_num_layers = 40
sweep/J_phase2_lr5e9_const.toml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Phase 2: ultra-conservative resume from phase 1 best.
2
+ # Tiny LR, constant, zero warmup, very high beta2 for max smoothing.
3
+
4
+ [model]
5
+ teacher = "Qwen/Qwen3.5-35B-A3B"
6
+ student = "./out/phase1_best"
7
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 5.0e-9
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.99]
26
+ eps = 1.0e-2
27
+ samples_per_step = 4
28
+ micro_batch_size = 4
29
+ max_steps = 3000
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ mixed_precision = "bf16"
35
+ kl_chunk_size = 256
36
+ new_layer_lr_mul = 1.0
37
+
38
+ [eval]
39
+ every_steps = 50
40
+ samples = 500
41
+ seed = 4242
42
+
43
+ [log]
44
+ wandb = true
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "J_phase2_lr5e9_const"
47
+ log_every = 1
48
+ output_dir = "./out/sweep/J_phase2_lr5e9_const"
49
+
50
+ [init]
51
+ zero_layers = []
52
+ target_num_layers = 40
sweep/K_phase2_lr2e8_const.toml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Phase 2: still conservative but a bit more LR than J.
2
+
3
+ [model]
4
+ teacher = "Qwen/Qwen3.5-35B-A3B"
5
+ student = "./out/phase1_best"
6
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
7
+
8
+ [data]
9
+ dataset = "karpathy/climbmix-400b-shuffle"
10
+ text_field = "text"
11
+ min_chars = 2560
12
+ max_seq_len = 2048
13
+ kl_start_pos = 128
14
+ seed = 6767
15
+ shuffle_buffer = 10000
16
+
17
+ [train]
18
+ seed = 6767
19
+ lr = 2.0e-8
20
+ schedule = "constant"
21
+ warmup_steps = 0
22
+ weight_decay = 0.0
23
+ grad_clip = 1.0
24
+ betas = [0.9, 0.99]
25
+ eps = 1.0e-3
26
+ samples_per_step = 4
27
+ micro_batch_size = 4
28
+ max_steps = 3000
29
+ grad_checkpointing = true
30
+ attn_implementation = "flash_attention_2"
31
+ student_dtype = "bfloat16"
32
+ teacher_dtype = "bfloat16"
33
+ mixed_precision = "bf16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 50
39
+ samples = 500
40
+ seed = 4242
41
+
42
+ [log]
43
+ wandb = true
44
+ wandb_project = "distil-subnet97"
45
+ wandb_run = "K_phase2_lr2e8_const"
46
+ log_every = 1
47
+ output_dir = "./out/sweep/K_phase2_lr2e8_const"
48
+
49
+ [init]
50
+ zero_layers = []
51
+ target_num_layers = 40
sweep/L_phase2_lr1e8_warmup500.toml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Phase 2: very gentle cosine warmup over 500 steps to avoid any LR shock.
2
+
3
+ [model]
4
+ teacher = "Qwen/Qwen3.5-35B-A3B"
5
+ student = "./out/phase1_best"
6
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
7
+
8
+ [data]
9
+ dataset = "karpathy/climbmix-400b-shuffle"
10
+ text_field = "text"
11
+ min_chars = 2560
12
+ max_seq_len = 2048
13
+ kl_start_pos = 128
14
+ seed = 6767
15
+ shuffle_buffer = 10000
16
+
17
+ [train]
18
+ seed = 6767
19
+ lr = 1.0e-8
20
+ schedule = "cosine"
21
+ warmup_steps = 500
22
+ weight_decay = 0.0
23
+ grad_clip = 1.0
24
+ betas = [0.9, 0.99]
25
+ eps = 1.0e-3
26
+ samples_per_step = 4
27
+ micro_batch_size = 4
28
+ max_steps = 3000
29
+ grad_checkpointing = true
30
+ attn_implementation = "flash_attention_2"
31
+ student_dtype = "bfloat16"
32
+ teacher_dtype = "bfloat16"
33
+ mixed_precision = "bf16"
34
+ kl_chunk_size = 256
35
+ new_layer_lr_mul = 1.0
36
+
37
+ [eval]
38
+ every_steps = 50
39
+ samples = 500
40
+ seed = 4242
41
+
42
+ [log]
43
+ wandb = true
44
+ wandb_project = "distil-subnet97"
45
+ wandb_run = "L_phase2_lr1e8_warmup500"
46
+ log_every = 1
47
+ output_dir = "./out/sweep/L_phase2_lr1e8_warmup500"
48
+
49
+ [init]
50
+ zero_layers = []
51
+ target_num_layers = 40
sweep/M_phase2_lr2e8_largebatch.toml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Phase 2: same tiny LR but larger inner batch (16/rank → effective 128) so the
2
+ # gradients are much smoother. Should give the smoothest descent of all.
3
+
4
+ [model]
5
+ teacher = "Qwen/Qwen3.5-35B-A3B"
6
+ student = "./out/phase1_best"
7
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 2.0e-8
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.99]
26
+ eps = 1.0e-3
27
+ samples_per_step = 16
28
+ micro_batch_size = 1
29
+ max_steps = 2000
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ mixed_precision = "bf16"
35
+ kl_chunk_size = 256
36
+ new_layer_lr_mul = 1.0
37
+
38
+ [eval]
39
+ every_steps = 50
40
+ samples = 500
41
+ seed = 4242
42
+
43
+ [log]
44
+ wandb = true
45
+ wandb_project = "distil-subnet97"
46
+ wandb_run = "M_phase2_lr2e8_largebatch"
47
+ log_every = 1
48
+ output_dir = "./out/sweep/M_phase2_lr2e8_largebatch"
49
+
50
+ [init]
51
+ zero_layers = []
52
+ target_num_layers = 40
zero_14_17.toml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Layer-zero distillation: zero student layers 14-17 at init,
2
+ # constant LR 5e-7, 2000 steps. Aim: lower KL than the prior checkpoint
3
+ # despite the surgery.
4
+
5
+ [model]
6
+ teacher = "Qwen/Qwen3.5-35B-A3B"
7
+ student = "Troiaaa/m-6a3lnzvb"
8
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
9
+
10
+ [data]
11
+ dataset = "karpathy/climbmix-400b-shuffle"
12
+ text_field = "text"
13
+ min_chars = 2560
14
+ max_seq_len = 640
15
+ kl_start_pos = 128
16
+ seed = 42
17
+ shuffle_buffer = 10000
18
+
19
+ [train]
20
+ seed = 42
21
+ lr = 5.0e-7
22
+ schedule = "constant"
23
+ warmup_steps = 0
24
+ weight_decay = 0.0
25
+ grad_clip = 1.0
26
+ betas = [0.9, 0.95]
27
+ eps = 1.0e-8
28
+ samples_per_step = 8
29
+ micro_batch_size = 8
30
+ max_steps = 2000
31
+ grad_checkpointing = true
32
+ attn_implementation = "flash_attention_2"
33
+ student_dtype = "bfloat16"
34
+ teacher_dtype = "bfloat16"
35
+ mixed_precision = "bf16"
36
+ kl_chunk_size = 0
37
+ new_layer_lr_mul = 1.0
38
+
39
+ [eval]
40
+ every_steps = 50
41
+ samples = 64
42
+ seed = 1234
43
+
44
+ [log]
45
+ wandb = true
46
+ wandb_project = "distil-subnet97"
47
+ wandb_run = "m-6a3lnzvb-zero14_17"
48
+ log_every = 1
49
+ output_dir = "./out/zero_14_17"
50
+
51
+ [init]
52
+ zero_layers = [14, 15, 16, 17]
53
+ target_num_layers = 32