davidquarel committed
Commit cbebb94 · verified · 1 Parent(s): 14801df

Upload folder using huggingface_hub

debug.log ADDED
@@ -0,0 +1,5 @@
+ name jln.mlpblock.gpt2-sparse-1.0e-01 | device cuda | compile True | data_dir data/fineweb_edu_10b | should_randomize True | log_interval 10 | eval_interval 250 | eval_steps 100 | batch_size 2 | gradient_accumulation_steps 16 | learning_rate 0.0005 | warmup_steps 750 | max_steps 5000 | decay_lr True | min_lr 0.0001 | weight_decay 0.1 | grad_clip 1.0 | sae_config {'name': 'jln.mlpblock.gpt2', 'device': device(type='cuda'), 'compile': True, 'gpt_config': {'name': 'gpt2', 'device': device(type='cuda'), 'compile': True, 'block_size': 1024, 'vocab_size': 50257, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'norm_strategy': <NormalizationStrategy.LAYER_NORM: 'LayerNorm'>, 'alpha_attn': 2.0, 'alpha_mlp': 2.0}, 'n_features': (24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576), 'sae_variant': <SAEVariant.JSAE_BLOCK: 'jsae_block'>, 'top_k': (32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32), 'sae_keys': ('0_residmid', '0_residpost', '1_residmid', '1_residpost', '2_residmid', '2_residpost', '3_residmid', '3_residpost', '4_residmid', '4_residpost', '5_residmid', '5_residpost', '6_residmid', '6_residpost', '7_residmid', '7_residpost', '8_residmid', '8_residpost', '9_residmid', '9_residpost', '10_residmid', '10_residpost', '11_residmid', '11_residpost')} | trainable_layers None | loss_coefficients {'sparsity': (0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1), 'regularization': None, 'downstream': None, 'bandwidth': None}
+ name jln.mlpblock.gpt2-sparse-1.0e-01 | device cuda | compile True | data_dir data/fineweb_edu_10b | should_randomize True | log_interval 10 | eval_interval 250 | eval_steps 100 | batch_size 2 | gradient_accumulation_steps 16 | learning_rate 0.0005 | warmup_steps 750 | max_steps 5000 | decay_lr True | min_lr 0.0001 | weight_decay 0.1 | grad_clip 1.0 | sae_config {'name': 'jln.mlpblock.gpt2', 'device': device(type='cuda'), 'compile': True, 'gpt_config': {'name': 'gpt2', 'device': device(type='cuda'), 'compile': True, 'block_size': 1024, 'vocab_size': 50257, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'norm_strategy': <NormalizationStrategy.LAYER_NORM: 'LayerNorm'>, 'alpha_attn': 2.0, 'alpha_mlp': 2.0}, 'n_features': (24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576), 'sae_variant': <SAEVariant.JSAE_BLOCK: 'jsae_block'>, 'top_k': (32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32), 'sae_keys': ('0_residmid', '0_residpost', '1_residmid', '1_residpost', '2_residmid', '2_residpost', '3_residmid', '3_residpost', '4_residmid', '4_residpost', '5_residmid', '5_residpost', '6_residmid', '6_residpost', '7_residmid', '7_residpost', '8_residmid', '8_residpost', '9_residmid', '9_residpost', '10_residmid', '10_residpost', '11_residmid', '11_residpost')} | trainable_layers None | loss_coefficients {'sparsity': (0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1), 'regularization': None, 'downstream': None, 'bandwidth': None}
+ name jln.mlpblock.gpt2-sparse-1.0e-01 | device cuda | compile False | data_dir data/fineweb_edu_10b | should_randomize True | log_interval 1 | eval_interval 1 | eval_steps 100 | batch_size 2 | gradient_accumulation_steps 16 | learning_rate 0.0005 | warmup_steps 750 | max_steps 5000 | decay_lr True | min_lr 0.0001 | weight_decay 0.1 | grad_clip 1.0 | sae_config {'name': 'jln.mlpblock.gpt2', 'device': device(type='cuda'), 'compile': True, 'gpt_config': {'name': 'gpt2', 'device': device(type='cuda'), 'compile': True, 'block_size': 1024, 'vocab_size': 50257, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'norm_strategy': <NormalizationStrategy.LAYER_NORM: 'LayerNorm'>, 'alpha_attn': 2.0, 'alpha_mlp': 2.0}, 'n_features': (24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576), 'sae_variant': <SAEVariant.JSAE_BLOCK: 'jsae_block'>, 'top_k': (32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32), 'sae_keys': ('0_residmid', '0_residpost', '1_residmid', '1_residpost', '2_residmid', '2_residpost', '3_residmid', '3_residpost', '4_residmid', '4_residpost', '5_residmid', '5_residpost', '6_residmid', '6_residpost', '7_residmid', '7_residpost', '8_residmid', '8_residpost', '9_residmid', '9_residpost', '10_residmid', '10_residpost', '11_residmid', '11_residpost')} | trainable_layers None | loss_coefficients {'sparsity': (0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1), 'regularization': None, 'downstream': None, 'bandwidth': None}
+ name jln.mlpblock.gpt2-sparse-1.0e-01 | device cuda | compile False | data_dir data/fineweb_edu_10b | should_randomize True | log_interval 1 | eval_interval 1 | eval_steps 100 | batch_size 4 | gradient_accumulation_steps 16 | learning_rate 0.0005 | warmup_steps 750 | max_steps 5000 | decay_lr True | min_lr 0.0001 | weight_decay 0.1 | grad_clip 1.0 | sae_config {'name': 'jln.mlpblock.gpt2', 'device': device(type='cuda'), 'compile': True, 'gpt_config': {'name': 'gpt2', 'device': device(type='cuda'), 'compile': True, 'block_size': 1024, 'vocab_size': 50257, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'norm_strategy': <NormalizationStrategy.LAYER_NORM: 'LayerNorm'>, 'alpha_attn': 2.0, 'alpha_mlp': 2.0}, 'n_features': (24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576), 'sae_variant': <SAEVariant.JSAE_BLOCK: 'jsae_block'>, 'top_k': (32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32), 'sae_keys': ('0_residmid', '0_residpost', '1_residmid', '1_residpost', '2_residmid', '2_residpost', '3_residmid', '3_residpost', '4_residmid', '4_residpost', '5_residmid', '5_residpost', '6_residmid', '6_residpost', '7_residmid', '7_residpost', '8_residmid', '8_residpost', '9_residmid', '9_residpost', '10_residmid', '10_residpost', '11_residmid', '11_residpost')} | trainable_layers None | loss_coefficients {'sparsity': (0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1), 'regularization': None, 'downstream': None, 'bandwidth': None}
+ name jln.mlpblock.gpt2-sparse-1.0e-01 | device cuda | compile True | data_dir data/fineweb_edu_10b | should_randomize True | log_interval 1 | eval_interval 250 | eval_steps 100 | batch_size 4 | gradient_accumulation_steps 8 | learning_rate 0.0005 | warmup_steps 750 | max_steps 5000 | decay_lr True | min_lr 0.0001 | weight_decay 0.1 | grad_clip 1.0 | sae_config {'name': 'jln.mlpblock.gpt2', 'device': device(type='cuda'), 'compile': True, 'gpt_config': {'name': 'gpt2', 'device': device(type='cuda'), 'compile': True, 'block_size': 1024, 'vocab_size': 50257, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'norm_strategy': <NormalizationStrategy.LAYER_NORM: 'LayerNorm'>, 'alpha_attn': 2.0, 'alpha_mlp': 2.0}, 'n_features': (24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576), 'sae_variant': <SAEVariant.JSAE_BLOCK: 'jsae_block'>, 'top_k': (32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32), 'sae_keys': ('0_residmid', '0_residpost', '1_residmid', '1_residpost', '2_residmid', '2_residpost', '3_residmid', '3_residpost', '4_residmid', '4_residpost', '5_residmid', '5_residpost', '6_residmid', '6_residpost', '7_residmid', '7_residpost', '8_residmid', '8_residpost', '9_residmid', '9_residpost', '10_residmid', '10_residpost', '11_residmid', '11_residpost')} | trainable_layers None | loss_coefficients {'sparsity': (0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1), 'regularization': None, 'downstream': None, 'bandwidth': None}
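Each debug.log line above is a flat, pipe-delimited dump of one training run's configuration; the nested sae_config / gpt_config values are Python reprs rather than JSON. A minimal sketch (hypothetical, not part of this repository) for pulling individual fields back out of such a line:

```python
# Hypothetical helper: split a debug.log line of the form
# "key value | key value | ..." into a {key: value-string} dict.
from pathlib import Path


def parse_config_line(line: str) -> dict:
    fields = {}
    for chunk in line.strip().split(" | "):
        key, _, value = chunk.partition(" ")
        fields[key] = value  # values stay strings; sae_config is a Python repr, not JSON
    return fields


for line in Path("debug.log").read_text().splitlines():
    cfg = parse_config_line(line)
    print(cfg["name"], "batch_size =", cfg["batch_size"], "lr =", cfg["learning_rate"])
```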
eval.log ADDED
@@ -0,0 +1,27 @@
+ type eval | step 0 | loss 6615.6406 11866.7480 21591.0957 32546.1348 37218.3789 41898.9570 47866.1016 58492.0781 71819.0156 97141.4219 166687.6719 653176.3125 | checkpoint False | ce_loss 3.2929 | sae_losses 2025.5774 4540.9673 5418.6567 6438.1230 6858.7349 14721.0371 15443.1562 17090.4531 17956.0742 19249.9570 20448.4102 21438.1309 23203.3066 24650.5000 28540.7266 29938.7891 34139.3320 37666.8750 45037.2773 52091.0117 70683.5703 95990.0234 289342.1562 363818.6875 | ce_loss_increases 5.9522 4.7810 5.4976 5.5004 6.0922 6.1756 5.0244 4.4974 4.2151 3.7689 3.9428 8.5544 | compound_ce_loss_increase 14.8366 | l0s 33.0648 32.8452 33.0192 33.1385 33.1935 33.1692 33.1265 33.0800 33.0512 33.0033 32.9504 32.9093 32.8498 32.8270 32.8402 32.8774 33.0673 33.1424 33.0328 32.9427 32.8355 33.0557 33.0240 33.0913 | stream_l1s 309.2626 692.4410 746.0629 809.1002 859.6667 914.4285 980.9479 1049.1144 1114.2999 1175.6417 1265.6296 1332.4180 1437.4130 1520.1178 1672.2251 1790.8796 1952.7334 2118.9092 2354.9434 2609.0173 2949.1521 3358.8938 4410.5889 4829.7856 | ∇_l1 49.0962 9.9716 11.3264 12.5240 12.3488 12.4084 12.3092 12.5717 12.8200 13.1391 14.0311 15.6790 | recon_l2 2025.5774 4540.9673 5418.6567 6438.1230 6858.7349 14721.0371 15443.1562 17090.4531 17956.0742 19249.9570 20448.4102 21438.1309 23203.3066 24650.5000 28540.7266 29938.7891 34139.3320 37666.8750 45037.2773 52091.0117 70683.5703 95990.0234 289342.1562 363818.6875
+ type eval | step 0 | loss 6652.3945 11744.7949 21464.1680 33053.5234 37635.0117 43282.6055 47971.1406 57564.7070 71673.0234 96718.1094 166055.3281 667751.0000 | checkpoint False | ce_loss 3.2925 | sae_losses 1992.1345 4611.1274 5362.0352 6372.9028 6842.0923 14610.7002 15544.1104 17496.8555 18285.2617 19337.4277 20695.9082 22574.2871 23129.5527 24829.2812 27801.5898 29750.5469 34464.8438 37195.3750 44694.8828 52010.0781 70494.2344 95547.0312 293692.6562 374042.3125 | ce_loss_increases 5.9493 4.6235 5.5767 6.4071 5.5450 5.1880 5.2520 4.5760 4.5284 3.8061 3.8908 9.4399 | compound_ce_loss_increase 14.8839 | l0s 33.0900 32.8459 32.9905 33.1236 33.1803 33.1672 33.1368 33.0767 33.0415 33.0175 32.9534 32.9044 32.8498 32.8186 32.8367 32.8870 33.0587 33.1327 33.0339 32.9442 32.8374 33.0436 32.9710 33.0951 | stream_l1s 309.2635 692.4429 746.0773 809.1177 859.6948 914.4438 980.9666 1049.1263 1114.3087 1175.6410 1265.6100 1332.3904 1437.3879 1520.0878 1672.2047 1790.8553 1952.6946 2118.8623 2354.9084 2608.9729 2949.0505 3358.8362 4410.6128 4829.8242 | ∇_l1 49.1335 9.8573 11.3686 12.5473 12.3133 12.4047 12.3069 12.5739 12.7895 13.1603 14.0729 16.0551 | recon_l2 1992.1345 4611.1274 5362.0352 6372.9028 6842.0923 14610.7002 15544.1104 17496.8555 18285.2617 19337.4277 20695.9082 22574.2871 23129.5527 24829.2812 27801.5898 29750.5469 34464.8438 37195.3750 44694.8828 52010.0781 70494.2344 95547.0312 293692.6562 374042.3125
+ type eval | step 1 | loss 6642.9165 11720.0830 21433.6250 33021.8320 37598.5000 43243.6250 47931.6094 57530.5391 71659.9688 96711.1328 166048.1406 667691.8750 | checkpoint True True True True True True True True True True True True | ce_loss 3.2925 | sae_losses 1990.6041 4603.1772 5350.8794 6359.3442 6827.7505 14594.5127 15533.1816 17476.1074 18267.5000 19318.6914 20676.7832 22554.4375 23109.8008 24809.5078 27784.0918 29733.8730 34455.5703 37191.5898 44691.3438 52006.6289 70490.0156 95544.0234 293667.9688 374007.8750 | ce_loss_increases 5.9454 4.6286 5.5799 6.3964 5.5495 5.1927 5.2554 4.5779 4.5294 3.8067 3.8905 9.4395 | compound_ce_loss_increase 14.8900 | l0s 33.0594 32.7883 32.9127 33.0333 33.0890 33.0735 33.0482 32.9958 32.9611 32.9381 32.8827 32.8359 32.7874 32.7623 32.7934 32.8538 33.0415 33.1298 33.0329 32.9445 32.8372 33.0449 32.9719 33.0951 | stream_l1s 309.2635 692.4429 746.0773 809.1177 859.6948 914.4438 980.9666 1049.1263 1114.3087 1175.6410 1265.6100 1332.3904 1437.3879 1520.0878 1672.2047 1790.8553 1952.6946 2118.8623 2354.9084 2608.9729 2949.0505 3358.8362 4410.6128 4829.8242 | ∇_l1 49.1360 9.8578 11.3692 12.5476 12.3137 12.4048 12.3069 12.5739 12.7894 13.1600 14.0727 16.0549 | recon_l2 1990.6041 4603.1772 5350.8794 6359.3442 6827.7505 14594.5127 15533.1816 17476.1074 18267.5000 19318.6914 20676.7832 22554.4375 23109.8008 24809.5078 27784.0918 29733.8730 34455.5703 37191.5898 44691.3438 52006.6289 70490.0156 95544.0234 293667.9688 374007.8750
+ type eval | step 0 | loss 6672.5684 11792.3242 21541.5352 33255.6250 37912.4102 43652.3281 48257.6680 58320.7969 71132.0391 97199.3203 165197.4062 646829.6875 | checkpoint False | ce_loss 3.2557 | sae_losses 1998.8279 4575.1221 5396.0205 6376.4819 6888.5537 14630.2588 15857.5166 17372.9707 18417.8633 19469.8555 20745.7891 22881.7070 23542.0195 24691.0859 27715.6797 30580.0156 33463.5586 37642.8828 45511.9883 51660.9336 69566.7422 95602.8672 287454.0312 359344.1562 | ce_loss_increases 6.0107 4.7758 5.9661 5.7998 5.2866 5.6486 5.3008 4.9896 4.2024 3.8515 4.0453 9.3538 | compound_ce_loss_increase 14.9961 | l0s 33.0866 32.8493 32.9797 33.1090 33.1974 33.1588 33.1233 33.0901 33.0423 33.0109 32.9553 32.9165 32.8567 32.8182 32.8387 32.8775 33.0516 33.1266 33.0409 32.9388 32.8471 33.0502 32.9790 33.1312 | stream_l1s 309.8517 693.8203 747.5106 810.8797 861.5087 916.5118 983.0554 1051.8489 1117.2451 1179.2874 1269.0377 1336.7378 1441.0602 1524.1895 1674.8082 1794.0833 1954.0990 2120.9966 2354.1946 2610.6287 2942.9961 3357.1411 4390.7417 4811.3970 | ∇_l1 98.6178 19.8238 22.7231 25.1361 24.6960 24.8273 24.5483 25.0981 25.5977 26.3978 27.8349 31.4534 | recon_l2 1998.8279 4575.1221 5396.0205 6376.4819 6888.5537 14630.2588 15857.5166 17372.9707 18417.8633 19469.8555 20745.7891 22881.7070 23542.0195 24691.0859 27715.6797 30580.0156 33463.5586 37642.8828 45511.9883 51660.9336 69566.7422 95602.8672 287454.0312 359344.1562
+ type eval | step 1 | loss 6663.9229 11769.6455 21513.8281 33223.6836 37878.0469 43615.2461 48223.3711 58292.8984 71134.7266 97191.4453 165185.4219 646754.8125 | checkpoint True True True True True True True True True True True True | ce_loss 3.2557 | sae_losses 1997.3927 4567.9062 5386.1157 6363.7070 6875.4341 14615.6738 15841.9082 17356.6426 18401.7773 19451.5605 20727.6504 22862.7617 23525.6895 24673.1328 27698.6895 30569.1172 33470.5391 37638.5938 45507.6523 51657.3711 69561.9453 95595.6484 287427.1250 359296.4375 | ce_loss_increases 6.0110 4.7809 5.9665 5.7951 5.2928 5.6533 5.3017 4.9926 4.2017 3.8515 4.0452 9.3539 | compound_ce_loss_increase 14.9880 | l0s 33.0588 32.7972 32.9116 33.0262 33.1165 33.0762 33.0431 33.0116 32.9705 32.9414 32.8909 32.8545 32.8008 32.7678 32.7970 32.8455 33.0348 33.1229 33.0393 32.9389 32.8473 33.0499 32.9788 33.1320 | stream_l1s 309.8517 693.8203 747.5106 810.8797 861.5087 916.5118 983.0554 1051.8489 1117.2451 1179.2874 1269.0377 1336.7378 1441.0602 1524.1895 1674.8082 1794.0833 1954.0990 2120.9966 2354.1946 2610.6287 2942.9961 3357.1411 4390.7417 4811.3970 | ∇_l1 98.6242 19.8248 22.7243 25.1366 24.6964 24.8276 24.5484 25.0979 25.5975 26.3974 27.8346 31.4528 | recon_l2 1997.3927 4567.9062 5386.1157 6363.7070 6875.4341 14615.6738 15841.9082 17356.6426 18401.7773 19451.5605 20727.6504 22862.7617 23525.6895 24673.1328 27698.6895 30569.1172 33470.5391 37638.5938 45507.6523 51657.3711 69561.9453 95595.6484 287427.1250 359296.4375
+ type eval | step 2 | loss 6651.3955 11763.0000 21509.5371 33217.2109 37870.5938 43607.4492 48217.1016 58276.9570 71105.0547 97144.4297 164922.1406 646267.3125 | checkpoint True True True True True True True True True True True True | ce_loss 3.2557 | sae_losses 1991.8335 4560.9341 5381.9048 6361.2705 6873.6562 14613.1631 15838.5518 17353.5195 18398.1035 19447.7871 20723.8789 22858.7324 23522.3652 24670.2031 27692.0742 30559.7930 33461.7305 37617.7461 45483.4883 51634.5391 69528.2109 95366.1094 287159.1875 359076.5938 | ce_loss_increases 6.0060 4.7816 5.9672 5.8013 5.3007 5.6593 5.3072 5.0021 4.2090 3.8673 4.0563 9.3569 | compound_ce_loss_increase 15.0030 | l0s 32.9427 32.7351 32.8802 33.0141 33.1112 33.0707 33.0395 33.0088 32.9673 32.9384 32.8877 32.8529 32.7980 32.7649 32.7922 32.8357 33.0186 33.0987 33.0168 32.9214 32.8271 32.9447 32.9321 33.0990 | stream_l1s 309.8517 693.8203 747.5106 810.8797 861.5087 916.5118 983.0554 1051.8489 1117.2451 1179.2874 1269.0377 1336.7378 1441.0602 1524.1895 1674.8082 1794.0833 1954.0990 2120.9966 2354.1946 2610.6287 2942.9961 3357.1411 4390.7417 4811.3970 | ∇_l1 98.6295 19.8249 22.7239 25.1359 24.6958 24.8268 24.5477 25.0972 25.5968 26.3968 27.8327 31.4514 | recon_l2 1991.8335 4560.9341 5381.9048 6361.2705 6873.6562 14613.1631 15838.5518 17353.5195 18398.1035 19447.7871 20723.8789 22858.7324 23522.3652 24670.2031 27692.0742 30559.7930 33461.7305 37617.7461 45483.4883 51634.5391 69528.2109 95366.1094 287159.1875 359076.5938
+ type eval | step 0 | loss 6627.1367 11760.2422 22044.7617 32570.1016 37441.6445 42206.9219 48445.1602 57714.6445 71035.7891 97351.0234 165793.5938 660725.9375 | checkpoint False | ce_loss 3.2555 | sae_losses 1990.6980 4537.9966 5329.9004 6410.3491 6798.8975 15223.0459 15279.6572 17265.3379 17647.2148 19769.6660 20381.7793 21800.3262 23619.8516 24800.7676 28496.5820 29192.9492 33940.2227 37069.9961 45143.6797 52181.0586 70233.4688 95531.9141 291678.1562 369016.5312 | ce_loss_increases 5.8906 4.8148 6.0264 5.9125 6.5841 5.9184 5.2771 4.7719 4.3248 3.7632 4.0664 9.7506 | compound_ce_loss_increase 15.0005 | l0s 33.0693 32.8510 32.9873 33.1061 33.1699 33.1432 33.1326 33.0923 33.0362 33.0050 32.9479 32.9108 32.8584 32.8196 32.8391 32.8825 33.0519 33.1405 33.0461 32.9566 32.8451 33.0603 32.9799 33.0719 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 98.4422 19.9923 22.8175 25.1117 24.7504 24.8077 24.5401 25.1293 25.5755 26.2826 28.2273 31.5983 | recon_l2 1990.6980 4537.9966 5329.9004 6410.3491 6798.8975 15223.0459 15279.6572 17265.3379 17647.2148 19769.6660 20381.7793 21800.3262 23619.8516 24800.7676 28496.5820 29192.9492 33940.2227 37069.9961 45143.6797 52181.0586 70233.4688 95531.9141 291678.1562 369016.5312
+ type eval | step 250 | loss 1559.5353 3086.6912 6676.0586 10700.8008 11828.5879 13511.4971 15912.6045 18922.9473 23008.7949 29731.1230 49836.9141 96411.9531 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 356.8488 1142.5419 1339.6409 1733.2695 1893.3473 4767.2661 4937.3228 5746.6304 5581.3149 6230.6968 6832.6968 6662.2803 7484.4702 8411.7051 9330.9902 9575.0684 11114.4482 11877.0088 13801.7979 15911.4658 21747.4062 28070.2871 42784.7969 53603.7305 | ce_loss_increases 4.8905 6.1523 6.6114 7.1633 7.1380 6.3442 6.0894 5.4743 5.0531 4.3598 4.1474 5.4238 | compound_ce_loss_increase 12.6730 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0001 32.0001 32.0002 32.0001 32.0001 32.0001 32.0001 32.0001 32.0001 32.0002 32.0002 32.0003 32.0003 32.0003 32.0002 32.0003 32.0003 32.0004 32.0004 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 60.1444 13.7796 15.4446 16.8434 16.5775 16.5204 16.4334 16.8862 17.3467 17.8609 19.2277 23.4166 | recon_l2 356.8488 1142.5419 1339.6409 1733.2695 1893.3473 4767.2661 4937.3228 5746.6304 5581.3149 6230.6968 6832.6968 6662.2803 7484.4702 8411.7051 9330.9902 9575.0684 11114.4482 11877.0088 13801.7979 15911.4658 21747.4062 28070.2871 42784.7969 53603.7305
+ type eval | step 500 | loss 431.9084 853.4123 1142.9680 1602.3254 2141.5300 2844.9631 3738.0229 5134.8921 7084.7432 10733.9004 17944.2480 32947.0156 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 94.5828 326.6145 404.9998 432.3031 545.9028 580.4252 735.5486 849.8776 1008.3304 1116.7307 1364.4216 1463.8911 1786.6248 1934.5872 2435.9814 2681.8125 3293.4187 3773.8816 4871.9341 5843.5532 7853.9829 10070.3184 14684.4561 18235.1035 | ce_loss_increases 7.2196 3.2661 3.0622 3.0718 2.7443 2.5532 2.4496 2.2731 2.0594 1.8733 1.9018 2.7229 | compound_ce_loss_increase 6.9438 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 10.7111 16.1096 16.6402 16.8991 16.4690 16.6497 16.8112 17.0965 17.4428 18.4147 19.9436 27.4681 | recon_l2 94.5828 326.6145 404.9998 432.3031 545.9028 580.4252 735.5486 849.8776 1008.3304 1116.7307 1364.4216 1463.8911 1786.6248 1934.5872 2435.9814 2681.8125 3293.4187 3773.8816 4871.9341 5843.5532 7853.9829 10070.3184 14684.4561 18235.1035
+ type eval | step 750 | loss 232.1364 452.6435 635.2053 956.4237 1325.8250 1816.4664 2450.4241 3529.4399 4860.8164 7526.0239 12616.2324 23566.7461 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 61.5434 164.0199 220.8249 217.1448 296.1394 322.0094 439.6617 497.7122 623.9604 682.4593 869.4731 926.8566 1170.5699 1258.9890 1638.2910 1869.5277 2243.8481 2595.1145 3400.9116 4103.0464 5506.5967 7086.4819 10579.0166 12960.4863 | ce_loss_increases 7.5631 0.9056 1.3591 1.3871 1.2218 1.2496 1.3146 1.2611 1.1421 1.0858 1.1209 1.6450 | compound_ce_loss_increase 6.3839 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 6.5731 14.6739 17.0564 19.0496 19.4050 20.1366 20.8659 21.6211 21.8536 22.0656 23.1572 27.2467 | recon_l2 61.5434 164.0199 220.8249 217.1448 296.1394 322.0094 439.6617 497.7122 623.9604 682.4593 869.4731 926.8566 1170.5699 1258.9890 1638.2910 1869.5277 2243.8481 2595.1145 3400.9116 4103.0464 5506.5967 7086.4819 10579.0166 12960.4863
+ type eval | step 1000 | loss 171.4468 327.1380 471.4918 739.6506 1053.9741 1523.9082 2038.1517 2901.4595 4042.5554 6205.6738 10385.6064 19327.4551 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 51.5320 114.3381 162.0867 153.0121 216.8254 240.1826 342.2440 381.1610 501.6292 535.2418 745.6642 760.2060 976.7953 1041.9373 1391.0913 1489.2665 1890.4343 2129.7749 2798.5103 3383.9607 4546.0107 5815.2168 8554.3037 10745.7363 | ce_loss_increases 7.8509 0.4216 0.6769 0.7851 0.7102 0.7764 0.8021 0.7851 0.7673 0.7238 0.7949 1.2059 | compound_ce_loss_increase 6.3298 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 5.5766 12.0392 14.4837 16.2457 17.1030 18.0379 19.4187 21.1024 22.3463 23.2048 24.3798 27.4132 | recon_l2 51.5320 114.3381 162.0867 153.0121 216.8254 240.1826 342.2440 381.1610 501.6292 535.2418 745.6642 760.2060 976.7953 1041.9373 1391.0913 1489.2665 1890.4343 2129.7749 2798.5103 3383.9607 4546.0107 5815.2168 8554.3037 10745.7363
+ type eval | step 1250 | loss 146.4806 278.6682 410.9802 669.8072 937.1288 1355.1168 1847.5951 2613.9236 3662.9678 5715.7759 9415.5293 17596.5840 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 47.5651 93.9436 140.0668 128.4633 190.8633 207.8436 300.2591 355.0497 449.8733 472.3365 658.2959 679.9584 900.0393 929.5552 1252.0145 1342.2145 1711.8383 1929.6949 2588.6194 3104.3276 4116.6011 5274.9072 7829.9248 9739.7900 | ce_loss_increases 6.7913 0.3017 0.5030 0.5398 0.5607 0.6055 0.6349 0.6252 0.6044 0.5977 0.6534 1.0251 | compound_ce_loss_increase 6.0939 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 4.9719 10.1381 12.2734 14.4986 14.9190 16.8628 18.0006 19.6948 21.4339 22.8285 24.0198 26.8680 | recon_l2 47.5651 93.9436 140.0668 128.4633 190.8633 207.8436 300.2591 355.0497 449.8733 472.3365 658.2959 679.9584 900.0393 929.5552 1252.0145 1342.2145 1711.8383 1929.6949 2588.6194 3104.3276 4116.6011 5274.9072 7829.9248 9739.7900
+ type eval | step 1500 | loss 131.8943 252.0660 398.2011 606.6983 907.2122 1255.7852 1727.0543 2460.7739 3453.9043 5372.3589 8853.6787 16651.2598 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 45.3808 81.9782 128.6541 114.5451 178.4111 209.0643 280.6660 313.1707 421.1728 472.4656 617.3790 623.1101 847.6677 862.7369 1183.7074 1258.6570 1618.4927 1815.0243 2436.1123 2914.1907 3884.5693 4945.4956 7375.6567 9249.2725 | ce_loss_increases 6.3185 0.2437 0.4863 0.4442 0.5166 0.5105 0.5388 0.5325 0.5266 0.5224 0.5814 0.9246 | compound_ce_loss_increase 6.2319 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 4.5353 8.8669 10.7257 12.8616 13.5740 15.2964 16.6498 18.4103 20.3870 22.0548 23.6125 26.3251 | recon_l2 45.3808 81.9782 128.6541 114.5451 178.4111 209.0643 280.6660 313.1707 421.1728 472.4656 617.3790 623.1101 847.6677 862.7369 1183.7074 1258.6570 1618.4927 1815.0243 2436.1123 2914.1907 3884.5693 4945.4956 7375.6567 9249.2725
+ type eval | step 1750 | loss 121.8375 233.5073 365.0203 570.7539 840.5766 1194.9141 1651.4712 2360.6277 3314.9998 5139.8149 8482.9961 15968.7910 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 43.9346 73.7075 121.1443 104.3200 169.9736 185.2210 268.5992 290.4091 404.1864 423.7867 593.8620 586.9087 816.4814 819.3437 1139.1495 1204.0583 1555.0820 1740.2821 2335.4529 2783.0381 3730.0725 4729.5400 7046.3965 8896.5029 | ce_loss_increases 6.1801 0.2150 0.4018 0.3994 0.4513 0.4634 0.4806 0.4775 0.4731 0.4709 0.5317 0.8608 | compound_ce_loss_increase 6.1631 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 4.1954 8.0430 9.8258 11.7457 12.6035 14.1433 15.6466 17.4200 19.6371 21.3231 23.3835 25.8933 | recon_l2 43.9346 73.7075 121.1443 104.3200 169.9736 185.2210 268.5992 290.4091 404.1864 423.7867 593.8620 586.9087 816.4814 819.3437 1139.1495 1204.0583 1555.0820 1740.2821 2335.4529 2783.0381 3730.0725 4729.5400 7046.3965 8896.5029
+ type eval | step 2000 | loss 115.6837 222.9991 346.1193 550.5757 808.6876 1161.3655 1611.7919 2307.4878 3240.7156 5013.3613 8292.6465 15466.3809 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 43.2566 68.4717 117.6453 97.9927 165.9753 171.1334 262.6417 277.1276 395.6479 401.2710 580.9091 567.1961 800.4837 796.5103 1114.4941 1176.1953 1521.4647 1700.2471 2282.7139 2709.9114 3647.4021 4622.0723 6860.9863 8579.5264 | ce_loss_increases 6.0232 0.1994 0.3644 0.3884 0.4251 0.4423 0.4595 0.4552 0.4500 0.4463 0.5097 0.8060 | compound_ce_loss_increase 6.3819 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 3.9555 7.3611 9.0107 10.8065 11.7687 13.2604 14.7978 16.7980 19.0048 20.7353 23.1746 25.8675 | recon_l2 43.2566 68.4717 117.6453 97.9927 165.9753 171.1334 262.6417 277.1276 395.6479 401.2710 580.9091 567.1961 800.4837 796.5103 1114.4941 1176.1953 1521.4647 1700.2471 2282.7139 2709.9114 3647.4021 4622.0723 6860.9863 8579.5264
+ type eval | step 2250 | loss 111.0215 215.3074 334.2158 536.5679 786.4391 1135.5452 1582.3324 2267.3672 3185.1118 4910.6963 8131.9854 14846.9619 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 42.5372 64.7475 115.2617 93.1884 163.5578 162.2871 258.5701 267.9284 388.4810 386.8549 570.3817 552.5666 789.1320 779.0826 1096.2725 1154.8136 1495.9623 1670.7185 2240.8169 2649.6025 3578.3567 4530.7549 6571.1729 8249.7148 | ce_loss_increases 5.8860 0.1861 0.3572 0.3717 0.4023 0.4208 0.4400 0.4377 0.4300 0.4272 0.4888 0.7549 | compound_ce_loss_increase 6.9276 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 3.7368 6.8573 8.3710 10.0695 11.1032 12.5969 14.1174 16.2809 18.4307 20.2769 22.8742 26.0753 | recon_l2 42.5372 64.7475 115.2617 93.1884 163.5578 162.2871 258.5701 267.9284 388.4810 386.8549 570.3817 552.5666 789.1320 779.0826 1096.2725 1154.8136 1495.9623 1670.7185 2240.8169 2649.6025 3578.3567 4530.7549 6571.1729 8249.7148
+ type eval | step 2500 | loss 106.8401 208.4978 324.4058 524.5229 768.6301 1112.7850 1552.1202 2224.6746 3127.3350 4811.4243 7968.8740 14341.5107 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 41.9926 61.2898 113.3628 88.7145 161.2113 155.3353 255.5681 259.4780 383.0042 375.0714 562.0073 538.7092 777.6725 760.8715 1078.5603 1130.2771 1471.0525 1638.4124 2196.9478 2594.5598 3509.5806 4436.7305 6342.8794 7972.3135 | ce_loss_increases 5.8380 0.1774 0.3481 0.3617 0.3887 0.4069 0.4252 0.4184 0.4129 0.4076 0.4707 0.7097 | compound_ce_loss_increase 7.0832 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 3.5577 6.4205 7.8592 9.4768 10.5543 12.0688 13.5769 15.8362 17.8693 19.9176 22.5634 26.3184 | recon_l2 41.9926 61.2898 113.3628 88.7145 161.2113 155.3353 255.5681 259.4780 383.0042 375.0714 562.0073 538.7092 777.6725 760.8715 1078.5603 1130.2771 1471.0525 1638.4124 2196.9478 2594.5598 3509.5806 4436.7305 6342.8794 7972.3135
+ type eval | step 2750 | loss 104.2689 204.2940 319.2394 517.8898 759.5223 1100.4872 1536.5771 2200.3145 3098.0549 4769.3809 7886.6953 14102.0723 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 41.7086 59.1436 112.3622 85.8602 160.0402 151.7635 254.2413 254.6589 380.6286 368.8045 557.8970 530.9591 772.0189 751.4043 1070.6265 1114.3342 1458.4166 1622.2133 2176.6335 2573.0908 3480.8708 4383.5874 6236.7583 7839.2666 | ce_loss_increases 5.6093 0.1683 0.3438 0.3618 0.3820 0.4031 0.4162 0.4074 0.4029 0.3990 0.4609 0.6875 | compound_ce_loss_increase 7.2585 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 3.4167 6.0716 7.4358 8.9896 10.0893 11.6310 13.1539 15.3540 17.4252 19.6556 22.2359 26.0473 | recon_l2 41.7086 59.1436 112.3622 85.8602 160.0402 151.7635 254.2413 254.6589 380.6286 368.8045 557.8970 530.9591 772.0189 751.4043 1070.6265 1114.3342 1458.4166 1622.2133 2176.6335 2573.0908 3480.8708 4383.5874 6236.7583 7839.2666
+ type eval | step 3000 | loss 101.9445 200.1462 313.9266 510.0699 750.5416 1087.7557 1520.0159 2176.3313 3061.5212 4717.4912 7791.4644 13910.9678 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 41.4257 57.2363 111.1375 83.2032 158.6702 148.1564 251.9958 249.4812 377.9008 362.9205 553.6210 522.8849 766.2583 740.9825 1061.5898 1099.7881 1443.7174 1600.7429 2151.8530 2546.1863 3442.7771 4326.6689 6158.8022 7726.2295 | ce_loss_increases 5.4670 0.1674 0.3416 0.3498 0.3760 0.3961 0.4128 0.3992 0.3944 0.3924 0.4525 0.6703 | compound_ce_loss_increase 7.2899 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 3.2825 5.8054 7.1000 8.5927 9.7204 11.2500 12.7748 14.9536 17.0610 19.4516 22.0173 25.9341 | recon_l2 41.4257 57.2363 111.1375 83.2032 158.6702 148.1564 251.9958 249.4812 377.9008 362.9205 553.6210 522.8849 766.2583 740.9825 1061.5898 1099.7881 1443.7174 1600.7429 2151.8530 2546.1863 3442.7771 4326.6689 6158.8022 7726.2295
+ type eval | step 3250 | loss 99.8944 196.7910 309.4396 504.3038 743.3682 1075.9559 1505.3956 2156.9949 3031.8718 4674.9561 7720.3916 13747.6064 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 41.1906 55.5288 110.1658 81.0506 157.6914 144.9148 250.5230 245.4991 375.7975 358.1312 550.0611 514.9603 760.8331 732.0922 1054.4269 1087.9268 1432.1428 1582.9766 2132.1963 2523.4446 3413.1765 4285.3994 6087.4150 7634.3975 | ce_loss_increases 5.3996 0.1637 0.3441 0.3441 0.3729 0.3951 0.4064 0.3932 0.3866 0.3857 0.4439 0.6538 | compound_ce_loss_increase 7.3580 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 3.1750 5.5746 6.8333 8.2816 9.4398 10.9348 12.4702 14.6408 16.7525 19.3147 21.8160 25.7919 | recon_l2 41.1906 55.5288 110.1658 81.0506 157.6914 144.9148 250.5230 245.4991 375.7975 358.1312 550.0611 514.9603 760.8331 732.0922 1054.4269 1087.9268 1432.1428 1582.9766 2132.1963 2523.4446 3413.1765 4285.3994 6087.4150 7634.3975
+ type eval | step 3500 | loss 98.6320 194.8571 307.3279 501.9892 740.6986 1070.9193 1499.3638 2148.1047 3017.8083 4657.9595 7698.4980 13685.3789 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 41.1018 54.4359 109.7419 79.7340 157.7136 143.0126 250.6070 243.3571 375.8471 355.6340 549.5351 510.7134 758.9814 728.1561 1052.1417 1081.5510 1426.1439 1575.1741 2124.8704 2513.8894 3404.3723 4272.5049 6060.8857 7598.8218 | ce_loss_increases 5.3342 0.1601 0.3464 0.3455 0.3745 0.3928 0.4066 0.3906 0.3846 0.3839 0.4433 0.6472 | compound_ce_loss_increase 7.3248 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 3.0942 5.3813 6.6016 8.0250 9.2174 10.6712 12.2260 14.4110 16.4905 19.1996 21.6203 25.6726 | recon_l2 41.1018 54.4359 109.7419 79.7340 157.7136 143.0126 250.6070 243.3571 375.8471 355.6340 549.5351 510.7134 758.9814 728.1561 1052.1417 1081.5510 1426.1439 1575.1741 2124.8704 2513.8894 3404.3723 4272.5049 6060.8857 7598.8218
+ type eval | step 3750 | loss 97.3993 192.8303 304.5703 498.2867 735.5538 1063.7682 1490.9779 2135.9407 2998.2112 4633.3340 7660.9688 13602.6191 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 40.9385 53.4429 109.1869 78.4101 157.0949 141.0677 249.6903 240.7835 374.6235 351.9251 547.3242 506.0080 755.6380 723.3151 1048.0731 1073.6586 1418.1012 1563.8577 2113.9504 2500.3406 3387.4995 4251.9873 6022.9248 7554.1079 | ce_loss_increases 5.2895 0.1602 0.3424 0.3464 0.3724 0.3908 0.4053 0.3863 0.3802 0.3800 0.4399 0.6387 | compound_ce_loss_increase 7.3647 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 3.0179 5.2332 6.4076 7.8129 9.0053 10.4360 12.0249 14.2094 16.2513 19.0431 21.4812 25.5877 | recon_l2 40.9385 53.4429 109.1869 78.4101 157.0949 141.0677 249.6903 240.7835 374.6235 351.9251 547.3242 506.0080 755.6380 723.3151 1048.0731 1073.6586 1418.1012 1563.8577 2113.9504 2500.3406 3387.4995 4251.9873 6022.9248 7554.1079
+ type eval | step 4000 | loss 96.3109 191.0454 302.0663 495.1097 731.1932 1057.1489 1483.5106 2123.5276 2980.6914 4609.0903 7617.6040 13501.6406 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 40.8253 52.5099 108.7124 77.2266 156.5153 139.2904 249.0002 238.4614 373.6319 348.7296 545.1555 501.7578 753.1382 718.5260 1043.9286 1065.5800 1411.3262 1553.3137 2103.8665 2486.3030 3367.5791 4228.6694 5976.9375 7499.2021 | ce_loss_increases 5.2042 0.1596 0.3453 0.3477 0.3713 0.3861 0.4028 0.3816 0.3740 0.3748 0.4333 0.6328 | compound_ce_loss_increase 7.4210 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 2.9757 5.1064 6.2605 7.6480 8.8316 10.2352 11.8464 14.0196 16.0521 18.9216 21.3527 25.5027 | recon_l2 40.8253 52.5099 108.7124 77.2266 156.5153 139.2904 249.0002 238.4614 373.6319 348.7296 545.1555 501.7578 753.1382 718.5260 1043.9286 1065.5800 1411.3262 1553.3137 2103.8665 2486.3030 3367.5791 4228.6694 5976.9375 7499.2021
+ type eval | step 4250 | loss 95.6697 190.2950 301.3245 494.1573 729.9346 1055.0828 1480.7490 2119.1414 2975.5308 4601.8550 7612.3638 13474.5322 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 40.8053 51.9316 108.7647 76.5392 156.7179 138.4771 249.2488 237.4108 374.2883 346.9779 545.2162 499.8217 752.6614 716.4028 1043.2948 1062.0052 1409.6852 1549.9590 2098.6472 2484.3672 3364.3235 4226.7847 5964.7822 7484.3330 | ce_loss_increases 5.1477 0.1578 0.3486 0.3508 0.3700 0.3900 0.4034 0.3816 0.3739 0.3749 0.4339 0.6271 | compound_ce_loss_increase 7.4099 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 2.9328 4.9911 6.1297 7.4976 8.6684 10.0449 11.6842 13.8414 15.8874 18.8419 21.2541 25.4156 | recon_l2 40.8053 51.9316 108.7647 76.5392 156.7179 138.4771 249.2488 237.4108 374.2883 346.9779 545.2162 499.8217 752.6614 716.4028 1043.2948 1062.0052 1409.6852 1549.9590 2098.6472 2484.3672 3364.3235 4226.7847 5964.7822 7484.3330
+ type eval | step 4500 | loss 95.0302 189.2894 299.9839 492.2272 727.0740 1051.8116 1476.9030 2113.3176 2965.1216 4585.8018 7593.0400 13423.9316 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 40.7396 51.4046 108.5369 75.8438 156.5384 137.4293 248.7747 236.0702 373.7111 344.8229 544.1862 497.7164 751.6490 713.7125 1041.9702 1057.6549 1405.6500 1543.7122 2091.1892 2475.8257 3355.2834 4216.5957 5941.9429 7456.6138 | ce_loss_increases 5.1211 0.1588 0.3456 0.3476 0.3676 0.3948 0.4023 0.3796 0.3704 0.3734 0.4315 0.6261 | compound_ce_loss_increase 7.2839 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 2.8860 4.9087 6.0162 7.3822 8.5402 9.9088 11.5414 13.6925 15.7587 18.7875 21.1620 25.3760 | recon_l2 40.7396 51.4046 108.5369 75.8438 156.5384 137.4293 248.7747 236.0702 373.7111 344.8229 544.1862 497.7164 751.6490 713.7125 1041.9702 1057.6549 1405.6500 1543.7122 2091.1892 2475.8257 3355.2834 4216.5957 5941.9429 7456.6138
+ type eval | step 4750 | loss 94.3603 188.2416 298.5036 489.9702 723.7598 1047.9397 1471.5168 2105.5208 2955.4543 4567.2378 7569.5923 13369.9551 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 40.6728 50.8293 108.3039 75.1055 156.2585 136.3215 248.1605 234.5313 372.8534 342.4873 543.0032 495.1723 749.6819 710.4281 1039.5708 1052.4011 1401.9214 1537.9127 2083.2922 2465.2166 3344.6875 4203.8311 5918.4497 7426.2178 | ce_loss_increases 5.0796 0.1576 0.3418 0.3496 0.3649 0.3956 0.3994 0.3784 0.3693 0.3708 0.4305 0.6220 | compound_ce_loss_increase 7.2980 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 2.8582 4.8323 5.9236 7.2783 8.4190 9.7644 11.4069 13.5491 15.6211 18.7269 21.0748 25.2880 | recon_l2 40.6728 50.8293 108.3039 75.1055 156.2585 136.3215 248.1605 234.5313 372.8534 342.4873 543.0032 495.1723 749.6819 710.4281 1039.5708 1052.4011 1401.9214 1537.9127 2083.2922 2465.2166 3344.6875 4203.8311 5918.4497 7426.2178
+ type eval | step 5000 | loss 94.0270 188.1264 298.4841 490.2946 723.8708 1047.8970 1471.5043 2104.4077 2955.2065 4565.9263 7575.1670 13368.0918 | checkpoint True True True False False True True True True True False True | ce_loss 3.2555 | sae_losses 40.6936 50.5018 108.5667 74.8075 156.6766 135.9793 248.9182 234.2052 373.7816 341.7866 543.6644 494.5916 750.2772 709.9340 1040.0643 1050.9143 1401.8873 1537.8007 2083.4402 2463.8113 3346.7217 4207.4316 5917.0903 7425.7456 | ce_loss_increases 5.0439 0.1581 0.3455 0.3561 0.3662 0.3994 0.4001 0.3788 0.3715 0.3704 0.4309 0.6188 | compound_ce_loss_increase 7.1183 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 2.8317 4.7523 5.8283 7.1712 8.3027 9.6407 11.2932 13.4288 15.5175 18.6751 21.0152 25.2515 | recon_l2 40.6936 50.5018 108.5667 74.8075 156.6766 135.9793 248.9182 234.2052 373.7816 341.7866 543.6644 494.5916 750.2772 709.9340 1040.0643 1050.9143 1401.8873 1537.8007 2083.4402 2463.8113 3346.7217 4207.4316 5917.0903 7425.7456
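In each eval line, the 24 values of sae_losses (and of l0s, stream_l1s and recon_l2) presumably follow the sae_keys ordering declared in the training config: 0_residmid, 0_residpost, ..., 11_residpost, i.e. two SAEs per transformer block. A minimal sketch, assuming the same pipe-delimited format as debug.log, that groups the final eval line's reconstruction losses by layer:

```python
# Hypothetical sketch: read the last eval.log line and pair up the 24 per-SAE
# reconstruction losses as (resid_mid, resid_post) for each of the 12 layers.
from pathlib import Path

last_line = Path("eval.log").read_text().splitlines()[-1]
fields = dict(chunk.split(" ", 1) for chunk in last_line.strip().split(" | "))

sae_losses = [float(x) for x in fields["sae_losses"].split()]
for layer in range(12):
    resid_mid, resid_post = sae_losses[2 * layer], sae_losses[2 * layer + 1]
    print(f"layer {layer:2d}: resid_mid {resid_mid:10.2f}  resid_post {resid_post:10.2f}")
```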
model.json ADDED
@@ -0,0 +1 @@
+ {"block_size": 1024, "vocab_size": 50257, "n_layer": 12, "n_head": 12, "n_embd": 768, "norm_strategy": "LayerNorm"}
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f76d410b06d9bcf56953e112c862553a8f38d07c118afe54a0a67b5f1af4a3bb
+ size 497774344
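The three lines above are a Git LFS pointer rather than the weights themselves; the actual ~498 MB model.safetensors is fetched by "git lfs pull" or transparently by huggingface_hub. Once downloaded, the tensors can be inspected with the safetensors library; a minimal sketch:

```python
# Hypothetical sketch: load the GPT-2 weights from model.safetensors into a
# name -> tensor dict once the LFS object has been downloaded.
from safetensors.torch import load_file

state_dict = load_file("model.safetensors")
for name, tensor in list(state_dict.items())[:5]:
    print(name, tuple(tensor.shape), tensor.dtype)
```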
sae.0.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4bb73927bb880679388fed0175973d7b67d9d265c271a4770124db99fa7c3416
+ size 151096640
sae.1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:52c7dbe7b339a2ebd10545667735fdfb6dc902a3e6cd35c1d89158b9d256ecf4
+ size 151096640
sae.10.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a1d5723e15ace0f5eab37fa94b1f9f5cbf89fef3e4e481ff856b8b1976ed26d0
+ size 151096640
sae.11.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f6f3cc6704b8481273f5dff1e5bca92a0bd670094ad81d0d2ab9e9acf1dcfc25
+ size 151096640
sae.12.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:18935183a9c5f82a6a8f9cdc6b8fa5fae7b997dbda06dee4046343b0a8aca5e2
+ size 151096640
sae.13.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfc45773f195d968f5b5be2276d33b3601d5744164b863779bfb670233b2f5f7
+ size 151096640
sae.14.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:684d3472538dfaa304c58ef640abb42081f8efb004220ab6bb630765b2c35fb7
+ size 151096640
sae.15.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0ffc4967d3e4638476404f2be457d097ad19cd87cfbc6c591db2c7ee92f960b5
+ size 151096640
sae.16.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:05d17fe3d987970c555f04ea80f65d5d69fa6b2e89a0309149fe90730c6bcfb3
+ size 151096640
sae.17.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b5806db05e72e9f1b289d0bfbbd6a09cc14af61ecb3671424db048264d5f3b9f
+ size 151096640
sae.18.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8b9bae813970fd081c059800c4c1815febdff19e2dbe2c7cfd6b297a9a9dc12f
+ size 151096640
sae.19.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d09e654956886a397a760260325ba43b0a3e2457b71b0f3284030b479a4d4594
+ size 151096640
sae.2.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:769987d221fe5a1110e8b83f43edb1861528516045d5e85f7accd40fba59bfcf
+ size 151096640
sae.20.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:abfc2fbe9773f3f4d2fabd050fe1b8bbf817440b58f6256479183d2af888728e
+ size 151096640
sae.21.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5b64c293fe2d7fc96f1cfac8894c488e083719f86b308b8f08c3db38a0e6a985
+ size 151096640
sae.22.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:da59a7960a06e283f1fde6c278516d12304bf6223b577d8130bf6620487f1772
+ size 151096640
sae.23.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:35ee767407c939a1f3af8ee27342d676e898b4bc1b4e7d3a22be8b3248054ae7
+ size 151096640
sae.3.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3de233175f7c3a37613b8be4168b79ccfc78b0f35413ce66610195016f48d7bf
+ size 151096640
sae.4.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:76252711a1342ff57fd89ba7df12f5b20b01255152b8900d9eecf73740d0f164
+ size 151096640
sae.5.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a8500cbdfe2c5dea743d25378bdab9f53efae6b5f97af3878df9dbb9a918306a
+ size 151096640
sae.6.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:324487d4de0e269433a8799e1b73d79fd38dcbc2d7678045c90711b7d4ffa984
+ size 151096640
sae.7.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3680af77fa14e2b2b4b121b7ba23365e36dc07da157fffa52a166f19263324a5
+ size 151096640
sae.8.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0fb0e27bb659f4dc4e1e0180a2ec185bfd5382b861f6aa7e12fae37db4f3124a
+ size 151096640
sae.9.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:49f90c74d27e17f8a8cd9e0fa3c9353d1817c9de258ac254fdf318907c71f4fb
+ size 151096640
sae.json ADDED
@@ -0,0 +1 @@
+ {"n_features": [24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576], "sae_variant": "jsae_block", "top_k": [32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32], "sae_keys": ["0_residmid", "0_residpost", "1_residmid", "1_residpost", "2_residmid", "2_residpost", "3_residmid", "3_residpost", "4_residmid", "4_residpost", "5_residmid", "5_residpost", "6_residmid", "6_residpost", "7_residmid", "7_residpost", "8_residmid", "8_residpost", "9_residmid", "9_residpost", "10_residmid", "10_residpost", "11_residmid", "11_residpost"]}
train.log ADDED
The diff for this file is too large to render. See raw diff