Upload folder using huggingface_hub
Browse files- debug.log +2 -0
- eval.log +22 -0
- model.json +1 -0
- model.safetensors +3 -0
- sae.0.safetensors +3 -0
- sae.1.safetensors +3 -0
- sae.10.safetensors +3 -0
- sae.11.safetensors +3 -0
- sae.12.safetensors +3 -0
- sae.13.safetensors +3 -0
- sae.14.safetensors +3 -0
- sae.15.safetensors +3 -0
- sae.16.safetensors +3 -0
- sae.17.safetensors +3 -0
- sae.18.safetensors +3 -0
- sae.19.safetensors +3 -0
- sae.2.safetensors +3 -0
- sae.20.safetensors +3 -0
- sae.21.safetensors +3 -0
- sae.22.safetensors +3 -0
- sae.23.safetensors +3 -0
- sae.3.safetensors +3 -0
- sae.4.safetensors +3 -0
- sae.5.safetensors +3 -0
- sae.6.safetensors +3 -0
- sae.7.safetensors +3 -0
- sae.8.safetensors +3 -0
- sae.9.safetensors +3 -0
- sae.json +1 -0
- train.log +0 -0
debug.log
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name jln.mlpblock.gpt2-sparse-5.0e-01 | device cuda | compile True | data_dir data/fineweb_edu_10b | should_randomize True | log_interval 1 | eval_interval 250 | eval_steps 100 | batch_size 4 | gradient_accumulation_steps 8 | learning_rate 0.0005 | warmup_steps 750 | max_steps 5000 | decay_lr True | min_lr 0.0001 | weight_decay 0.1 | grad_clip 1.0 | sae_config {'name': 'jln.mlpblock.gpt2', 'device': device(type='cuda'), 'compile': True, 'gpt_config': {'name': 'gpt2', 'device': device(type='cuda'), 'compile': True, 'block_size': 1024, 'vocab_size': 50257, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'norm_strategy': <NormalizationStrategy.LAYER_NORM: 'LayerNorm'>, 'alpha_attn': 2.0, 'alpha_mlp': 2.0}, 'n_features': (24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576), 'sae_variant': <SAEVariant.JSAE_BLOCK: 'jsae_block'>, 'top_k': (32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32), 'sae_keys': ('0_residmid', '0_residpost', '1_residmid', '1_residpost', '2_residmid', '2_residpost', '3_residmid', '3_residpost', '4_residmid', '4_residpost', '5_residmid', '5_residpost', '6_residmid', '6_residpost', '7_residmid', '7_residpost', '8_residmid', '8_residpost', '9_residmid', '9_residpost', '10_residmid', '10_residpost', '11_residmid', '11_residpost')} | trainable_layers None | loss_coefficients {'sparsity': (0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), 'regularization': None, 'downstream': None, 'bandwidth': None}
|
| 2 |
+
name jln.mlpblock.gpt2-sparse-5.0e-01 | device cuda | compile True | data_dir data/fineweb_edu_10b | should_randomize True | log_interval 1 | eval_interval 250 | eval_steps 100 | batch_size 4 | gradient_accumulation_steps 8 | learning_rate 0.0005 | warmup_steps 750 | max_steps 5000 | decay_lr True | min_lr 0.0001 | weight_decay 0.1 | grad_clip 1.0 | sae_config {'name': 'jln.mlpblock.gpt2', 'device': device(type='cuda'), 'compile': True, 'gpt_config': {'name': 'gpt2', 'device': device(type='cuda'), 'compile': True, 'block_size': 1024, 'vocab_size': 50257, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'norm_strategy': <NormalizationStrategy.LAYER_NORM: 'LayerNorm'>, 'alpha_attn': 2.0, 'alpha_mlp': 2.0}, 'n_features': (24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576), 'sae_variant': <SAEVariant.JSAE_BLOCK: 'jsae_block'>, 'top_k': (32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32), 'sae_keys': ('0_residmid', '0_residpost', '1_residmid', '1_residpost', '2_residmid', '2_residpost', '3_residmid', '3_residpost', '4_residmid', '4_residpost', '5_residmid', '5_residpost', '6_residmid', '6_residpost', '7_residmid', '7_residpost', '8_residmid', '8_residpost', '9_residmid', '9_residpost', '10_residmid', '10_residpost', '11_residmid', '11_residpost')} | trainable_layers None | loss_coefficients {'sparsity': (0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), 'regularization': None, 'downstream': None, 'bandwidth': None}
|
eval.log
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
type eval | step 0 | loss 7027.8613 11944.6504 21999.4141 32531.5508 38476.2500 43335.2578 48548.3594 58522.6289 71820.1875 97268.9766 167364.2188 674833.1875 | checkpoint False | ce_loss 3.2555 | sae_losses 1994.1711 4540.0571 5408.7031 6436.5791 6758.7568 15127.0469 15237.0566 17169.1426 18432.5352 19920.3887 20504.4297 22707.3516 23517.2402 24907.9277 28249.3125 30147.5977 33943.6406 37748.3008 45405.5234 51731.7422 71635.5156 95587.9766 305591.7812 369087.4688 | ce_loss_increases 6.1479 4.8017 5.4297 5.8548 6.5109 5.6564 5.0433 4.7023 4.2492 3.8047 3.8563 8.9796 | compound_ce_loss_increase 14.9573 | l0s 33.0884 32.8337 33.0043 33.1199 33.1823 33.1656 33.1295 33.0972 33.0480 33.0069 32.9474 32.9125 32.8493 32.8242 32.8288 32.8825 33.0646 33.1424 33.0454 32.9485 32.8317 33.0250 32.9957 33.1132 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 493.6329 99.3619 113.6022 125.3516 123.3314 123.4745 123.1875 125.7359 128.2455 131.6655 140.7792 153.9272 | recon_l2 1994.1711 4540.0571 5408.7031 6436.5791 6758.7568 15127.0469 15237.0566 17169.1426 18432.5352 19920.3887 20504.4297 22707.3516 23517.2402 24907.9277 28249.3125 30147.5977 33943.6406 37748.3008 45405.5234 51731.7422 71635.5156 95587.9766 305591.7812 369087.4688
|
| 2 |
+
type eval | step 0 | loss 7068.9312 11836.2197 21768.9023 32755.1406 37960.5781 42365.3945 49309.0195 56952.8828 71537.3516 98090.1328 166182.7500 672151.5000 | checkpoint False | ce_loss 3.2555 | sae_losses 1993.3568 4584.3779 5366.5913 6370.9126 6806.5693 14849.3682 15378.6338 17250.5195 17722.7793 20113.6992 20338.9648 21902.8672 23942.1738 25243.4609 27877.8730 28948.8672 34164.8906 37244.8008 45877.2852 52081.3633 69197.0078 96845.4141 297035.0000 374954.8125 | ce_loss_increases 6.1620 4.9305 5.8163 6.9764 5.9748 5.6903 5.2062 4.6107 4.2282 3.7334 3.9867 8.5741 | compound_ce_loss_increase 15.0238 | l0s 33.0903 32.8345 32.9749 33.1011 33.1711 33.1633 33.1331 33.0836 33.0335 33.0087 32.9427 32.9105 32.8518 32.8225 32.8364 32.8768 33.0557 33.1391 33.0382 32.9428 32.8398 33.0370 33.0217 33.0216 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 491.1973 98.7171 112.9673 125.9938 124.0943 123.5737 123.3939 126.1653 127.6542 131.4581 140.3393 161.6427 | recon_l2 1993.3568 4584.3779 5366.5913 6370.9126 6806.5693 14849.3682 15378.6338 17250.5195 17722.7793 20113.6992 20338.9648 21902.8672 23942.1738 25243.4609 27877.8730 28948.8672 34164.8906 37244.8008 45877.2852 52081.3633 69197.0078 96845.4141 297035.0000 374954.8125
|
| 3 |
+
type eval | step 250 | loss 1728.9546 3111.5127 6799.4565 10624.5234 12211.7646 13984.2832 15878.1738 18631.8887 22310.2891 30897.9355 49820.9375 93778.9922 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 358.6840 1120.7013 1316.0684 1729.3218 1906.4369 4818.0996 4756.3872 5785.2837 5493.6665 6635.9268 6606.6035 7296.0444 7781.9434 8014.5024 9187.8652 9360.0762 10614.2451 11609.7949 14343.3105 16465.1660 21912.5273 27813.5117 42641.7422 51018.3398 | ce_loss_increases 4.9802 6.4088 6.5059 7.3201 6.8796 6.3320 5.8938 5.3626 4.9847 4.5740 4.2126 5.3784 | compound_ce_loss_increase 13.0987 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0001 32.0001 32.0001 32.0002 32.0002 32.0002 32.0002 32.0001 32.0003 32.0002 32.0002 32.0004 32.0003 32.0003 32.0003 32.0002 32.0004 32.0005 32.0005 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 249.5695 66.1227 74.9178 82.8489 82.1688 81.6374 81.7332 83.9474 86.2457 89.4665 94.8975 118.9178 | recon_l2 358.6840 1120.7013 1316.0684 1729.3218 1906.4369 4818.0996 4756.3872 5785.2837 5493.6665 6635.9268 6606.6035 7296.0444 7781.9434 8014.5024 9187.8652 9360.0762 10614.2451 11609.7949 14343.3105 16465.1660 21912.5273 27813.5117 42641.7422 51018.3398
|
| 4 |
+
type eval | step 500 | loss 463.5749 907.6201 1216.0128 1654.2703 2194.7642 2915.5940 3803.6348 5193.4287 7170.7920 10799.9453 17973.0391 33301.8750 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 97.2427 344.4480 403.8639 445.5739 557.9799 595.4860 735.8972 853.6599 1007.8149 1121.7646 1366.9678 1481.5648 1794.1017 1940.3119 2430.7915 2688.8977 3319.9597 3773.5085 4867.5815 5849.7358 7883.9902 9999.8701 14922.4980 18260.8633 | ce_loss_increases 8.3532 3.2995 3.1141 3.0572 2.8003 2.5964 2.4825 2.3075 2.0748 1.9021 1.8891 2.7594 | compound_ce_loss_increase 7.0089 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 21.8842 58.1822 62.5468 64.7136 65.1844 67.0614 69.2209 73.7386 77.3210 82.6272 89.1811 118.5147 | recon_l2 97.2427 344.4480 403.8639 445.5739 557.9799 595.4860 735.8972 853.6599 1007.8149 1121.7646 1366.9678 1481.5648 1794.1017 1940.3119 2430.7915 2688.8977 3319.9597 3773.5085 4867.5815 5849.7358 7883.9902 9999.8701 14922.4980 18260.8633
|
| 5 |
+
type eval | step 750 | loss 245.4812 494.8949 689.2791 1002.5322 1394.7936 1901.7035 2526.1709 3523.0015 4986.3608 7591.4697 12755.0996 23980.6289 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 66.1336 166.7594 228.0548 230.6970 303.4612 340.6945 440.7170 510.6902 650.1522 689.3521 900.7890 938.9512 1194.4246 1261.9513 1651.1511 1795.3292 2264.3105 2636.4785 3404.2783 4092.3821 5561.0811 7093.7251 10770.0332 13100.2578 | ce_loss_increases 7.9217 0.9606 1.4749 1.4084 1.2578 1.3038 1.3037 1.2537 1.1754 1.0738 1.1278 1.6728 | compound_ce_loss_increase 7.3953 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 12.5882 36.1432 45.1232 51.1253 55.2894 61.9639 69.7955 76.5211 85.5703 94.8108 100.2935 110.3318 | recon_l2 66.1336 166.7594 228.0548 230.6970 303.4612 340.6945 440.7170 510.6902 650.1522 689.3521 900.7890 938.9512 1194.4246 1261.9513 1651.1511 1795.3292 2264.3105 2636.4785 3404.2783 4092.3821 5561.0811 7093.7251 10770.0332 13100.2578
|
| 6 |
+
type eval | step 1000 | loss 183.6755 363.1170 516.8552 791.1254 1115.4067 1555.2277 2103.0007 2932.9360 4111.9614 6375.0835 10550.3857 19567.1855 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 57.2092 117.0179 173.0050 164.4995 231.3804 253.5191 367.1287 387.1677 535.6129 538.4417 762.2504 745.7289 1036.4905 1012.3981 1418.6252 1451.1001 1896.1240 2139.6633 2859.2461 3426.1201 4614.4590 5833.3364 8742.8379 10712.8555 | ce_loss_increases 8.5040 0.4877 0.8028 0.7752 0.7887 0.8164 0.8275 0.7975 0.7748 0.7489 0.7954 1.2180 | compound_ce_loss_increase 6.5630 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 9.4484 25.6123 31.9558 36.8292 41.3525 47.2482 54.1124 63.2110 76.1744 89.7170 102.5887 111.4901 | recon_l2 57.2092 117.0179 173.0050 164.4995 231.3804 253.5191 367.1287 387.1677 535.6129 538.4417 762.2504 745.7289 1036.4905 1012.3981 1418.6252 1451.1001 1896.1240 2139.6633 2859.2461 3426.1201 4614.4590 5833.3364 8742.8379 10712.8555
|
| 7 |
+
type eval | step 1250 | loss 157.9242 312.4448 451.6824 703.5930 996.7654 1402.9021 1909.6477 2668.5662 3796.5791 5769.7065 9531.3545 17811.8770 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 53.5153 96.2071 154.7490 137.6687 210.4403 216.2479 338.2235 336.0836 490.7737 472.3979 700.8373 663.0840 949.0037 915.7719 1305.7977 1308.8728 1801.6476 1929.1443 2611.1653 3077.6965 4176.8608 5257.8384 7868.9878 9833.6689 | ce_loss_increases 6.8734 0.3691 0.6108 0.6089 0.6434 0.6721 0.6907 0.6588 0.6375 0.5983 0.6526 1.0227 | compound_ce_loss_increase 6.1201 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 8.2017 20.0272 24.9943 29.2858 33.5939 38.9810 44.8724 53.8957 65.7878 80.8450 96.6558 109.2186 | recon_l2 53.5153 96.2071 154.7490 137.6687 210.4403 216.2479 338.2235 336.0836 490.7737 472.3979 700.8373 663.0840 949.0037 915.7719 1305.7977 1308.8728 1801.6476 1929.1443 2611.1653 3077.6965 4176.8608 5257.8384 7868.9878 9833.6689
|
| 8 |
+
type eval | step 1500 | loss 142.8025 283.1351 414.8404 649.7856 929.8449 1322.0557 1803.2930 2558.4150 3582.7402 5434.6460 8943.1885 16722.5215 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 51.3359 84.0702 145.1132 120.9360 200.1756 193.5326 318.9512 305.9298 466.9922 433.9539 672.7509 615.5329 908.7374 855.0031 1250.2119 1259.2981 1718.4775 1805.3159 2480.9788 2879.8501 3939.9253 4911.9653 7310.8486 9304.9648 | ce_loss_increases 6.2388 0.2993 0.5255 0.5281 0.5739 0.6005 0.6147 0.5890 0.5669 0.5217 0.5759 0.9229 | compound_ce_loss_increase 6.3963 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 7.3964 17.0860 21.1323 24.9046 28.8989 33.7718 39.5526 48.9050 58.9466 73.8166 91.2981 106.7106 | recon_l2 51.3359 84.0702 145.1132 120.9360 200.1756 193.5326 318.9512 305.9298 466.9922 433.9539 672.7509 615.5329 908.7374 855.0031 1250.2119 1259.2981 1718.4775 1805.3159 2480.9788 2879.8501 3939.9253 4911.9653 7310.8486 9304.9648
|
| 9 |
+
type eval | step 1750 | loss 132.0087 262.8381 390.5668 613.3934 885.3325 1267.0336 1733.0116 2454.4880 3452.9744 5213.7246 8570.1699 15989.3506 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 49.7613 75.4426 139.1730 108.4479 193.2919 178.6055 306.9047 284.2567 451.4411 407.8735 653.1562 583.3173 884.0611 812.9578 1218.5043 1191.1259 1666.6635 1732.1132 2394.8801 2750.2375 3771.4861 4711.3042 6941.1982 8942.8594 | ce_loss_increases 6.0944 0.2667 0.4972 0.4907 0.5347 0.5624 0.5681 0.5403 0.5202 0.4749 0.5304 0.8541 | compound_ce_loss_increase 6.7738 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 6.8048 15.2171 18.6695 22.2321 26.0177 30.5602 35.9924 44.8579 54.1968 68.6078 87.3794 105.2930 | recon_l2 49.7613 75.4426 139.1730 108.4479 193.2919 178.6055 306.9047 284.2567 451.4411 407.8735 653.1562 583.3173 884.0611 812.9578 1218.5043 1191.1259 1666.6635 1732.1132 2394.8801 2750.2375 3771.4861 4711.3042 6941.1982 8942.8594
|
| 10 |
+
type eval | step 2000 | loss 124.9282 251.4995 376.8385 592.1154 859.7445 1235.5266 1694.8478 2403.0447 3395.2246 5106.3896 8381.7900 15553.2754 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 48.6762 69.8270 136.7813 101.1125 190.9715 168.9457 301.2273 270.7297 445.1489 390.7584 644.4566 562.8620 874.2834 787.3342 1204.1228 1157.0804 1637.1219 1707.1890 2354.3198 2687.3918 3693.7432 4604.2314 6776.5400 8673.2559 | ce_loss_increases 6.0869 0.2630 0.4902 0.4861 0.5358 0.5506 0.5566 0.5170 0.5065 0.4585 0.5054 0.8080 | compound_ce_loss_increase 6.8407 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 6.4250 13.6057 16.9214 20.1583 23.8372 28.2082 33.2298 41.8416 50.9140 64.6778 83.8142 103.4821 | recon_l2 48.6762 69.8270 136.7813 101.1125 190.9715 168.9457 301.2273 270.7297 445.1489 390.7584 644.4566 562.8620 874.2834 787.3342 1204.1228 1157.0804 1637.1219 1707.1890 2354.3198 2687.3918 3693.7432 4604.2314 6776.5400 8673.2559
|
| 11 |
+
type eval | step 2250 | loss 119.4043 243.0167 378.1810 576.9497 842.0344 1210.6526 1718.7483 2364.1787 3351.0349 5017.1846 8224.4756 15252.2441 | checkpoint True True False True True True False True True True True True | ce_loss 3.2555 | sae_losses 47.6997 65.5920 134.7837 95.7120 189.2448 173.1502 297.8914 260.4036 442.0708 377.8212 637.6944 546.4368 867.4298 820.1442 1191.0140 1133.6644 1617.7815 1684.6475 2320.5269 2634.6646 3631.5447 4511.9624 6649.9893 8500.3643 | ce_loss_increases 6.1378 0.2526 0.5095 0.4751 0.5230 0.5354 0.5870 0.5063 0.4916 0.4415 0.4883 0.7837 | compound_ce_loss_increase 7.1007 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 6.1125 12.5210 15.7860 18.6546 22.1425 26.5212 31.1739 39.5005 48.6058 61.9931 80.9671 101.8954 | recon_l2 47.6997 65.5920 134.7837 95.7120 189.2448 173.1502 297.8914 260.4036 442.0708 377.8212 637.6944 546.4368 867.4298 820.1442 1191.0140 1133.6644 1617.7815 1684.6475 2320.5269 2634.6646 3631.5447 4511.9624 6649.9893 8500.3643
|
| 12 |
+
type eval | step 2500 | loss 114.7144 235.3493 361.3765 562.5886 825.7578 1187.9587 1663.6831 2331.1763 3302.0996 4934.2715 8060.7402 14844.4639 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 47.1169 61.7535 133.1134 90.5974 187.4667 159.0676 294.4609 250.5690 439.0154 365.9249 632.4633 530.4269 861.0643 772.8090 1180.9963 1112.8374 1601.3682 1654.1696 2288.2817 2586.4417 3572.0305 4410.4419 6508.6128 8234.8809 | ce_loss_increases 6.2145 0.2422 0.4852 0.4679 0.5255 0.5278 0.5457 0.4996 0.4796 0.4257 0.4721 0.7460 | compound_ce_loss_increase 7.7281 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 5.8439 11.6385 14.8423 17.5587 20.8172 25.0686 29.8102 37.3424 46.5616 59.5486 78.2676 100.9710 | recon_l2 47.1169 61.7535 133.1134 90.5974 187.4667 159.0676 294.4609 250.5690 439.0154 365.9249 632.4633 530.4269 861.0643 772.8090 1180.9963 1112.8374 1601.3682 1654.1696 2288.2817 2586.4417 3572.0305 4410.4419 6508.6128 8234.8809
|
| 13 |
+
type eval | step 2750 | loss 111.5649 230.9908 352.8335 554.5011 817.6702 1175.5730 1638.8199 2317.8950 3270.0176 4905.9771 8003.5806 14548.5146 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 46.6313 59.3723 132.7191 87.3574 186.8804 151.9476 293.3723 244.5123 438.5253 359.3916 630.8286 520.8700 860.2069 750.1198 1178.2979 1103.8943 1594.7191 1630.4410 2279.3247 2569.0613 3548.1641 4379.1724 6380.7402 8067.1270 | ce_loss_increases 6.0522 0.2400 0.4749 0.4620 0.5334 0.5360 0.5477 0.4996 0.4733 0.4210 0.4690 0.7192 | compound_ce_loss_increase 7.6701 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 5.5613 10.9143 14.0055 16.6164 19.7533 23.8749 28.4936 35.7028 44.8571 57.5923 76.2419 100.6455 | recon_l2 46.6313 59.3723 132.7191 87.3574 186.8804 151.9476 293.3723 244.5123 438.5253 359.3916 630.8286 520.8700 860.2069 750.1198 1178.2979 1103.8943 1594.7191 1630.4410 2279.3247 2569.0613 3548.1641 4379.1724 6380.7402 8067.1270
|
| 14 |
+
type eval | step 3000 | loss 108.7469 226.8173 346.3229 546.3054 808.1017 1212.3265 1620.8588 2300.9719 3235.3958 4867.2998 7929.4556 14375.6934 | checkpoint True True True True True False True True True True True True | ce_loss 3.2555 | sae_losses 46.1274 57.2553 132.0339 84.4298 185.8408 147.1815 291.1909 239.3169 436.7019 352.5100 628.0987 561.3898 856.7819 736.6304 1174.1267 1092.4888 1586.1338 1606.3293 2261.2166 2549.5720 3513.9758 4341.1172 6308.3354 7968.3467 | ce_loss_increases 5.9876 0.2409 0.4808 0.4640 0.5387 0.5617 0.5493 0.5015 0.4652 0.4189 0.4629 0.7058 | compound_ce_loss_increase 7.8578 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 5.3642 10.3537 13.3005 15.7975 18.8896 22.8380 27.4460 34.3564 42.9329 56.5130 74.3633 99.0106 | recon_l2 46.1274 57.2553 132.0339 84.4298 185.8408 147.1815 291.1909 239.3169 436.7019 352.5100 628.0987 561.3898 856.7819 736.6304 1174.1267 1092.4888 1586.1338 1606.3293 2261.2166 2549.5720 3513.9758 4341.1172 6308.3354 7968.3467
|
| 15 |
+
type eval | step 3250 | loss 106.4248 223.7034 341.8616 564.8652 800.1475 1172.7136 1608.4694 2287.0303 3207.1956 4829.3716 7863.5850 14211.7080 | checkpoint True True True False True True True True True True True True | ce_loss 3.2555 | sae_losses 45.8015 55.4216 131.8136 81.9611 185.5466 143.6066 290.1953 259.4831 435.6870 346.2761 625.9606 524.5527 855.2346 726.6537 1171.7985 1081.9243 1578.6611 1587.1714 2247.9417 2526.1672 3486.2185 4304.4922 6240.3633 7873.3164 | ce_loss_increases 5.9349 0.2372 0.4927 0.4890 0.5457 0.5605 0.5480 0.4989 0.4560 0.4137 0.4550 0.6895 | compound_ce_loss_increase 7.1805 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 5.2017 9.9287 12.7082 15.1868 18.1846 22.2002 26.5812 33.3077 41.3629 55.2633 72.8734 98.0297 | recon_l2 45.8015 55.4216 131.8136 81.9611 185.5466 143.6066 290.1953 259.4831 435.6870 346.2761 625.9606 524.5527 855.2346 726.6537 1171.7985 1081.9243 1578.6611 1587.1714 2247.9417 2526.1672 3486.2185 4304.4922 6240.3633 7873.3164
|
| 16 |
+
type eval | step 3500 | loss 104.9158 221.6778 339.8033 550.4977 796.7444 1159.8379 1608.4510 2282.0603 3196.0784 4816.0293 7846.7256 14150.5332 | checkpoint True True True False True True True True True True True True | ce_loss 3.2555 | sae_losses 45.6175 54.2233 131.7776 80.3082 185.7480 141.8307 290.0379 245.7278 436.4572 342.6617 626.9047 511.3346 857.1643 725.3232 1172.4392 1077.0480 1577.4685 1578.4564 2246.3250 2515.5967 3479.4458 4295.7773 6222.7129 7830.5747 | ce_loss_increases 5.8553 0.2410 0.5067 0.4778 0.5523 0.5625 0.5548 0.5045 0.4557 0.4115 0.4521 0.6806 | compound_ce_loss_increase 7.6398 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 5.0750 9.5919 12.2246 14.7322 17.6258 21.5984 25.9634 32.5737 40.1537 54.1065 71.5015 97.2396 | recon_l2 45.6175 54.2233 131.7776 80.3082 185.7480 141.8307 290.0379 245.7278 436.4572 342.6617 626.9047 511.3346 857.1643 725.3232 1172.4392 1077.0480 1577.4685 1578.4564 2246.3250 2515.5967 3479.4458 4295.7773 6222.7129 7830.5747
|
| 17 |
+
type eval | step 3750 | loss 103.4859 219.8248 337.4513 543.2390 847.4277 1150.8466 1605.4104 2272.6440 3181.3411 4794.8179 7811.8511 14055.0010 | checkpoint True True True True False True True True True True True True | ce_loss 3.2555 | sae_losses 45.3594 53.1908 131.7152 78.8078 185.6380 139.9852 289.5286 239.3109 435.9422 394.3231 625.8914 503.9179 856.1522 723.8322 1171.1191 1069.6647 1574.9589 1567.2936 2238.9475 2502.7114 3462.6509 4278.6958 6192.7080 7765.6211 | ce_loss_increases 5.8299 0.2374 0.5100 0.4855 0.6296 0.5785 0.5655 0.5055 0.4542 0.4087 0.4501 0.6722 | compound_ce_loss_increase 7.0157 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 4.9357 9.3017 11.8281 14.3995 17.1627 21.0374 25.4262 31.8601 39.0881 53.1597 70.5043 96.6739 | recon_l2 45.3594 53.1908 131.7152 78.8078 185.6380 139.9852 289.5286 239.3109 435.9422 394.3231 625.8914 503.9179 856.1522 723.8322 1171.1191 1069.6647 1574.9589 1567.2936 2238.9475 2502.7114 3462.6509 4278.6958 6192.7080 7765.6211
|
| 18 |
+
type eval | step 4000 | loss 102.2357 218.0208 335.1172 537.8822 816.0225 1143.6210 1601.6965 2262.2520 3166.1145 4770.0020 7773.4316 13857.1279 | checkpoint True True True True False True True True True True True True | ce_loss 3.2555 | sae_losses 45.1872 52.2090 131.4927 77.4425 185.2430 138.3292 288.8839 234.8570 435.0349 364.0985 625.0569 497.9617 855.7354 721.0585 1169.7008 1061.3329 1571.8840 1555.9630 2232.3042 2485.4580 3446.3528 4257.4868 6144.7808 7615.9346 | ce_loss_increases 5.7971 0.2433 0.5235 0.4947 0.5900 0.5742 0.5659 0.5060 0.4517 0.4040 0.4451 0.6572 | compound_ce_loss_increase 6.9432 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 4.8395 9.0856 11.5449 14.1413 16.8892 20.6023 24.9028 31.2171 38.2677 52.2405 69.5913 96.4135 | recon_l2 45.1872 52.2090 131.4927 77.4425 185.2430 138.3292 288.8839 234.8570 435.0349 364.0985 625.0569 497.9617 855.7354 721.0585 1169.7008 1061.3329 1571.8840 1555.9630 2232.3042 2485.4580 3446.3528 4257.4868 6144.7808 7615.9346
|
| 19 |
+
type eval | step 4250 | loss 101.4572 216.9886 334.1419 535.0634 804.6584 1141.5352 1601.2433 2260.7761 3160.5273 4765.2495 7769.3799 13779.0908 | checkpoint True True True True False True True True True True True True | ce_loss 3.2555 | sae_losses 45.1002 51.6112 131.5564 76.5605 185.4651 137.3701 289.5799 231.6377 436.0503 352.0357 626.8217 494.5551 857.6467 719.1106 1171.3500 1058.7756 1572.5192 1550.4065 2232.5540 2481.2046 3443.0684 4257.4126 6128.2905 7554.7515 | ce_loss_increases 5.7329 0.2449 0.5265 0.5033 0.5777 0.5826 0.5692 0.5154 0.4540 0.4025 0.4445 0.6487 | compound_ce_loss_increase 7.1303 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 4.7458 8.8717 11.3067 13.8458 16.5728 20.1582 24.4860 30.6503 37.6007 51.4913 68.8965 96.0502 | recon_l2 45.1002 51.6112 131.5564 76.5605 185.4651 137.3701 289.5799 231.6377 436.0503 352.0357 626.8217 494.5551 857.6467 719.1106 1171.3500 1058.7756 1572.5192 1550.4065 2232.5540 2481.2046 3443.0684 4257.4126 6128.2905 7554.7515
|
| 20 |
+
type eval | step 4500 | loss 100.7069 216.0949 332.6963 532.6974 799.5189 1139.0879 1598.2098 2257.8552 3153.6987 4752.9116 7744.6675 13694.8311 | checkpoint True True True True False True True True True True True True | ce_loss 3.2555 | sae_losses 44.9773 51.0597 131.5543 75.8213 185.2286 136.3349 289.4462 229.6160 435.9752 347.1808 627.2365 492.0203 857.9547 716.1136 1171.7067 1055.9294 1573.0924 1543.6521 2229.1492 2472.8611 3430.3455 4245.9922 6079.9517 7518.9082 | ce_loss_increases 5.7487 0.2471 0.5346 0.5051 0.5966 0.5931 0.5718 0.5256 0.4562 0.4013 0.4402 0.6425 | compound_ce_loss_increase 7.4181 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 4.6699 8.7193 11.1327 13.6353 16.3628 19.8310 24.1415 30.2194 36.9545 50.9008 68.3306 95.9717 | recon_l2 44.9773 51.0597 131.5543 75.8213 185.2286 136.3349 289.4462 229.6160 435.9752 347.1808 627.2365 492.0203 857.9547 716.1136 1171.7067 1055.9294 1573.0924 1543.6521 2229.1492 2472.8611 3430.3455 4245.9922 6079.9517 7518.9082
|
| 21 |
+
type eval | step 4750 | loss 99.9209 215.0471 330.9441 529.8846 793.9942 1136.2942 1594.0957 2252.1016 3144.4724 4737.9785 7720.9478 13615.7490 | checkpoint True True True True True True True True True True True True | ce_loss 3.2555 | sae_losses 44.8468 50.4617 131.4196 75.0493 184.9507 135.0185 289.1434 227.2924 435.4624 342.3524 626.9265 489.8327 857.9612 712.3328 1170.8230 1051.4523 1571.7897 1536.3010 2225.3293 2462.4192 3419.6306 4233.5635 6037.3521 7482.7837 | ce_loss_increases 5.7096 0.2481 0.5373 0.5180 0.5973 0.6004 0.5721 0.5315 0.4563 0.4012 0.4395 0.6378 | compound_ce_loss_increase 7.2870 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 4.6125 8.5782 10.9750 13.4487 16.1794 19.5350 23.8015 29.8266 36.3812 50.2301 67.7551 95.6188 | recon_l2 44.8468 50.4617 131.4196 75.0493 184.9507 135.0185 289.1434 227.2924 435.4624 342.3524 626.9265 489.8327 857.9612 712.3328 1170.8230 1051.4523 1571.7897 1536.3010 2225.3293 2462.4192 3419.6306 4233.5635 6037.3521 7482.7837
|
| 22 |
+
type eval | step 5000 | loss 99.5399 214.8708 330.9410 529.9941 793.6228 1137.7212 1594.8652 2253.2727 3144.0276 4741.0420 7730.1113 13605.8770 | checkpoint True True True False True False False False True False False True | ce_loss 3.2555 | sae_losses 44.8414 50.1242 131.7381 74.7048 185.5606 134.5683 290.2479 226.4892 437.0598 340.5843 628.9772 489.5003 860.3865 711.0188 1173.7788 1049.9679 1574.0374 1534.0377 2228.9849 2462.3274 3424.2085 4238.6294 6031.3608 7479.1494 | ce_loss_increases 5.6500 0.2495 0.5411 0.5318 0.6055 0.6083 0.5713 0.5373 0.4602 0.4011 0.4405 0.6330 | compound_ce_loss_increase 7.3846 | l0s 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 32.0000 | stream_l1s 309.8509 693.8199 747.4986 810.8648 861.4839 916.4980 983.0375 1051.8395 1117.2389 1179.2896 1269.0573 1336.7666 1441.0857 1524.2220 1674.8420 1794.1229 1954.1461 2121.0488 2354.2358 2610.6741 2943.0991 3357.1912 4390.7090 4811.3340 | ∇_l1 4.5744 8.4280 10.8121 13.2571 15.9788 19.2434 23.4601 29.5259 35.9531 49.7295 67.2726 95.3650 | recon_l2 44.8414 50.1242 131.7381 74.7048 185.5606 134.5683 290.2479 226.4892 437.0598 340.5843 628.9772 489.5003 860.3865 711.0188 1173.7788 1049.9679 1574.0374 1534.0377 2228.9849 2462.3274 3424.2085 4238.6294 6031.3608 7479.1494
|
model.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"block_size": 1024, "vocab_size": 50257, "n_layer": 12, "n_head": 12, "n_embd": 768, "norm_strategy": "LayerNorm"}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f76d410b06d9bcf56953e112c862553a8f38d07c118afe54a0a67b5f1af4a3bb
|
| 3 |
+
size 497774344
|
sae.0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:75a15e8757a0ad62312ab97f2f2eb5fe7b2604c9293e465a4a12faf0f2053a83
|
| 3 |
+
size 151096640
|
sae.1.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c3bdecdb741872147adb122e43adb140bd3261a5fc9dccf380b7fd6ac4ac810
|
| 3 |
+
size 151096640
|
sae.10.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:646614fc47bac1b58ea6ecd87c4b5bfb0bce7e68e06a81d15ae739bad292b592
|
| 3 |
+
size 151096640
|
sae.11.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a0d1a56a9c0c2a720ff1f863046223ee8db58a2b08a74c2525c371c37b785968
|
| 3 |
+
size 151096640
|
sae.12.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a9de3a3305a15f32f5e7701ba3c5493f51ef7fa962867d46b9ae0a1cf26d9e1b
|
| 3 |
+
size 151096640
|
sae.13.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2a7ac2c31d16794cf9de3c8a519f69b6920ee1dbb389f8487a472b57f2ba5eaa
|
| 3 |
+
size 151096640
|
sae.14.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a306c78c6c1ed3c314fa9de5ce6fa341480899439d04f43da8a8735d479f129a
|
| 3 |
+
size 151096640
|
sae.15.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3fa11c9f4ac0c4d6dab412c108c3097a1f26367d7a129e7d10ff0042dd2329cc
|
| 3 |
+
size 151096640
|
sae.16.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9460dae2991d21f1a70dff0170023ca37dad2ef929281a70942209bc3767d4a9
|
| 3 |
+
size 151096640
|
sae.17.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f8c64a34fc30811f84b6aa88ae3b8466f1736a47e3497ca69aded1578177d8e3
|
| 3 |
+
size 151096640
|
sae.18.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1d7d2d9a36e8389fd8d71a17ffd314301ccb4601be19bdbce4a13290ab7c2e37
|
| 3 |
+
size 151096640
|
sae.19.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2721c53f6c62da45ccc9effc50d1347ac987ef3dd02bc3b36fb26026282af9a3
|
| 3 |
+
size 151096640
|
sae.2.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:092160a5d6517c05f44712536ee626f50a015dca1f2ee6932ce4f3858bde56a5
|
| 3 |
+
size 151096640
|
sae.20.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:57430680e007439d6590a4fa1d77f4b5591f6d5a99a29c9d76deb878fec3f2bc
|
| 3 |
+
size 151096640
|
sae.21.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7dfbb74a4612315868392c57be4696334a8a2470a217306385d8ba12b959998d
|
| 3 |
+
size 151096640
|
sae.22.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c49a56069cdb9a102232b821e284066f9db30280efb6dd3bb32a06ec785f7d0e
|
| 3 |
+
size 151096640
|
sae.23.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4f7ced43b029f0106ec9f93461ec2526f5a84ff79b7157033651a1f93f2e2080
|
| 3 |
+
size 151096640
|
sae.3.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:869c59697c787fd6427311df84e14e16d9328acd02359ca4e32c0cea3aa3f206
|
| 3 |
+
size 151096640
|
sae.4.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2fe5e65190e52630728deaa9db2438371a9afa1122b1b88991c973991fa03c55
|
| 3 |
+
size 151096640
|
sae.5.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4dcb4d361f8a1b43a9d91c5a638bb3cbade560b3ebd91b466eebe93234108109
|
| 3 |
+
size 151096640
|
sae.6.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dfdec6bfbeff1d9c0316206303163f88e3ef1ea3ccc82f8be57e24833d545a72
|
| 3 |
+
size 151096640
|
sae.7.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:09b011d945f253474448f66a1f8cd980b2a455527ead04297eb13d090c7fd9be
|
| 3 |
+
size 151096640
|
sae.8.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:877cf2a4ecef41ad00a619fb863cd4fe6a3f0677ac904d40beb5119a0f190dbe
|
| 3 |
+
size 151096640
|
sae.9.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e6eae0a2132c2d8efad37f462de780313c7d7e1ac5883f3c84215327845ce25c
|
| 3 |
+
size 151096640
|
sae.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"n_features": [24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576], "sae_variant": "jsae_block", "top_k": [32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32], "sae_keys": ["0_residmid", "0_residpost", "1_residmid", "1_residpost", "2_residmid", "2_residpost", "3_residmid", "3_residpost", "4_residmid", "4_residpost", "5_residmid", "5_residpost", "6_residmid", "6_residpost", "7_residmid", "7_residpost", "8_residmid", "8_residpost", "9_residmid", "9_residpost", "10_residmid", "10_residpost", "11_residmid", "11_residpost"]}
|
train.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|