| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 25.64, |
| "eval_steps": 500, |
| "global_step": 5000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 0.0512, |
| "grad_norm": 0.86328125, |
| "learning_rate": 3.076923076923077e-05, |
| "loss": 12.4822, |
| "router_z_loss": 0.0, |
| "step": 10 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 0.1024, |
| "grad_norm": 0.90625, |
| "learning_rate": 6.153846153846154e-05, |
| "loss": 12.2583, |
| "router_z_loss": 0.0, |
| "step": 20 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 0.1536, |
| "grad_norm": 0.84765625, |
| "learning_rate": 9.230769230769232e-05, |
| "loss": 11.7958, |
| "router_z_loss": 0.0, |
| "step": 30 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 0.2048, |
| "grad_norm": 0.8203125, |
| "learning_rate": 0.00012307692307692307, |
| "loss": 11.1369, |
| "router_z_loss": 0.0, |
| "step": 40 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 0.256, |
| "grad_norm": 1.015625, |
| "learning_rate": 0.00015384615384615385, |
| "loss": 10.2261, |
| "router_z_loss": 0.0, |
| "step": 50 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 0.3072, |
| "grad_norm": 0.9140625, |
| "learning_rate": 0.00018461538461538463, |
| "loss": 9.3587, |
| "router_z_loss": 0.0, |
| "step": 60 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 0.3584, |
| "grad_norm": 0.5546875, |
| "learning_rate": 0.0002153846153846154, |
| "loss": 8.7066, |
| "router_z_loss": 0.0, |
| "step": 70 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 0.4096, |
| "grad_norm": 4.03125, |
| "learning_rate": 0.00024615384615384614, |
| "loss": 8.2878, |
| "router_z_loss": 0.0, |
| "step": 80 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 0.4608, |
| "grad_norm": 2.375, |
| "learning_rate": 0.00027692307692307695, |
| "loss": 7.9893, |
| "router_z_loss": 0.0, |
| "step": 90 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 0.512, |
| "grad_norm": 2.109375, |
| "learning_rate": 0.0003076923076923077, |
| "loss": 7.6368, |
| "router_z_loss": 0.0, |
| "step": 100 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 0.5632, |
| "grad_norm": 0.5078125, |
| "learning_rate": 0.00033846153846153846, |
| "loss": 7.1774, |
| "router_z_loss": 0.0, |
| "step": 110 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 0.6144, |
| "grad_norm": 0.5703125, |
| "learning_rate": 0.00036923076923076927, |
| "loss": 6.6833, |
| "router_z_loss": 0.0, |
| "step": 120 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 0.6656, |
| "grad_norm": 1.609375, |
| "learning_rate": 0.0004, |
| "loss": 6.2922, |
| "router_z_loss": 0.0, |
| "step": 130 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 0.7168, |
| "grad_norm": 0.75390625, |
| "learning_rate": 0.0004307692307692308, |
| "loss": 6.0386, |
| "router_z_loss": 0.0, |
| "step": 140 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 0.768, |
| "grad_norm": 0.71484375, |
| "learning_rate": 0.0004615384615384616, |
| "loss": 5.8256, |
| "router_z_loss": 0.0, |
| "step": 150 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 0.8192, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.0004923076923076923, |
| "loss": 5.6412, |
| "router_z_loss": 0.0, |
| "step": 160 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 0.8704, |
| "grad_norm": 0.91015625, |
| "learning_rate": 0.0005230769230769231, |
| "loss": 5.4734, |
| "router_z_loss": 0.0, |
| "step": 170 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 0.9216, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.0005538461538461539, |
| "loss": 5.3141, |
| "router_z_loss": 0.0, |
| "step": 180 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 0.9728, |
| "grad_norm": 0.9375, |
| "learning_rate": 0.0005846153846153846, |
| "loss": 5.1415, |
| "router_z_loss": 0.0, |
| "step": 190 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 1.0256, |
| "grad_norm": 0.81640625, |
| "learning_rate": 0.0006153846153846154, |
| "loss": 5.5058, |
| "router_z_loss": 0.0, |
| "step": 200 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 1.0768, |
| "grad_norm": 0.5546875, |
| "learning_rate": 0.0006461538461538462, |
| "loss": 4.8577, |
| "router_z_loss": 0.0, |
| "step": 210 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 1.1280000000000001, |
| "grad_norm": 0.859375, |
| "learning_rate": 0.0006769230769230769, |
| "loss": 4.7405, |
| "router_z_loss": 0.0, |
| "step": 220 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 1.1792, |
| "grad_norm": 0.75, |
| "learning_rate": 0.0007076923076923077, |
| "loss": 4.6311, |
| "router_z_loss": 0.0, |
| "step": 230 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 1.2304, |
| "grad_norm": 1.5625, |
| "learning_rate": 0.0007384615384615385, |
| "loss": 4.4939, |
| "router_z_loss": 0.0, |
| "step": 240 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 1.2816, |
| "grad_norm": 1.8046875, |
| "learning_rate": 0.0007692307692307692, |
| "loss": 4.4278, |
| "router_z_loss": 0.0, |
| "step": 250 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 1.3328, |
| "grad_norm": 1.4453125, |
| "learning_rate": 0.0008, |
| "loss": 4.3309, |
| "router_z_loss": 0.0, |
| "step": 260 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 1.384, |
| "grad_norm": 0.52734375, |
| "learning_rate": 0.0008307692307692308, |
| "loss": 4.2308, |
| "router_z_loss": 0.0, |
| "step": 270 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 1.4352, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.0008615384615384615, |
| "loss": 4.1648, |
| "router_z_loss": 0.0, |
| "step": 280 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 1.4864, |
| "grad_norm": 1.0859375, |
| "learning_rate": 0.0008923076923076924, |
| "loss": 4.0882, |
| "router_z_loss": 0.0, |
| "step": 290 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 1.5375999999999999, |
| "grad_norm": 0.58203125, |
| "learning_rate": 0.0009230769230769232, |
| "loss": 4.0039, |
| "router_z_loss": 0.0, |
| "step": 300 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 1.5888, |
| "grad_norm": 0.68359375, |
| "learning_rate": 0.0009538461538461538, |
| "loss": 3.9642, |
| "router_z_loss": 0.0, |
| "step": 310 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 1.6400000000000001, |
| "grad_norm": 0.77734375, |
| "learning_rate": 0.0009846153846153846, |
| "loss": 3.8901, |
| "router_z_loss": 0.0, |
| "step": 320 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 1.6912, |
| "grad_norm": 0.9453125, |
| "learning_rate": 0.0010153846153846155, |
| "loss": 3.8254, |
| "router_z_loss": 0.0, |
| "step": 330 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 1.7424, |
| "grad_norm": 0.71484375, |
| "learning_rate": 0.0010461538461538462, |
| "loss": 3.7841, |
| "router_z_loss": 0.0, |
| "step": 340 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 1.7936, |
| "grad_norm": 0.79296875, |
| "learning_rate": 0.0010769230769230769, |
| "loss": 3.751, |
| "router_z_loss": 0.0, |
| "step": 350 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 1.8448, |
| "grad_norm": 0.890625, |
| "learning_rate": 0.0011076923076923078, |
| "loss": 3.6586, |
| "router_z_loss": 0.0, |
| "step": 360 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 1.896, |
| "grad_norm": 1.4140625, |
| "learning_rate": 0.0011384615384615385, |
| "loss": 3.6627, |
| "router_z_loss": 0.0, |
| "step": 370 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 1.9472, |
| "grad_norm": 0.5859375, |
| "learning_rate": 0.0011692307692307692, |
| "loss": 3.5893, |
| "router_z_loss": 0.0, |
| "step": 380 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 1.9984, |
| "grad_norm": 0.5703125, |
| "learning_rate": 0.0012000000000000001, |
| "loss": 3.5727, |
| "router_z_loss": 0.0, |
| "step": 390 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 2.0512, |
| "grad_norm": 0.400390625, |
| "learning_rate": 0.0012307692307692308, |
| "loss": 3.8897, |
| "router_z_loss": 0.0, |
| "step": 400 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 2.1024, |
| "grad_norm": 0.486328125, |
| "learning_rate": 0.0012615384615384615, |
| "loss": 3.501, |
| "router_z_loss": 0.0, |
| "step": 410 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 2.1536, |
| "grad_norm": 0.87109375, |
| "learning_rate": 0.0012923076923076924, |
| "loss": 3.4462, |
| "router_z_loss": 0.0, |
| "step": 420 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 2.2048, |
| "grad_norm": 0.5078125, |
| "learning_rate": 0.0013230769230769231, |
| "loss": 3.4335, |
| "router_z_loss": 0.0, |
| "step": 430 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 2.2560000000000002, |
| "grad_norm": 0.53515625, |
| "learning_rate": 0.0013538461538461538, |
| "loss": 3.3947, |
| "router_z_loss": 0.0, |
| "step": 440 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 2.3072, |
| "grad_norm": 0.66796875, |
| "learning_rate": 0.0013846153846153847, |
| "loss": 3.37, |
| "router_z_loss": 0.0, |
| "step": 450 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 2.3584, |
| "grad_norm": 0.5703125, |
| "learning_rate": 0.0014153846153846154, |
| "loss": 3.345, |
| "router_z_loss": 0.0, |
| "step": 460 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 2.4096, |
| "grad_norm": 0.447265625, |
| "learning_rate": 0.0014461538461538461, |
| "loss": 3.355, |
| "router_z_loss": 0.0, |
| "step": 470 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 2.4608, |
| "grad_norm": 0.8671875, |
| "learning_rate": 0.001476923076923077, |
| "loss": 3.3138, |
| "router_z_loss": 0.0, |
| "step": 480 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 2.512, |
| "grad_norm": 0.6171875, |
| "learning_rate": 0.0015076923076923078, |
| "loss": 3.2843, |
| "router_z_loss": 0.0, |
| "step": 490 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 2.5632, |
| "grad_norm": 0.60546875, |
| "learning_rate": 0.0015384615384615385, |
| "loss": 3.2832, |
| "router_z_loss": 0.0, |
| "step": 500 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 2.6144, |
| "grad_norm": 0.53125, |
| "learning_rate": 0.0015692307692307694, |
| "loss": 3.2416, |
| "router_z_loss": 0.0, |
| "step": 510 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 2.6656, |
| "grad_norm": 0.421875, |
| "learning_rate": 0.0016, |
| "loss": 3.2343, |
| "router_z_loss": 0.0, |
| "step": 520 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 2.7168, |
| "grad_norm": 0.404296875, |
| "learning_rate": 0.0016307692307692308, |
| "loss": 3.2167, |
| "router_z_loss": 0.0, |
| "step": 530 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 2.768, |
| "grad_norm": 0.4765625, |
| "learning_rate": 0.0016615384615384617, |
| "loss": 3.1902, |
| "router_z_loss": 0.0, |
| "step": 540 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 2.8192, |
| "grad_norm": 0.4921875, |
| "learning_rate": 0.0016923076923076924, |
| "loss": 3.2147, |
| "router_z_loss": 0.0, |
| "step": 550 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 2.8704, |
| "grad_norm": 0.640625, |
| "learning_rate": 0.001723076923076923, |
| "loss": 3.1914, |
| "router_z_loss": 0.0, |
| "step": 560 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 2.9215999999999998, |
| "grad_norm": 0.396484375, |
| "learning_rate": 0.001753846153846154, |
| "loss": 3.1648, |
| "router_z_loss": 0.0, |
| "step": 570 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 2.9728, |
| "grad_norm": 0.56640625, |
| "learning_rate": 0.0017846153846153847, |
| "loss": 3.1473, |
| "router_z_loss": 0.0, |
| "step": 580 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 3.0256, |
| "grad_norm": 1.984375, |
| "learning_rate": 0.0018153846153846154, |
| "loss": 3.555, |
| "router_z_loss": 0.0, |
| "step": 590 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 3.0768, |
| "grad_norm": 0.88671875, |
| "learning_rate": 0.0018461538461538463, |
| "loss": 3.2716, |
| "router_z_loss": 0.0, |
| "step": 600 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 3.128, |
| "grad_norm": 0.3515625, |
| "learning_rate": 0.001876923076923077, |
| "loss": 3.1852, |
| "router_z_loss": 0.0, |
| "step": 610 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 3.1792, |
| "grad_norm": 0.345703125, |
| "learning_rate": 0.0019076923076923075, |
| "loss": 3.1403, |
| "router_z_loss": 0.0, |
| "step": 620 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 3.2304, |
| "grad_norm": 0.4296875, |
| "learning_rate": 0.0019384615384615386, |
| "loss": 3.1375, |
| "router_z_loss": 0.0, |
| "step": 630 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 3.2816, |
| "grad_norm": 0.359375, |
| "learning_rate": 0.001969230769230769, |
| "loss": 3.1062, |
| "router_z_loss": 0.0, |
| "step": 640 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 3.3327999999999998, |
| "grad_norm": 0.486328125, |
| "learning_rate": 0.002, |
| "loss": 3.0988, |
| "router_z_loss": 0.0, |
| "step": 650 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 3.384, |
| "grad_norm": 0.458984375, |
| "learning_rate": 0.002030769230769231, |
| "loss": 3.0966, |
| "router_z_loss": 0.0, |
| "step": 660 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 3.4352, |
| "grad_norm": 0.33203125, |
| "learning_rate": 0.0020615384615384614, |
| "loss": 3.0842, |
| "router_z_loss": 0.0, |
| "step": 670 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 3.4864, |
| "grad_norm": 0.306640625, |
| "learning_rate": 0.0020923076923076924, |
| "loss": 3.0998, |
| "router_z_loss": 0.0, |
| "step": 680 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 3.5376, |
| "grad_norm": 0.4140625, |
| "learning_rate": 0.0021230769230769233, |
| "loss": 3.074, |
| "router_z_loss": 0.0, |
| "step": 690 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 3.5888, |
| "grad_norm": 0.408203125, |
| "learning_rate": 0.0021538461538461538, |
| "loss": 3.0512, |
| "router_z_loss": 0.0, |
| "step": 700 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 3.64, |
| "grad_norm": 0.271484375, |
| "learning_rate": 0.0021846153846153847, |
| "loss": 3.0719, |
| "router_z_loss": 0.0, |
| "step": 710 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 3.6912000000000003, |
| "grad_norm": 0.3828125, |
| "learning_rate": 0.0022153846153846156, |
| "loss": 3.0421, |
| "router_z_loss": 0.0, |
| "step": 720 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 3.7424, |
| "grad_norm": 0.337890625, |
| "learning_rate": 0.002246153846153846, |
| "loss": 3.061, |
| "router_z_loss": 0.0, |
| "step": 730 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 3.7936, |
| "grad_norm": 0.26171875, |
| "learning_rate": 0.002276923076923077, |
| "loss": 3.0319, |
| "router_z_loss": 0.0, |
| "step": 740 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 3.8448, |
| "grad_norm": 1.453125, |
| "learning_rate": 0.002307692307692308, |
| "loss": 3.0467, |
| "router_z_loss": 0.0, |
| "step": 750 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 3.896, |
| "grad_norm": 0.296875, |
| "learning_rate": 0.0023384615384615384, |
| "loss": 3.0424, |
| "router_z_loss": 0.0, |
| "step": 760 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 3.9472, |
| "grad_norm": 0.333984375, |
| "learning_rate": 0.0023692307692307693, |
| "loss": 3.0409, |
| "router_z_loss": 0.0, |
| "step": 770 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 3.9984, |
| "grad_norm": 0.41015625, |
| "learning_rate": 0.0024000000000000002, |
| "loss": 3.0104, |
| "router_z_loss": 0.0, |
| "step": 780 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 4.0512, |
| "grad_norm": 0.2578125, |
| "learning_rate": 0.0024307692307692307, |
| "loss": 3.3914, |
| "router_z_loss": 0.0, |
| "step": 790 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 4.1024, |
| "grad_norm": 0.26171875, |
| "learning_rate": 0.0024615384615384616, |
| "loss": 2.9982, |
| "router_z_loss": 0.0, |
| "step": 800 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 4.1536, |
| "grad_norm": 0.2890625, |
| "learning_rate": 0.0024923076923076925, |
| "loss": 3.0196, |
| "router_z_loss": 0.0, |
| "step": 810 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 4.2048, |
| "grad_norm": 0.2578125, |
| "learning_rate": 0.002523076923076923, |
| "loss": 2.9782, |
| "router_z_loss": 0.0, |
| "step": 820 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 4.256, |
| "grad_norm": 0.271484375, |
| "learning_rate": 0.002553846153846154, |
| "loss": 2.9809, |
| "router_z_loss": 0.0, |
| "step": 830 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 4.3072, |
| "grad_norm": 0.359375, |
| "learning_rate": 0.002584615384615385, |
| "loss": 2.9719, |
| "router_z_loss": 0.0, |
| "step": 840 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 4.3584, |
| "grad_norm": 0.275390625, |
| "learning_rate": 0.0026153846153846153, |
| "loss": 2.967, |
| "router_z_loss": 0.0, |
| "step": 850 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 4.4096, |
| "grad_norm": 0.2490234375, |
| "learning_rate": 0.0026461538461538463, |
| "loss": 2.951, |
| "router_z_loss": 0.0, |
| "step": 860 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 4.4608, |
| "grad_norm": 0.279296875, |
| "learning_rate": 0.002676923076923077, |
| "loss": 2.9363, |
| "router_z_loss": 0.0, |
| "step": 870 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 4.5120000000000005, |
| "grad_norm": 0.26171875, |
| "learning_rate": 0.0027076923076923077, |
| "loss": 2.9139, |
| "router_z_loss": 0.0, |
| "step": 880 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 4.5632, |
| "grad_norm": 0.255859375, |
| "learning_rate": 0.0027384615384615386, |
| "loss": 2.9451, |
| "router_z_loss": 0.0, |
| "step": 890 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 4.6144, |
| "grad_norm": 0.2578125, |
| "learning_rate": 0.0027692307692307695, |
| "loss": 2.9361, |
| "router_z_loss": 0.0, |
| "step": 900 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 4.6655999999999995, |
| "grad_norm": 0.255859375, |
| "learning_rate": 0.0028, |
| "loss": 2.9243, |
| "router_z_loss": 0.0, |
| "step": 910 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 4.7168, |
| "grad_norm": 0.337890625, |
| "learning_rate": 0.002830769230769231, |
| "loss": 2.9238, |
| "router_z_loss": 0.0, |
| "step": 920 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 4.768, |
| "grad_norm": 0.2177734375, |
| "learning_rate": 0.002861538461538462, |
| "loss": 2.9097, |
| "router_z_loss": 0.0, |
| "step": 930 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 4.8192, |
| "grad_norm": 0.365234375, |
| "learning_rate": 0.0028923076923076923, |
| "loss": 2.9009, |
| "router_z_loss": 0.0, |
| "step": 940 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 4.8704, |
| "grad_norm": 0.248046875, |
| "learning_rate": 0.002923076923076923, |
| "loss": 2.9211, |
| "router_z_loss": 0.0, |
| "step": 950 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 4.9216, |
| "grad_norm": 0.39453125, |
| "learning_rate": 0.002953846153846154, |
| "loss": 2.8885, |
| "router_z_loss": 0.0, |
| "step": 960 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 4.9728, |
| "grad_norm": 0.34765625, |
| "learning_rate": 0.0029846153846153846, |
| "loss": 2.8904, |
| "router_z_loss": 0.0, |
| "step": 970 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 5.0256, |
| "grad_norm": 0.26953125, |
| "learning_rate": 0.003, |
| "loss": 3.1845, |
| "router_z_loss": 0.0, |
| "step": 980 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 5.0768, |
| "grad_norm": 0.25390625, |
| "learning_rate": 0.003, |
| "loss": 2.8775, |
| "router_z_loss": 0.0, |
| "step": 990 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 5.128, |
| "grad_norm": 0.26953125, |
| "learning_rate": 0.003, |
| "loss": 2.8545, |
| "router_z_loss": 0.0, |
| "step": 1000 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 5.1792, |
| "grad_norm": 0.265625, |
| "learning_rate": 0.003, |
| "loss": 2.8552, |
| "router_z_loss": 0.0, |
| "step": 1010 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 5.2304, |
| "grad_norm": 0.279296875, |
| "learning_rate": 0.003, |
| "loss": 2.8244, |
| "router_z_loss": 0.0, |
| "step": 1020 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 5.2816, |
| "grad_norm": 0.30078125, |
| "learning_rate": 0.003, |
| "loss": 2.836, |
| "router_z_loss": 0.0, |
| "step": 1030 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 5.3328, |
| "grad_norm": 0.2265625, |
| "learning_rate": 0.003, |
| "loss": 2.8203, |
| "router_z_loss": 0.0, |
| "step": 1040 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 5.384, |
| "grad_norm": 0.330078125, |
| "learning_rate": 0.003, |
| "loss": 2.8179, |
| "router_z_loss": 0.0, |
| "step": 1050 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 5.4352, |
| "grad_norm": 0.375, |
| "learning_rate": 0.003, |
| "loss": 2.8163, |
| "router_z_loss": 0.0, |
| "step": 1060 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 5.4864, |
| "grad_norm": 0.2138671875, |
| "learning_rate": 0.003, |
| "loss": 2.8234, |
| "router_z_loss": 0.0, |
| "step": 1070 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 5.5376, |
| "grad_norm": 0.234375, |
| "learning_rate": 0.003, |
| "loss": 2.7997, |
| "router_z_loss": 0.0, |
| "step": 1080 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 5.5888, |
| "grad_norm": 0.244140625, |
| "learning_rate": 0.003, |
| "loss": 2.7923, |
| "router_z_loss": 0.0, |
| "step": 1090 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 5.64, |
| "grad_norm": 0.2197265625, |
| "learning_rate": 0.003, |
| "loss": 2.7918, |
| "router_z_loss": 0.0, |
| "step": 1100 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 5.6912, |
| "grad_norm": 0.2353515625, |
| "learning_rate": 0.003, |
| "loss": 2.7794, |
| "router_z_loss": 0.0, |
| "step": 1110 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 5.7424, |
| "grad_norm": 0.361328125, |
| "learning_rate": 0.003, |
| "loss": 2.7793, |
| "router_z_loss": 0.0, |
| "step": 1120 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 5.7936, |
| "grad_norm": 0.20703125, |
| "learning_rate": 0.003, |
| "loss": 2.793, |
| "router_z_loss": 0.0, |
| "step": 1130 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 5.8448, |
| "grad_norm": 0.2060546875, |
| "learning_rate": 0.003, |
| "loss": 2.7649, |
| "router_z_loss": 0.0, |
| "step": 1140 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 5.896, |
| "grad_norm": 0.26171875, |
| "learning_rate": 0.003, |
| "loss": 2.7741, |
| "router_z_loss": 0.0, |
| "step": 1150 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 5.9472000000000005, |
| "grad_norm": 0.251953125, |
| "learning_rate": 0.003, |
| "loss": 2.7515, |
| "router_z_loss": 0.0, |
| "step": 1160 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 5.9984, |
| "grad_norm": 0.19921875, |
| "learning_rate": 0.003, |
| "loss": 2.7619, |
| "router_z_loss": 0.0, |
| "step": 1170 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 6.0512, |
| "grad_norm": 0.2421875, |
| "learning_rate": 0.003, |
| "loss": 3.0268, |
| "router_z_loss": 0.0, |
| "step": 1180 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 6.1024, |
| "grad_norm": 0.341796875, |
| "learning_rate": 0.003, |
| "loss": 2.7404, |
| "router_z_loss": 0.0, |
| "step": 1190 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 6.1536, |
| "grad_norm": 0.2333984375, |
| "learning_rate": 0.003, |
| "loss": 2.7276, |
| "router_z_loss": 0.0, |
| "step": 1200 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 6.2048, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.003, |
| "loss": 2.6999, |
| "router_z_loss": 0.0, |
| "step": 1210 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 6.256, |
| "grad_norm": 0.353515625, |
| "learning_rate": 0.003, |
| "loss": 2.7228, |
| "router_z_loss": 0.0, |
| "step": 1220 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 6.3072, |
| "grad_norm": 0.22265625, |
| "learning_rate": 0.003, |
| "loss": 2.7162, |
| "router_z_loss": 0.0, |
| "step": 1230 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 6.3584, |
| "grad_norm": 0.419921875, |
| "learning_rate": 0.003, |
| "loss": 2.7036, |
| "router_z_loss": 0.0, |
| "step": 1240 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 6.4096, |
| "grad_norm": 0.201171875, |
| "learning_rate": 0.003, |
| "loss": 2.7148, |
| "router_z_loss": 0.0, |
| "step": 1250 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 6.4608, |
| "grad_norm": 0.21875, |
| "learning_rate": 0.003, |
| "loss": 2.6904, |
| "router_z_loss": 0.0, |
| "step": 1260 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 6.5120000000000005, |
| "grad_norm": 0.298828125, |
| "learning_rate": 0.003, |
| "loss": 2.6952, |
| "router_z_loss": 0.0, |
| "step": 1270 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 6.5632, |
| "grad_norm": 0.2265625, |
| "learning_rate": 0.003, |
| "loss": 2.6898, |
| "router_z_loss": 0.0, |
| "step": 1280 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 6.6144, |
| "grad_norm": 0.2041015625, |
| "learning_rate": 0.003, |
| "loss": 2.6835, |
| "router_z_loss": 0.0, |
| "step": 1290 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 6.6655999999999995, |
| "grad_norm": 0.240234375, |
| "learning_rate": 0.003, |
| "loss": 2.6932, |
| "router_z_loss": 0.0, |
| "step": 1300 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 6.7168, |
| "grad_norm": 0.2119140625, |
| "learning_rate": 0.003, |
| "loss": 2.6754, |
| "router_z_loss": 0.0, |
| "step": 1310 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 6.768, |
| "grad_norm": 0.263671875, |
| "learning_rate": 0.003, |
| "loss": 2.6842, |
| "router_z_loss": 0.0, |
| "step": 1320 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 6.8192, |
| "grad_norm": 0.2255859375, |
| "learning_rate": 0.003, |
| "loss": 2.6741, |
| "router_z_loss": 0.0, |
| "step": 1330 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 6.8704, |
| "grad_norm": 0.2412109375, |
| "learning_rate": 0.003, |
| "loss": 2.6605, |
| "router_z_loss": 0.0, |
| "step": 1340 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 6.9216, |
| "grad_norm": 0.2578125, |
| "learning_rate": 0.003, |
| "loss": 2.6674, |
| "router_z_loss": 0.0, |
| "step": 1350 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 6.9728, |
| "grad_norm": 0.26171875, |
| "learning_rate": 0.003, |
| "loss": 2.6727, |
| "router_z_loss": 0.0, |
| "step": 1360 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 7.0256, |
| "grad_norm": 0.275390625, |
| "learning_rate": 0.003, |
| "loss": 2.9389, |
| "router_z_loss": 0.0, |
| "step": 1370 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 7.0768, |
| "grad_norm": 0.1962890625, |
| "learning_rate": 0.003, |
| "loss": 2.6637, |
| "router_z_loss": 0.0, |
| "step": 1380 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 7.128, |
| "grad_norm": 0.28125, |
| "learning_rate": 0.003, |
| "loss": 2.6444, |
| "router_z_loss": 0.0, |
| "step": 1390 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 7.1792, |
| "grad_norm": 0.2314453125, |
| "learning_rate": 0.003, |
| "loss": 2.6353, |
| "router_z_loss": 0.0, |
| "step": 1400 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 7.2304, |
| "grad_norm": 0.2109375, |
| "learning_rate": 0.003, |
| "loss": 2.631, |
| "router_z_loss": 0.0, |
| "step": 1410 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 7.2816, |
| "grad_norm": 0.2294921875, |
| "learning_rate": 0.003, |
| "loss": 2.6191, |
| "router_z_loss": 0.0, |
| "step": 1420 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 7.3328, |
| "grad_norm": 0.2353515625, |
| "learning_rate": 0.003, |
| "loss": 2.627, |
| "router_z_loss": 0.0, |
| "step": 1430 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 7.384, |
| "grad_norm": 0.251953125, |
| "learning_rate": 0.003, |
| "loss": 2.6287, |
| "router_z_loss": 0.0, |
| "step": 1440 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 7.4352, |
| "grad_norm": 0.267578125, |
| "learning_rate": 0.003, |
| "loss": 2.6258, |
| "router_z_loss": 0.0, |
| "step": 1450 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 7.4864, |
| "grad_norm": 0.26171875, |
| "learning_rate": 0.003, |
| "loss": 2.6219, |
| "router_z_loss": 0.0, |
| "step": 1460 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 7.5376, |
| "grad_norm": 0.251953125, |
| "learning_rate": 0.003, |
| "loss": 2.6155, |
| "router_z_loss": 0.0, |
| "step": 1470 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 7.5888, |
| "grad_norm": 0.2451171875, |
| "learning_rate": 0.003, |
| "loss": 2.6125, |
| "router_z_loss": 0.0, |
| "step": 1480 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 7.64, |
| "grad_norm": 0.2138671875, |
| "learning_rate": 0.003, |
| "loss": 2.616, |
| "router_z_loss": 0.0, |
| "step": 1490 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 7.6912, |
| "grad_norm": 0.255859375, |
| "learning_rate": 0.003, |
| "loss": 2.6035, |
| "router_z_loss": 0.0, |
| "step": 1500 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 7.7424, |
| "grad_norm": 0.267578125, |
| "learning_rate": 0.003, |
| "loss": 2.6112, |
| "router_z_loss": 0.0, |
| "step": 1510 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 7.7936, |
| "grad_norm": 0.2431640625, |
| "learning_rate": 0.003, |
| "loss": 2.6033, |
| "router_z_loss": 0.0, |
| "step": 1520 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 7.8448, |
| "grad_norm": 0.2255859375, |
| "learning_rate": 0.003, |
| "loss": 2.5985, |
| "router_z_loss": 0.0, |
| "step": 1530 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 7.896, |
| "grad_norm": 0.21875, |
| "learning_rate": 0.003, |
| "loss": 2.6088, |
| "router_z_loss": 0.0, |
| "step": 1540 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 7.9472000000000005, |
| "grad_norm": 0.2119140625, |
| "learning_rate": 0.003, |
| "loss": 2.5988, |
| "router_z_loss": 0.0, |
| "step": 1550 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 7.9984, |
| "grad_norm": 0.251953125, |
| "learning_rate": 0.003, |
| "loss": 2.5964, |
| "router_z_loss": 0.0, |
| "step": 1560 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 8.0512, |
| "grad_norm": 0.21484375, |
| "learning_rate": 0.003, |
| "loss": 2.8688, |
| "router_z_loss": 0.0, |
| "step": 1570 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 8.1024, |
| "grad_norm": 0.1923828125, |
| "learning_rate": 0.003, |
| "loss": 2.5877, |
| "router_z_loss": 0.0, |
| "step": 1580 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 8.1536, |
| "grad_norm": 0.373046875, |
| "learning_rate": 0.003, |
| "loss": 2.5778, |
| "router_z_loss": 0.0, |
| "step": 1590 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 8.2048, |
| "grad_norm": 0.251953125, |
| "learning_rate": 0.003, |
| "loss": 2.5813, |
| "router_z_loss": 0.0, |
| "step": 1600 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 8.256, |
| "grad_norm": 0.2216796875, |
| "learning_rate": 0.003, |
| "loss": 2.5646, |
| "router_z_loss": 0.0, |
| "step": 1610 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 8.3072, |
| "grad_norm": 0.2138671875, |
| "learning_rate": 0.003, |
| "loss": 2.5666, |
| "router_z_loss": 0.0, |
| "step": 1620 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 8.3584, |
| "grad_norm": 0.244140625, |
| "learning_rate": 0.003, |
| "loss": 2.5681, |
| "router_z_loss": 0.0, |
| "step": 1630 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 8.4096, |
| "grad_norm": 0.2294921875, |
| "learning_rate": 0.003, |
| "loss": 2.5768, |
| "router_z_loss": 0.0, |
| "step": 1640 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 8.4608, |
| "grad_norm": 0.255859375, |
| "learning_rate": 0.003, |
| "loss": 2.5612, |
| "router_z_loss": 0.0, |
| "step": 1650 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 8.512, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.003, |
| "loss": 2.562, |
| "router_z_loss": 0.0, |
| "step": 1660 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 8.5632, |
| "grad_norm": 0.232421875, |
| "learning_rate": 0.003, |
| "loss": 2.571, |
| "router_z_loss": 0.0, |
| "step": 1670 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 8.6144, |
| "grad_norm": 0.21875, |
| "learning_rate": 0.003, |
| "loss": 2.5554, |
| "router_z_loss": 0.0, |
| "step": 1680 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 8.6656, |
| "grad_norm": 0.263671875, |
| "learning_rate": 0.003, |
| "loss": 2.5633, |
| "router_z_loss": 0.0, |
| "step": 1690 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 8.7168, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.003, |
| "loss": 2.5639, |
| "router_z_loss": 0.0, |
| "step": 1700 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 8.768, |
| "grad_norm": 0.201171875, |
| "learning_rate": 0.003, |
| "loss": 2.546, |
| "router_z_loss": 0.0, |
| "step": 1710 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 8.8192, |
| "grad_norm": 0.2314453125, |
| "learning_rate": 0.003, |
| "loss": 2.5578, |
| "router_z_loss": 0.0, |
| "step": 1720 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 8.8704, |
| "grad_norm": 0.234375, |
| "learning_rate": 0.003, |
| "loss": 2.5567, |
| "router_z_loss": 0.0, |
| "step": 1730 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 8.9216, |
| "grad_norm": 0.208984375, |
| "learning_rate": 0.003, |
| "loss": 2.5517, |
| "router_z_loss": 0.0, |
| "step": 1740 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 8.9728, |
| "grad_norm": 0.232421875, |
| "learning_rate": 0.003, |
| "loss": 2.5523, |
| "router_z_loss": 0.0, |
| "step": 1750 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 9.0256, |
| "grad_norm": 0.2890625, |
| "learning_rate": 0.003, |
| "loss": 2.816, |
| "router_z_loss": 0.0, |
| "step": 1760 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 9.0768, |
| "grad_norm": 0.1884765625, |
| "learning_rate": 0.003, |
| "loss": 2.5529, |
| "router_z_loss": 0.0, |
| "step": 1770 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 9.128, |
| "grad_norm": 0.244140625, |
| "learning_rate": 0.003, |
| "loss": 2.5302, |
| "router_z_loss": 0.0, |
| "step": 1780 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 9.1792, |
| "grad_norm": 0.28125, |
| "learning_rate": 0.003, |
| "loss": 2.526, |
| "router_z_loss": 0.0, |
| "step": 1790 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 9.2304, |
| "grad_norm": 0.2412109375, |
| "learning_rate": 0.003, |
| "loss": 2.5318, |
| "router_z_loss": 0.0, |
| "step": 1800 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 9.2816, |
| "grad_norm": 0.2138671875, |
| "learning_rate": 0.003, |
| "loss": 2.5167, |
| "router_z_loss": 0.0, |
| "step": 1810 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 9.3328, |
| "grad_norm": 0.2373046875, |
| "learning_rate": 0.003, |
| "loss": 2.5291, |
| "router_z_loss": 0.0, |
| "step": 1820 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 9.384, |
| "grad_norm": 0.2265625, |
| "learning_rate": 0.003, |
| "loss": 2.5245, |
| "router_z_loss": 0.0, |
| "step": 1830 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 9.4352, |
| "grad_norm": 0.29296875, |
| "learning_rate": 0.003, |
| "loss": 2.5227, |
| "router_z_loss": 0.0, |
| "step": 1840 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 9.4864, |
| "grad_norm": 0.2197265625, |
| "learning_rate": 0.003, |
| "loss": 2.5194, |
| "router_z_loss": 0.0, |
| "step": 1850 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 9.5376, |
| "grad_norm": 0.2412109375, |
| "learning_rate": 0.003, |
| "loss": 2.5335, |
| "router_z_loss": 0.0, |
| "step": 1860 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 9.588799999999999, |
| "grad_norm": 0.2373046875, |
| "learning_rate": 0.003, |
| "loss": 2.5289, |
| "router_z_loss": 0.0, |
| "step": 1870 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 9.64, |
| "grad_norm": 0.2197265625, |
| "learning_rate": 0.003, |
| "loss": 2.5138, |
| "router_z_loss": 0.0, |
| "step": 1880 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 9.6912, |
| "grad_norm": 0.2255859375, |
| "learning_rate": 0.003, |
| "loss": 2.5216, |
| "router_z_loss": 0.0, |
| "step": 1890 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 9.7424, |
| "grad_norm": 0.271484375, |
| "learning_rate": 0.003, |
| "loss": 2.5238, |
| "router_z_loss": 0.0, |
| "step": 1900 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 9.7936, |
| "grad_norm": 0.2197265625, |
| "learning_rate": 0.003, |
| "loss": 2.5161, |
| "router_z_loss": 0.0, |
| "step": 1910 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 9.8448, |
| "grad_norm": 0.1953125, |
| "learning_rate": 0.003, |
| "loss": 2.5058, |
| "router_z_loss": 0.0, |
| "step": 1920 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 9.896, |
| "grad_norm": 0.318359375, |
| "learning_rate": 0.003, |
| "loss": 2.508, |
| "router_z_loss": 0.0, |
| "step": 1930 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 9.9472, |
| "grad_norm": 0.228515625, |
| "learning_rate": 0.003, |
| "loss": 2.5147, |
| "router_z_loss": 0.0, |
| "step": 1940 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 9.9984, |
| "grad_norm": 0.232421875, |
| "learning_rate": 0.003, |
| "loss": 2.5169, |
| "router_z_loss": 0.0, |
| "step": 1950 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 10.0512, |
| "grad_norm": 0.228515625, |
| "learning_rate": 0.003, |
| "loss": 2.7845, |
| "router_z_loss": 0.0, |
| "step": 1960 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 10.1024, |
| "grad_norm": 0.1884765625, |
| "learning_rate": 0.003, |
| "loss": 2.5004, |
| "router_z_loss": 0.0, |
| "step": 1970 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 10.1536, |
| "grad_norm": 0.2001953125, |
| "learning_rate": 0.003, |
| "loss": 2.4885, |
| "router_z_loss": 0.0, |
| "step": 1980 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 10.2048, |
| "grad_norm": 0.21484375, |
| "learning_rate": 0.003, |
| "loss": 2.4883, |
| "router_z_loss": 0.0, |
| "step": 1990 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 10.256, |
| "grad_norm": 0.271484375, |
| "learning_rate": 0.003, |
| "loss": 2.4822, |
| "router_z_loss": 0.0, |
| "step": 2000 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 10.3072, |
| "grad_norm": 0.25, |
| "learning_rate": 0.003, |
| "loss": 2.4836, |
| "router_z_loss": 0.0, |
| "step": 2010 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 10.3584, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.003, |
| "loss": 2.4835, |
| "router_z_loss": 0.0, |
| "step": 2020 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 10.4096, |
| "grad_norm": 0.25390625, |
| "learning_rate": 0.003, |
| "loss": 2.488, |
| "router_z_loss": 0.0, |
| "step": 2030 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 10.4608, |
| "grad_norm": 0.25390625, |
| "learning_rate": 0.003, |
| "loss": 2.4858, |
| "router_z_loss": 0.0, |
| "step": 2040 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 10.512, |
| "grad_norm": 0.2294921875, |
| "learning_rate": 0.003, |
| "loss": 2.4848, |
| "router_z_loss": 0.0, |
| "step": 2050 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 10.5632, |
| "grad_norm": 0.23828125, |
| "learning_rate": 0.003, |
| "loss": 2.4799, |
| "router_z_loss": 0.0, |
| "step": 2060 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 10.6144, |
| "grad_norm": 0.263671875, |
| "learning_rate": 0.003, |
| "loss": 2.4868, |
| "router_z_loss": 0.0, |
| "step": 2070 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 10.6656, |
| "grad_norm": 0.234375, |
| "learning_rate": 0.003, |
| "loss": 2.4811, |
| "router_z_loss": 0.0, |
| "step": 2080 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 10.7168, |
| "grad_norm": 0.294921875, |
| "learning_rate": 0.003, |
| "loss": 2.4766, |
| "router_z_loss": 0.0, |
| "step": 2090 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 10.768, |
| "grad_norm": 0.2451171875, |
| "learning_rate": 0.003, |
| "loss": 2.4846, |
| "router_z_loss": 0.0, |
| "step": 2100 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 10.8192, |
| "grad_norm": 0.212890625, |
| "learning_rate": 0.003, |
| "loss": 2.4844, |
| "router_z_loss": 0.0, |
| "step": 2110 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 10.8704, |
| "grad_norm": 0.330078125, |
| "learning_rate": 0.003, |
| "loss": 2.4765, |
| "router_z_loss": 0.0, |
| "step": 2120 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 10.9216, |
| "grad_norm": 0.21484375, |
| "learning_rate": 0.003, |
| "loss": 2.4752, |
| "router_z_loss": 0.0, |
| "step": 2130 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 10.9728, |
| "grad_norm": 0.2158203125, |
| "learning_rate": 0.003, |
| "loss": 2.4769, |
| "router_z_loss": 0.0, |
| "step": 2140 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 11.0256, |
| "grad_norm": 0.376953125, |
| "learning_rate": 0.003, |
| "loss": 2.733, |
| "router_z_loss": 0.0, |
| "step": 2150 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 11.0768, |
| "grad_norm": 0.21484375, |
| "learning_rate": 0.003, |
| "loss": 2.4789, |
| "router_z_loss": 0.0, |
| "step": 2160 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 11.128, |
| "grad_norm": 0.1982421875, |
| "learning_rate": 0.003, |
| "loss": 2.4601, |
| "router_z_loss": 0.0, |
| "step": 2170 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 11.1792, |
| "grad_norm": 0.2119140625, |
| "learning_rate": 0.003, |
| "loss": 2.4514, |
| "router_z_loss": 0.0, |
| "step": 2180 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 11.2304, |
| "grad_norm": 0.2412109375, |
| "learning_rate": 0.003, |
| "loss": 2.4503, |
| "router_z_loss": 0.0, |
| "step": 2190 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 11.2816, |
| "grad_norm": 0.27734375, |
| "learning_rate": 0.003, |
| "loss": 2.4582, |
| "router_z_loss": 0.0, |
| "step": 2200 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 11.3328, |
| "grad_norm": 0.26171875, |
| "learning_rate": 0.003, |
| "loss": 2.4509, |
| "router_z_loss": 0.0, |
| "step": 2210 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 11.384, |
| "grad_norm": 0.2060546875, |
| "learning_rate": 0.003, |
| "loss": 2.4592, |
| "router_z_loss": 0.0, |
| "step": 2220 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 11.4352, |
| "grad_norm": 0.28515625, |
| "learning_rate": 0.003, |
| "loss": 2.453, |
| "router_z_loss": 0.0, |
| "step": 2230 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 11.4864, |
| "grad_norm": 0.2265625, |
| "learning_rate": 0.003, |
| "loss": 2.4599, |
| "router_z_loss": 0.0, |
| "step": 2240 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 11.5376, |
| "grad_norm": 0.255859375, |
| "learning_rate": 0.003, |
| "loss": 2.4608, |
| "router_z_loss": 0.0, |
| "step": 2250 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 11.588799999999999, |
| "grad_norm": 0.2021484375, |
| "learning_rate": 0.003, |
| "loss": 2.4552, |
| "router_z_loss": 0.0, |
| "step": 2260 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 11.64, |
| "grad_norm": 0.2275390625, |
| "learning_rate": 0.003, |
| "loss": 2.4493, |
| "router_z_loss": 0.0, |
| "step": 2270 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 11.6912, |
| "grad_norm": 0.2421875, |
| "learning_rate": 0.003, |
| "loss": 2.4593, |
| "router_z_loss": 0.0, |
| "step": 2280 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 11.7424, |
| "grad_norm": 0.2080078125, |
| "learning_rate": 0.003, |
| "loss": 2.4547, |
| "router_z_loss": 0.0, |
| "step": 2290 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 11.7936, |
| "grad_norm": 0.265625, |
| "learning_rate": 0.003, |
| "loss": 2.4519, |
| "router_z_loss": 0.0, |
| "step": 2300 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 11.8448, |
| "grad_norm": 0.20703125, |
| "learning_rate": 0.003, |
| "loss": 2.4577, |
| "router_z_loss": 0.0, |
| "step": 2310 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 11.896, |
| "grad_norm": 0.18359375, |
| "learning_rate": 0.003, |
| "loss": 2.4436, |
| "router_z_loss": 0.0, |
| "step": 2320 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 11.9472, |
| "grad_norm": 0.2255859375, |
| "learning_rate": 0.003, |
| "loss": 2.4479, |
| "router_z_loss": 0.0, |
| "step": 2330 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 11.9984, |
| "grad_norm": 0.2578125, |
| "learning_rate": 0.003, |
| "loss": 2.4643, |
| "router_z_loss": 0.0, |
| "step": 2340 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 12.0512, |
| "grad_norm": 0.236328125, |
| "learning_rate": 0.003, |
| "loss": 2.7028, |
| "router_z_loss": 0.0, |
| "step": 2350 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 12.1024, |
| "grad_norm": 0.1796875, |
| "learning_rate": 0.003, |
| "loss": 2.4387, |
| "router_z_loss": 0.0, |
| "step": 2360 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 12.1536, |
| "grad_norm": 0.2578125, |
| "learning_rate": 0.003, |
| "loss": 2.4258, |
| "router_z_loss": 0.0, |
| "step": 2370 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 12.2048, |
| "grad_norm": 0.240234375, |
| "learning_rate": 0.003, |
| "loss": 2.4298, |
| "router_z_loss": 0.0, |
| "step": 2380 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 12.256, |
| "grad_norm": 0.2275390625, |
| "learning_rate": 0.003, |
| "loss": 2.4336, |
| "router_z_loss": 0.0, |
| "step": 2390 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 12.3072, |
| "grad_norm": 0.2021484375, |
| "learning_rate": 0.003, |
| "loss": 2.4322, |
| "router_z_loss": 0.0, |
| "step": 2400 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 12.3584, |
| "grad_norm": 0.25, |
| "learning_rate": 0.003, |
| "loss": 2.4283, |
| "router_z_loss": 0.0, |
| "step": 2410 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 12.4096, |
| "grad_norm": 0.228515625, |
| "learning_rate": 0.003, |
| "loss": 2.4257, |
| "router_z_loss": 0.0, |
| "step": 2420 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 12.4608, |
| "grad_norm": 0.216796875, |
| "learning_rate": 0.003, |
| "loss": 2.4435, |
| "router_z_loss": 0.0, |
| "step": 2430 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 12.512, |
| "grad_norm": 0.2470703125, |
| "learning_rate": 0.003, |
| "loss": 2.4284, |
| "router_z_loss": 0.0, |
| "step": 2440 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 12.5632, |
| "grad_norm": 0.2294921875, |
| "learning_rate": 0.003, |
| "loss": 2.4325, |
| "router_z_loss": 0.0, |
| "step": 2450 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 12.6144, |
| "grad_norm": 0.201171875, |
| "learning_rate": 0.003, |
| "loss": 2.4266, |
| "router_z_loss": 0.0, |
| "step": 2460 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 12.6656, |
| "grad_norm": 0.25390625, |
| "learning_rate": 0.003, |
| "loss": 2.4236, |
| "router_z_loss": 0.0, |
| "step": 2470 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 12.7168, |
| "grad_norm": 0.27734375, |
| "learning_rate": 0.003, |
| "loss": 2.4234, |
| "router_z_loss": 0.0, |
| "step": 2480 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 12.768, |
| "grad_norm": 0.236328125, |
| "learning_rate": 0.003, |
| "loss": 2.4256, |
| "router_z_loss": 0.0, |
| "step": 2490 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 12.8192, |
| "grad_norm": 0.205078125, |
| "learning_rate": 0.003, |
| "loss": 2.4317, |
| "router_z_loss": 0.0, |
| "step": 2500 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 12.8704, |
| "grad_norm": 0.24609375, |
| "learning_rate": 0.003, |
| "loss": 2.4258, |
| "router_z_loss": 0.0, |
| "step": 2510 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 12.9216, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.003, |
| "loss": 2.4369, |
| "router_z_loss": 0.0, |
| "step": 2520 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 12.9728, |
| "grad_norm": 0.203125, |
| "learning_rate": 0.003, |
| "loss": 2.427, |
| "router_z_loss": 0.0, |
| "step": 2530 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 13.0256, |
| "grad_norm": 0.46484375, |
| "learning_rate": 0.003, |
| "loss": 2.6928, |
| "router_z_loss": 0.0, |
| "step": 2540 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 13.0768, |
| "grad_norm": 0.1787109375, |
| "learning_rate": 0.003, |
| "loss": 2.4329, |
| "router_z_loss": 0.0, |
| "step": 2550 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 13.128, |
| "grad_norm": 0.18359375, |
| "learning_rate": 0.003, |
| "loss": 2.4122, |
| "router_z_loss": 0.0, |
| "step": 2560 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 13.1792, |
| "grad_norm": 0.23046875, |
| "learning_rate": 0.003, |
| "loss": 2.4012, |
| "router_z_loss": 0.0, |
| "step": 2570 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 13.2304, |
| "grad_norm": 0.2333984375, |
| "learning_rate": 0.003, |
| "loss": 2.4096, |
| "router_z_loss": 0.0, |
| "step": 2580 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 13.2816, |
| "grad_norm": 0.275390625, |
| "learning_rate": 0.003, |
| "loss": 2.4102, |
| "router_z_loss": 0.0, |
| "step": 2590 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 13.3328, |
| "grad_norm": 0.2197265625, |
| "learning_rate": 0.003, |
| "loss": 2.4039, |
| "router_z_loss": 0.0, |
| "step": 2600 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 13.384, |
| "grad_norm": 0.2255859375, |
| "learning_rate": 0.003, |
| "loss": 2.4086, |
| "router_z_loss": 0.0, |
| "step": 2610 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 13.4352, |
| "grad_norm": 0.228515625, |
| "learning_rate": 0.003, |
| "loss": 2.4067, |
| "router_z_loss": 0.0, |
| "step": 2620 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 13.4864, |
| "grad_norm": 0.2138671875, |
| "learning_rate": 0.003, |
| "loss": 2.4154, |
| "router_z_loss": 0.0, |
| "step": 2630 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 13.5376, |
| "grad_norm": 0.2265625, |
| "learning_rate": 0.003, |
| "loss": 2.4111, |
| "router_z_loss": 0.0, |
| "step": 2640 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 13.588799999999999, |
| "grad_norm": 0.28125, |
| "learning_rate": 0.003, |
| "loss": 2.4139, |
| "router_z_loss": 0.0, |
| "step": 2650 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 13.64, |
| "grad_norm": 0.283203125, |
| "learning_rate": 0.003, |
| "loss": 2.4113, |
| "router_z_loss": 0.0, |
| "step": 2660 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 13.6912, |
| "grad_norm": 0.271484375, |
| "learning_rate": 0.003, |
| "loss": 2.4106, |
| "router_z_loss": 0.0, |
| "step": 2670 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 13.7424, |
| "grad_norm": 0.216796875, |
| "learning_rate": 0.003, |
| "loss": 2.411, |
| "router_z_loss": 0.0, |
| "step": 2680 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 13.7936, |
| "grad_norm": 0.302734375, |
| "learning_rate": 0.003, |
| "loss": 2.4019, |
| "router_z_loss": 0.0, |
| "step": 2690 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 13.8448, |
| "grad_norm": 0.23046875, |
| "learning_rate": 0.003, |
| "loss": 2.4074, |
| "router_z_loss": 0.0, |
| "step": 2700 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 13.896, |
| "grad_norm": 0.2333984375, |
| "learning_rate": 0.003, |
| "loss": 2.4129, |
| "router_z_loss": 0.0, |
| "step": 2710 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 13.9472, |
| "grad_norm": 0.2265625, |
| "learning_rate": 0.003, |
| "loss": 2.3988, |
| "router_z_loss": 0.0, |
| "step": 2720 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 13.9984, |
| "grad_norm": 0.2216796875, |
| "learning_rate": 0.003, |
| "loss": 2.4069, |
| "router_z_loss": 0.0, |
| "step": 2730 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 14.0512, |
| "grad_norm": 0.2275390625, |
| "learning_rate": 0.003, |
| "loss": 2.6531, |
| "router_z_loss": 0.0, |
| "step": 2740 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 14.1024, |
| "grad_norm": 0.2060546875, |
| "learning_rate": 0.003, |
| "loss": 2.3899, |
| "router_z_loss": 0.0, |
| "step": 2750 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 14.1536, |
| "grad_norm": 0.20703125, |
| "learning_rate": 0.003, |
| "loss": 2.376, |
| "router_z_loss": 0.0, |
| "step": 2760 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 14.2048, |
| "grad_norm": 0.232421875, |
| "learning_rate": 0.003, |
| "loss": 2.3885, |
| "router_z_loss": 0.0, |
| "step": 2770 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 14.256, |
| "grad_norm": 0.24609375, |
| "learning_rate": 0.003, |
| "loss": 2.3924, |
| "router_z_loss": 0.0, |
| "step": 2780 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 14.3072, |
| "grad_norm": 0.2177734375, |
| "learning_rate": 0.003, |
| "loss": 2.3908, |
| "router_z_loss": 0.0, |
| "step": 2790 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 14.3584, |
| "grad_norm": 0.21484375, |
| "learning_rate": 0.003, |
| "loss": 2.3913, |
| "router_z_loss": 0.0, |
| "step": 2800 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 14.4096, |
| "grad_norm": 0.2451171875, |
| "learning_rate": 0.003, |
| "loss": 2.3867, |
| "router_z_loss": 0.0, |
| "step": 2810 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 14.4608, |
| "grad_norm": 0.2275390625, |
| "learning_rate": 0.003, |
| "loss": 2.3902, |
| "router_z_loss": 0.0, |
| "step": 2820 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 14.512, |
| "grad_norm": 0.263671875, |
| "learning_rate": 0.003, |
| "loss": 2.3953, |
| "router_z_loss": 0.0, |
| "step": 2830 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 14.5632, |
| "grad_norm": 0.2578125, |
| "learning_rate": 0.003, |
| "loss": 2.3928, |
| "router_z_loss": 0.0, |
| "step": 2840 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 14.6144, |
| "grad_norm": 0.234375, |
| "learning_rate": 0.003, |
| "loss": 2.3981, |
| "router_z_loss": 0.0, |
| "step": 2850 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 14.6656, |
| "grad_norm": 0.2138671875, |
| "learning_rate": 0.003, |
| "loss": 2.393, |
| "router_z_loss": 0.0, |
| "step": 2860 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 14.7168, |
| "grad_norm": 0.1845703125, |
| "learning_rate": 0.003, |
| "loss": 2.3859, |
| "router_z_loss": 0.0, |
| "step": 2870 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 14.768, |
| "grad_norm": 0.296875, |
| "learning_rate": 0.003, |
| "loss": 2.3941, |
| "router_z_loss": 0.0, |
| "step": 2880 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 14.8192, |
| "grad_norm": 0.2392578125, |
| "learning_rate": 0.003, |
| "loss": 2.3908, |
| "router_z_loss": 0.0, |
| "step": 2890 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 14.8704, |
| "grad_norm": 0.251953125, |
| "learning_rate": 0.003, |
| "loss": 2.392, |
| "router_z_loss": 0.0, |
| "step": 2900 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 14.9216, |
| "grad_norm": 0.18359375, |
| "learning_rate": 0.003, |
| "loss": 2.3869, |
| "router_z_loss": 0.0, |
| "step": 2910 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 14.9728, |
| "grad_norm": 0.306640625, |
| "learning_rate": 0.003, |
| "loss": 2.3867, |
| "router_z_loss": 0.0, |
| "step": 2920 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 15.0256, |
| "grad_norm": 0.341796875, |
| "learning_rate": 0.003, |
| "loss": 2.6412, |
| "router_z_loss": 0.0, |
| "step": 2930 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 15.0768, |
| "grad_norm": 0.1943359375, |
| "learning_rate": 0.003, |
| "loss": 2.3859, |
| "router_z_loss": 0.0, |
| "step": 2940 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 15.128, |
| "grad_norm": 0.2255859375, |
| "learning_rate": 0.003, |
| "loss": 2.3677, |
| "router_z_loss": 0.0, |
| "step": 2950 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 15.1792, |
| "grad_norm": 0.2158203125, |
| "learning_rate": 0.003, |
| "loss": 2.366, |
| "router_z_loss": 0.0, |
| "step": 2960 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 15.2304, |
| "grad_norm": 0.2451171875, |
| "learning_rate": 0.003, |
| "loss": 2.3677, |
| "router_z_loss": 0.0, |
| "step": 2970 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 15.2816, |
| "grad_norm": 0.2099609375, |
| "learning_rate": 0.003, |
| "loss": 2.3678, |
| "router_z_loss": 0.0, |
| "step": 2980 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 15.3328, |
| "grad_norm": 0.291015625, |
| "learning_rate": 0.003, |
| "loss": 2.3698, |
| "router_z_loss": 0.0, |
| "step": 2990 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 15.384, |
| "grad_norm": 0.2119140625, |
| "learning_rate": 0.003, |
| "loss": 2.3727, |
| "router_z_loss": 0.0, |
| "step": 3000 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 15.4352, |
| "grad_norm": 0.271484375, |
| "learning_rate": 0.003, |
| "loss": 2.3714, |
| "router_z_loss": 0.0, |
| "step": 3010 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 15.4864, |
| "grad_norm": 0.2099609375, |
| "learning_rate": 0.003, |
| "loss": 2.3734, |
| "router_z_loss": 0.0, |
| "step": 3020 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 15.5376, |
| "grad_norm": 0.271484375, |
| "learning_rate": 0.003, |
| "loss": 2.3697, |
| "router_z_loss": 0.0, |
| "step": 3030 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 15.588799999999999, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.003, |
| "loss": 2.3835, |
| "router_z_loss": 0.0, |
| "step": 3040 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 15.64, |
| "grad_norm": 0.2734375, |
| "learning_rate": 0.003, |
| "loss": 2.3769, |
| "router_z_loss": 0.0, |
| "step": 3050 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 15.6912, |
| "grad_norm": 0.2421875, |
| "learning_rate": 0.003, |
| "loss": 2.3736, |
| "router_z_loss": 0.0, |
| "step": 3060 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 15.7424, |
| "grad_norm": 0.2099609375, |
| "learning_rate": 0.003, |
| "loss": 2.3659, |
| "router_z_loss": 0.0, |
| "step": 3070 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 15.7936, |
| "grad_norm": 0.251953125, |
| "learning_rate": 0.003, |
| "loss": 2.3654, |
| "router_z_loss": 0.0, |
| "step": 3080 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 15.8448, |
| "grad_norm": 0.2275390625, |
| "learning_rate": 0.003, |
| "loss": 2.3832, |
| "router_z_loss": 0.0, |
| "step": 3090 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 15.896, |
| "grad_norm": 0.1943359375, |
| "learning_rate": 0.003, |
| "loss": 2.3691, |
| "router_z_loss": 0.0, |
| "step": 3100 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 15.9472, |
| "grad_norm": 0.287109375, |
| "learning_rate": 0.003, |
| "loss": 2.3783, |
| "router_z_loss": 0.0, |
| "step": 3110 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 15.9984, |
| "grad_norm": 0.2470703125, |
| "learning_rate": 0.003, |
| "loss": 2.371, |
| "router_z_loss": 0.0, |
| "step": 3120 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 16.0512, |
| "grad_norm": 0.24609375, |
| "learning_rate": 0.003, |
| "loss": 2.6061, |
| "router_z_loss": 0.0, |
| "step": 3130 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 16.1024, |
| "grad_norm": 0.2021484375, |
| "learning_rate": 0.003, |
| "loss": 2.3592, |
| "router_z_loss": 0.0, |
| "step": 3140 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 16.1536, |
| "grad_norm": 0.24609375, |
| "learning_rate": 0.003, |
| "loss": 2.3532, |
| "router_z_loss": 0.0, |
| "step": 3150 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 16.2048, |
| "grad_norm": 0.2392578125, |
| "learning_rate": 0.003, |
| "loss": 2.3589, |
| "router_z_loss": 0.0, |
| "step": 3160 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 16.256, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.003, |
| "loss": 2.3526, |
| "router_z_loss": 0.0, |
| "step": 3170 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 16.3072, |
| "grad_norm": 0.2734375, |
| "learning_rate": 0.003, |
| "loss": 2.3661, |
| "router_z_loss": 0.0, |
| "step": 3180 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 16.3584, |
| "grad_norm": 0.46875, |
| "learning_rate": 0.003, |
| "loss": 2.3769, |
| "router_z_loss": 0.0, |
| "step": 3190 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 16.4096, |
| "grad_norm": 0.279296875, |
| "learning_rate": 0.003, |
| "loss": 2.389, |
| "router_z_loss": 0.0, |
| "step": 3200 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 16.4608, |
| "grad_norm": 0.2197265625, |
| "learning_rate": 0.003, |
| "loss": 2.3585, |
| "router_z_loss": 0.0, |
| "step": 3210 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 16.512, |
| "grad_norm": 0.240234375, |
| "learning_rate": 0.003, |
| "loss": 2.3547, |
| "router_z_loss": 0.0, |
| "step": 3220 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 16.5632, |
| "grad_norm": 0.236328125, |
| "learning_rate": 0.003, |
| "loss": 2.36, |
| "router_z_loss": 0.0, |
| "step": 3230 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 16.6144, |
| "grad_norm": 0.232421875, |
| "learning_rate": 0.003, |
| "loss": 2.3523, |
| "router_z_loss": 0.0, |
| "step": 3240 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 16.6656, |
| "grad_norm": 0.248046875, |
| "learning_rate": 0.003, |
| "loss": 2.3612, |
| "router_z_loss": 0.0, |
| "step": 3250 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 16.7168, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.003, |
| "loss": 2.3605, |
| "router_z_loss": 0.0, |
| "step": 3260 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 16.768, |
| "grad_norm": 0.271484375, |
| "learning_rate": 0.003, |
| "loss": 2.3628, |
| "router_z_loss": 0.0, |
| "step": 3270 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 16.8192, |
| "grad_norm": 0.208984375, |
| "learning_rate": 0.003, |
| "loss": 2.3562, |
| "router_z_loss": 0.0, |
| "step": 3280 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 16.8704, |
| "grad_norm": 0.203125, |
| "learning_rate": 0.003, |
| "loss": 2.364, |
| "router_z_loss": 0.0, |
| "step": 3290 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 16.9216, |
| "grad_norm": 0.251953125, |
| "learning_rate": 0.003, |
| "loss": 2.3577, |
| "router_z_loss": 0.0, |
| "step": 3300 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 16.9728, |
| "grad_norm": 0.21484375, |
| "learning_rate": 0.003, |
| "loss": 2.3625, |
| "router_z_loss": 0.0, |
| "step": 3310 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 17.0256, |
| "grad_norm": 0.3125, |
| "learning_rate": 0.003, |
| "loss": 2.5916, |
| "router_z_loss": 0.0, |
| "step": 3320 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 17.0768, |
| "grad_norm": 0.1943359375, |
| "learning_rate": 0.003, |
| "loss": 2.3568, |
| "router_z_loss": 0.0, |
| "step": 3330 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 17.128, |
| "grad_norm": 0.2197265625, |
| "learning_rate": 0.003, |
| "loss": 2.3433, |
| "router_z_loss": 0.0, |
| "step": 3340 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 17.1792, |
| "grad_norm": 0.240234375, |
| "learning_rate": 0.003, |
| "loss": 2.3373, |
| "router_z_loss": 0.0, |
| "step": 3350 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 17.2304, |
| "grad_norm": 0.2412109375, |
| "learning_rate": 0.003, |
| "loss": 2.3511, |
| "router_z_loss": 0.0, |
| "step": 3360 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 17.2816, |
| "grad_norm": 0.2490234375, |
| "learning_rate": 0.003, |
| "loss": 2.3433, |
| "router_z_loss": 0.0, |
| "step": 3370 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 17.3328, |
| "grad_norm": 0.28515625, |
| "learning_rate": 0.003, |
| "loss": 2.3534, |
| "router_z_loss": 0.0, |
| "step": 3380 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 17.384, |
| "grad_norm": 0.22265625, |
| "learning_rate": 0.003, |
| "loss": 2.3519, |
| "router_z_loss": 0.0, |
| "step": 3390 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 17.4352, |
| "grad_norm": 0.19921875, |
| "learning_rate": 0.003, |
| "loss": 2.3326, |
| "router_z_loss": 0.0, |
| "step": 3400 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 17.4864, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.003, |
| "loss": 2.3422, |
| "router_z_loss": 0.0, |
| "step": 3410 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 17.5376, |
| "grad_norm": 0.30859375, |
| "learning_rate": 0.003, |
| "loss": 2.3433, |
| "router_z_loss": 0.0, |
| "step": 3420 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 17.5888, |
| "grad_norm": 0.2099609375, |
| "learning_rate": 0.003, |
| "loss": 2.3479, |
| "router_z_loss": 0.0, |
| "step": 3430 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 17.64, |
| "grad_norm": 0.23828125, |
| "learning_rate": 0.003, |
| "loss": 2.3461, |
| "router_z_loss": 0.0, |
| "step": 3440 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 17.6912, |
| "grad_norm": 0.283203125, |
| "learning_rate": 0.003, |
| "loss": 2.3472, |
| "router_z_loss": 0.0, |
| "step": 3450 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 17.7424, |
| "grad_norm": 0.23046875, |
| "learning_rate": 0.003, |
| "loss": 2.3466, |
| "router_z_loss": 0.0, |
| "step": 3460 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 17.7936, |
| "grad_norm": 0.26953125, |
| "learning_rate": 0.003, |
| "loss": 2.3429, |
| "router_z_loss": 0.0, |
| "step": 3470 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 17.8448, |
| "grad_norm": 0.28515625, |
| "learning_rate": 0.003, |
| "loss": 2.3371, |
| "router_z_loss": 0.0, |
| "step": 3480 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 17.896, |
| "grad_norm": 0.23046875, |
| "learning_rate": 0.003, |
| "loss": 2.3425, |
| "router_z_loss": 0.0, |
| "step": 3490 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 17.9472, |
| "grad_norm": 0.30078125, |
| "learning_rate": 0.003, |
| "loss": 2.3435, |
| "router_z_loss": 0.0, |
| "step": 3500 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 17.9984, |
| "grad_norm": 0.201171875, |
| "learning_rate": 0.003, |
| "loss": 2.346, |
| "router_z_loss": 0.0, |
| "step": 3510 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 18.0512, |
| "grad_norm": 0.2451171875, |
| "learning_rate": 0.003, |
| "loss": 2.5893, |
| "router_z_loss": 0.0, |
| "step": 3520 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 18.1024, |
| "grad_norm": 0.248046875, |
| "learning_rate": 0.003, |
| "loss": 2.3269, |
| "router_z_loss": 0.0, |
| "step": 3530 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 18.1536, |
| "grad_norm": 0.283203125, |
| "learning_rate": 0.003, |
| "loss": 2.3263, |
| "router_z_loss": 0.0, |
| "step": 3540 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 18.2048, |
| "grad_norm": 0.263671875, |
| "learning_rate": 0.003, |
| "loss": 2.3229, |
| "router_z_loss": 0.0, |
| "step": 3550 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 18.256, |
| "grad_norm": 0.21484375, |
| "learning_rate": 0.003, |
| "loss": 2.3294, |
| "router_z_loss": 0.0, |
| "step": 3560 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 18.3072, |
| "grad_norm": 0.220703125, |
| "learning_rate": 0.003, |
| "loss": 2.3249, |
| "router_z_loss": 0.0, |
| "step": 3570 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 18.3584, |
| "grad_norm": 0.251953125, |
| "learning_rate": 0.003, |
| "loss": 2.3299, |
| "router_z_loss": 0.0, |
| "step": 3580 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 18.4096, |
| "grad_norm": 0.2373046875, |
| "learning_rate": 0.003, |
| "loss": 2.3279, |
| "router_z_loss": 0.0, |
| "step": 3590 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 18.4608, |
| "grad_norm": 0.234375, |
| "learning_rate": 0.003, |
| "loss": 2.3334, |
| "router_z_loss": 0.0, |
| "step": 3600 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 18.512, |
| "grad_norm": 0.2333984375, |
| "learning_rate": 0.003, |
| "loss": 2.3275, |
| "router_z_loss": 0.0, |
| "step": 3610 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 18.5632, |
| "grad_norm": 0.287109375, |
| "learning_rate": 0.003, |
| "loss": 2.3335, |
| "router_z_loss": 0.0, |
| "step": 3620 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 18.6144, |
| "grad_norm": 0.224609375, |
| "learning_rate": 0.003, |
| "loss": 2.3309, |
| "router_z_loss": 0.0, |
| "step": 3630 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 18.6656, |
| "grad_norm": 0.232421875, |
| "learning_rate": 0.003, |
| "loss": 2.3376, |
| "router_z_loss": 0.0, |
| "step": 3640 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 18.7168, |
| "grad_norm": 0.30859375, |
| "learning_rate": 0.003, |
| "loss": 2.3377, |
| "router_z_loss": 0.0, |
| "step": 3650 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 18.768, |
| "grad_norm": 0.1904296875, |
| "learning_rate": 0.003, |
| "loss": 2.3373, |
| "router_z_loss": 0.0, |
| "step": 3660 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 18.8192, |
| "grad_norm": 0.203125, |
| "learning_rate": 0.003, |
| "loss": 2.3309, |
| "router_z_loss": 0.0, |
| "step": 3670 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 18.8704, |
| "grad_norm": 0.2578125, |
| "learning_rate": 0.003, |
| "loss": 2.3363, |
| "router_z_loss": 0.0, |
| "step": 3680 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 18.9216, |
| "grad_norm": 0.263671875, |
| "learning_rate": 0.003, |
| "loss": 2.3413, |
| "router_z_loss": 0.0, |
| "step": 3690 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 18.9728, |
| "grad_norm": 0.1953125, |
| "learning_rate": 0.003, |
| "loss": 2.3221, |
| "router_z_loss": 0.0, |
| "step": 3700 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 19.0256, |
| "grad_norm": 0.37109375, |
| "learning_rate": 0.003, |
| "loss": 2.5891, |
| "router_z_loss": 0.0, |
| "step": 3710 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 19.0768, |
| "grad_norm": 0.2080078125, |
| "learning_rate": 0.003, |
| "loss": 2.3331, |
| "router_z_loss": 0.0, |
| "step": 3720 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 19.128, |
| "grad_norm": 0.2099609375, |
| "learning_rate": 0.003, |
| "loss": 2.3133, |
| "router_z_loss": 0.0, |
| "step": 3730 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 19.1792, |
| "grad_norm": 0.21484375, |
| "learning_rate": 0.003, |
| "loss": 2.3156, |
| "router_z_loss": 0.0, |
| "step": 3740 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 19.2304, |
| "grad_norm": 0.1923828125, |
| "learning_rate": 0.003, |
| "loss": 2.3177, |
| "router_z_loss": 0.0, |
| "step": 3750 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 19.2816, |
| "grad_norm": 0.248046875, |
| "learning_rate": 0.003, |
| "loss": 2.3054, |
| "router_z_loss": 0.0, |
| "step": 3760 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 19.3328, |
| "grad_norm": 0.25390625, |
| "learning_rate": 0.003, |
| "loss": 2.3219, |
| "router_z_loss": 0.0, |
| "step": 3770 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 19.384, |
| "grad_norm": 0.19921875, |
| "learning_rate": 0.003, |
| "loss": 2.3164, |
| "router_z_loss": 0.0, |
| "step": 3780 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 19.4352, |
| "grad_norm": 0.248046875, |
| "learning_rate": 0.003, |
| "loss": 2.3115, |
| "router_z_loss": 0.0, |
| "step": 3790 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 19.4864, |
| "grad_norm": 0.248046875, |
| "learning_rate": 0.003, |
| "loss": 2.3204, |
| "router_z_loss": 0.0, |
| "step": 3800 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 19.5376, |
| "grad_norm": 0.255859375, |
| "learning_rate": 0.003, |
| "loss": 2.3235, |
| "router_z_loss": 0.0, |
| "step": 3810 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 19.5888, |
| "grad_norm": 0.240234375, |
| "learning_rate": 0.003, |
| "loss": 2.3196, |
| "router_z_loss": 0.0, |
| "step": 3820 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 19.64, |
| "grad_norm": 0.2119140625, |
| "learning_rate": 0.003, |
| "loss": 2.3186, |
| "router_z_loss": 0.0, |
| "step": 3830 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 19.6912, |
| "grad_norm": 0.251953125, |
| "learning_rate": 0.003, |
| "loss": 2.3255, |
| "router_z_loss": 0.0, |
| "step": 3840 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 19.7424, |
| "grad_norm": 0.2578125, |
| "learning_rate": 0.003, |
| "loss": 2.3227, |
| "router_z_loss": 0.0, |
| "step": 3850 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 19.7936, |
| "grad_norm": 0.25390625, |
| "learning_rate": 0.003, |
| "loss": 2.3226, |
| "router_z_loss": 0.0, |
| "step": 3860 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 19.8448, |
| "grad_norm": 0.2373046875, |
| "learning_rate": 0.003, |
| "loss": 2.321, |
| "router_z_loss": 0.0, |
| "step": 3870 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 19.896, |
| "grad_norm": 0.306640625, |
| "learning_rate": 0.003, |
| "loss": 2.3289, |
| "router_z_loss": 0.0, |
| "step": 3880 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 19.9472, |
| "grad_norm": 0.234375, |
| "learning_rate": 0.003, |
| "loss": 2.3259, |
| "router_z_loss": 0.0, |
| "step": 3890 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 19.9984, |
| "grad_norm": 0.208984375, |
| "learning_rate": 0.003, |
| "loss": 2.3186, |
| "router_z_loss": 0.0, |
| "step": 3900 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 20.0512, |
| "grad_norm": 0.263671875, |
| "learning_rate": 0.003, |
| "loss": 2.5579, |
| "router_z_loss": 0.0, |
| "step": 3910 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 20.1024, |
| "grad_norm": 0.2001953125, |
| "learning_rate": 0.003, |
| "loss": 2.3122, |
| "router_z_loss": 0.0, |
| "step": 3920 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 20.1536, |
| "grad_norm": 0.1962890625, |
| "learning_rate": 0.003, |
| "loss": 2.3019, |
| "router_z_loss": 0.0, |
| "step": 3930 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 20.2048, |
| "grad_norm": 0.267578125, |
| "learning_rate": 0.003, |
| "loss": 2.3079, |
| "router_z_loss": 0.0, |
| "step": 3940 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 20.256, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.003, |
| "loss": 2.3077, |
| "router_z_loss": 0.0, |
| "step": 3950 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 20.3072, |
| "grad_norm": 0.248046875, |
| "learning_rate": 0.003, |
| "loss": 2.3094, |
| "router_z_loss": 0.0, |
| "step": 3960 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 20.3584, |
| "grad_norm": 0.216796875, |
| "learning_rate": 0.003, |
| "loss": 2.3071, |
| "router_z_loss": 0.0, |
| "step": 3970 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 20.4096, |
| "grad_norm": 0.265625, |
| "learning_rate": 0.003, |
| "loss": 2.307, |
| "router_z_loss": 0.0, |
| "step": 3980 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 20.4608, |
| "grad_norm": 0.2392578125, |
| "learning_rate": 0.003, |
| "loss": 2.3117, |
| "router_z_loss": 0.0, |
| "step": 3990 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 20.512, |
| "grad_norm": 0.228515625, |
| "learning_rate": 0.003, |
| "loss": 2.305, |
| "router_z_loss": 0.0, |
| "step": 4000 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 20.5632, |
| "grad_norm": 0.3828125, |
| "learning_rate": 0.003, |
| "loss": 2.3096, |
| "router_z_loss": 0.0, |
| "step": 4010 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 20.6144, |
| "grad_norm": 0.279296875, |
| "learning_rate": 0.003, |
| "loss": 2.3166, |
| "router_z_loss": 0.0, |
| "step": 4020 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 20.6656, |
| "grad_norm": 0.216796875, |
| "learning_rate": 0.003, |
| "loss": 2.3105, |
| "router_z_loss": 0.0, |
| "step": 4030 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 20.7168, |
| "grad_norm": 0.2333984375, |
| "learning_rate": 0.003, |
| "loss": 2.3168, |
| "router_z_loss": 0.0, |
| "step": 4040 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 20.768, |
| "grad_norm": 0.203125, |
| "learning_rate": 0.003, |
| "loss": 2.3146, |
| "router_z_loss": 0.0, |
| "step": 4050 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 20.8192, |
| "grad_norm": 0.2451171875, |
| "learning_rate": 0.003, |
| "loss": 2.3069, |
| "router_z_loss": 0.0, |
| "step": 4060 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 20.8704, |
| "grad_norm": 0.2041015625, |
| "learning_rate": 0.003, |
| "loss": 2.3131, |
| "router_z_loss": 0.0, |
| "step": 4070 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 20.9216, |
| "grad_norm": 0.2451171875, |
| "learning_rate": 0.003, |
| "loss": 2.3062, |
| "router_z_loss": 0.0, |
| "step": 4080 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 20.9728, |
| "grad_norm": 0.25390625, |
| "learning_rate": 0.003, |
| "loss": 2.3078, |
| "router_z_loss": 0.0, |
| "step": 4090 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 21.0256, |
| "grad_norm": 0.29296875, |
| "learning_rate": 0.003, |
| "loss": 2.5517, |
| "router_z_loss": 0.0, |
| "step": 4100 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 21.0768, |
| "grad_norm": 0.189453125, |
| "learning_rate": 0.003, |
| "loss": 2.3115, |
| "router_z_loss": 0.0, |
| "step": 4110 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 21.128, |
| "grad_norm": 0.2216796875, |
| "learning_rate": 0.003, |
| "loss": 2.2996, |
| "router_z_loss": 0.0, |
| "step": 4120 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 21.1792, |
| "grad_norm": 0.306640625, |
| "learning_rate": 0.003, |
| "loss": 2.2861, |
| "router_z_loss": 0.0, |
| "step": 4130 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 21.2304, |
| "grad_norm": 0.212890625, |
| "learning_rate": 0.003, |
| "loss": 2.2964, |
| "router_z_loss": 0.0, |
| "step": 4140 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 21.2816, |
| "grad_norm": 0.330078125, |
| "learning_rate": 0.003, |
| "loss": 2.2974, |
| "router_z_loss": 0.0, |
| "step": 4150 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 21.3328, |
| "grad_norm": 0.255859375, |
| "learning_rate": 0.003, |
| "loss": 2.2996, |
| "router_z_loss": 0.0, |
| "step": 4160 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 21.384, |
| "grad_norm": 0.232421875, |
| "learning_rate": 0.003, |
| "loss": 2.2975, |
| "router_z_loss": 0.0, |
| "step": 4170 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 21.4352, |
| "grad_norm": 0.27734375, |
| "learning_rate": 0.003, |
| "loss": 2.2967, |
| "router_z_loss": 0.0, |
| "step": 4180 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 21.4864, |
| "grad_norm": 0.25390625, |
| "learning_rate": 0.003, |
| "loss": 2.3015, |
| "router_z_loss": 0.0, |
| "step": 4190 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 21.5376, |
| "grad_norm": 0.2431640625, |
| "learning_rate": 0.003, |
| "loss": 2.3039, |
| "router_z_loss": 0.0, |
| "step": 4200 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 21.5888, |
| "grad_norm": 0.26171875, |
| "learning_rate": 0.003, |
| "loss": 2.2986, |
| "router_z_loss": 0.0, |
| "step": 4210 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 21.64, |
| "grad_norm": 0.2236328125, |
| "learning_rate": 0.003, |
| "loss": 2.3019, |
| "router_z_loss": 0.0, |
| "step": 4220 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 21.6912, |
| "grad_norm": 0.384765625, |
| "learning_rate": 0.003, |
| "loss": 2.3001, |
| "router_z_loss": 0.0, |
| "step": 4230 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 21.7424, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.003, |
| "loss": 2.3052, |
| "router_z_loss": 0.0, |
| "step": 4240 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 21.7936, |
| "grad_norm": 0.2197265625, |
| "learning_rate": 0.003, |
| "loss": 2.3046, |
| "router_z_loss": 0.0, |
| "step": 4250 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 21.8448, |
| "grad_norm": 0.2001953125, |
| "learning_rate": 0.003, |
| "loss": 2.2952, |
| "router_z_loss": 0.0, |
| "step": 4260 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 21.896, |
| "grad_norm": 0.234375, |
| "learning_rate": 0.003, |
| "loss": 2.3044, |
| "router_z_loss": 0.0, |
| "step": 4270 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 21.9472, |
| "grad_norm": 0.23828125, |
| "learning_rate": 0.003, |
| "loss": 2.297, |
| "router_z_loss": 0.0, |
| "step": 4280 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 21.9984, |
| "grad_norm": 0.251953125, |
| "learning_rate": 0.003, |
| "loss": 2.3065, |
| "router_z_loss": 0.0, |
| "step": 4290 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 22.0512, |
| "grad_norm": 0.220703125, |
| "learning_rate": 0.003, |
| "loss": 2.5303, |
| "router_z_loss": 0.0, |
| "step": 4300 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 22.1024, |
| "grad_norm": 0.208984375, |
| "learning_rate": 0.003, |
| "loss": 2.2945, |
| "router_z_loss": 0.0, |
| "step": 4310 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 22.1536, |
| "grad_norm": 0.2001953125, |
| "learning_rate": 0.003, |
| "loss": 2.2929, |
| "router_z_loss": 0.0, |
| "step": 4320 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 22.2048, |
| "grad_norm": 0.1962890625, |
| "learning_rate": 0.003, |
| "loss": 2.2863, |
| "router_z_loss": 0.0, |
| "step": 4330 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 22.256, |
| "grad_norm": 0.28125, |
| "learning_rate": 0.003, |
| "loss": 2.2848, |
| "router_z_loss": 0.0, |
| "step": 4340 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 22.3072, |
| "grad_norm": 0.21484375, |
| "learning_rate": 0.003, |
| "loss": 2.2808, |
| "router_z_loss": 0.0, |
| "step": 4350 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 22.3584, |
| "grad_norm": 0.29296875, |
| "learning_rate": 0.003, |
| "loss": 2.295, |
| "router_z_loss": 0.0, |
| "step": 4360 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 22.4096, |
| "grad_norm": 0.216796875, |
| "learning_rate": 0.003, |
| "loss": 2.2885, |
| "router_z_loss": 0.0, |
| "step": 4370 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 22.4608, |
| "grad_norm": 0.244140625, |
| "learning_rate": 0.003, |
| "loss": 2.293, |
| "router_z_loss": 0.0, |
| "step": 4380 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 22.512, |
| "grad_norm": 0.3046875, |
| "learning_rate": 0.003, |
| "loss": 2.2869, |
| "router_z_loss": 0.0, |
| "step": 4390 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 22.5632, |
| "grad_norm": 0.240234375, |
| "learning_rate": 0.003, |
| "loss": 2.2914, |
| "router_z_loss": 0.0, |
| "step": 4400 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 22.6144, |
| "grad_norm": 0.228515625, |
| "learning_rate": 0.003, |
| "loss": 2.2923, |
| "router_z_loss": 0.0, |
| "step": 4410 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 22.6656, |
| "grad_norm": 0.2421875, |
| "learning_rate": 0.003, |
| "loss": 2.2918, |
| "router_z_loss": 0.0, |
| "step": 4420 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 22.7168, |
| "grad_norm": 0.365234375, |
| "learning_rate": 0.003, |
| "loss": 2.2946, |
| "router_z_loss": 0.0, |
| "step": 4430 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 22.768, |
| "grad_norm": 0.224609375, |
| "learning_rate": 0.003, |
| "loss": 2.2953, |
| "router_z_loss": 0.0, |
| "step": 4440 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 22.8192, |
| "grad_norm": 0.2294921875, |
| "learning_rate": 0.003, |
| "loss": 2.2919, |
| "router_z_loss": 0.0, |
| "step": 4450 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 22.8704, |
| "grad_norm": 0.37109375, |
| "learning_rate": 0.003, |
| "loss": 2.2963, |
| "router_z_loss": 0.0, |
| "step": 4460 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 22.9216, |
| "grad_norm": 0.244140625, |
| "learning_rate": 0.003, |
| "loss": 2.3011, |
| "router_z_loss": 0.0, |
| "step": 4470 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 22.9728, |
| "grad_norm": 0.2333984375, |
| "learning_rate": 0.003, |
| "loss": 2.288, |
| "router_z_loss": 0.0, |
| "step": 4480 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 23.0256, |
| "grad_norm": 0.341796875, |
| "learning_rate": 0.003, |
| "loss": 2.5194, |
| "router_z_loss": 0.0, |
| "step": 4490 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 23.0768, |
| "grad_norm": 0.2158203125, |
| "learning_rate": 0.003, |
| "loss": 2.29, |
| "router_z_loss": 0.0, |
| "step": 4500 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 23.128, |
| "grad_norm": 0.2470703125, |
| "learning_rate": 0.003, |
| "loss": 2.2739, |
| "router_z_loss": 0.0, |
| "step": 4510 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 23.1792, |
| "grad_norm": 0.2060546875, |
| "learning_rate": 0.003, |
| "loss": 2.2745, |
| "router_z_loss": 0.0, |
| "step": 4520 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 23.2304, |
| "grad_norm": 0.29296875, |
| "learning_rate": 0.003, |
| "loss": 2.2817, |
| "router_z_loss": 0.0, |
| "step": 4530 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 23.2816, |
| "grad_norm": 0.23828125, |
| "learning_rate": 0.003, |
| "loss": 2.2819, |
| "router_z_loss": 0.0, |
| "step": 4540 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 23.3328, |
| "grad_norm": 0.205078125, |
| "learning_rate": 0.003, |
| "loss": 2.2771, |
| "router_z_loss": 0.0, |
| "step": 4550 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 23.384, |
| "grad_norm": 0.25390625, |
| "learning_rate": 0.003, |
| "loss": 2.2759, |
| "router_z_loss": 0.0, |
| "step": 4560 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 23.4352, |
| "grad_norm": 0.2490234375, |
| "learning_rate": 0.003, |
| "loss": 2.2858, |
| "router_z_loss": 0.0, |
| "step": 4570 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 23.4864, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.003, |
| "loss": 2.2792, |
| "router_z_loss": 0.0, |
| "step": 4580 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 23.5376, |
| "grad_norm": 0.296875, |
| "learning_rate": 0.003, |
| "loss": 2.2787, |
| "router_z_loss": 0.0, |
| "step": 4590 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 23.5888, |
| "grad_norm": 0.23828125, |
| "learning_rate": 0.003, |
| "loss": 2.2848, |
| "router_z_loss": 0.0, |
| "step": 4600 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 23.64, |
| "grad_norm": 0.29296875, |
| "learning_rate": 0.003, |
| "loss": 2.2912, |
| "router_z_loss": 0.0, |
| "step": 4610 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 23.6912, |
| "grad_norm": 0.2197265625, |
| "learning_rate": 0.003, |
| "loss": 2.2804, |
| "router_z_loss": 0.0, |
| "step": 4620 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 23.7424, |
| "grad_norm": 0.2412109375, |
| "learning_rate": 0.003, |
| "loss": 2.2858, |
| "router_z_loss": 0.0, |
| "step": 4630 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 23.7936, |
| "grad_norm": 0.240234375, |
| "learning_rate": 0.003, |
| "loss": 2.276, |
| "router_z_loss": 0.0, |
| "step": 4640 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 23.8448, |
| "grad_norm": 0.25, |
| "learning_rate": 0.003, |
| "loss": 2.2804, |
| "router_z_loss": 0.0, |
| "step": 4650 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 23.896, |
| "grad_norm": 0.2138671875, |
| "learning_rate": 0.003, |
| "loss": 2.2768, |
| "router_z_loss": 0.0, |
| "step": 4660 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 23.9472, |
| "grad_norm": 0.240234375, |
| "learning_rate": 0.003, |
| "loss": 2.2873, |
| "router_z_loss": 0.0, |
| "step": 4670 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 23.9984, |
| "grad_norm": 0.2275390625, |
| "learning_rate": 0.003, |
| "loss": 2.2838, |
| "router_z_loss": 0.0, |
| "step": 4680 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 24.0512, |
| "grad_norm": 0.244140625, |
| "learning_rate": 0.003, |
| "loss": 2.5179, |
| "router_z_loss": 0.0, |
| "step": 4690 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 24.1024, |
| "grad_norm": 0.2060546875, |
| "learning_rate": 0.003, |
| "loss": 2.2732, |
| "router_z_loss": 0.0, |
| "step": 4700 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 24.1536, |
| "grad_norm": 0.2265625, |
| "learning_rate": 0.003, |
| "loss": 2.2682, |
| "router_z_loss": 0.0, |
| "step": 4710 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 24.2048, |
| "grad_norm": 0.2041015625, |
| "learning_rate": 0.003, |
| "loss": 2.2655, |
| "router_z_loss": 0.0, |
| "step": 4720 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 24.256, |
| "grad_norm": 0.21484375, |
| "learning_rate": 0.003, |
| "loss": 2.2717, |
| "router_z_loss": 0.0, |
| "step": 4730 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 24.3072, |
| "grad_norm": 0.2236328125, |
| "learning_rate": 0.003, |
| "loss": 2.2683, |
| "router_z_loss": 0.0, |
| "step": 4740 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 24.3584, |
| "grad_norm": 0.193359375, |
| "learning_rate": 0.003, |
| "loss": 2.2719, |
| "router_z_loss": 0.0, |
| "step": 4750 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 24.4096, |
| "grad_norm": 0.267578125, |
| "learning_rate": 0.003, |
| "loss": 2.2755, |
| "router_z_loss": 0.0, |
| "step": 4760 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 24.4608, |
| "grad_norm": 0.248046875, |
| "learning_rate": 0.003, |
| "loss": 2.2796, |
| "router_z_loss": 0.0, |
| "step": 4770 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 24.512, |
| "grad_norm": 0.341796875, |
| "learning_rate": 0.003, |
| "loss": 2.2727, |
| "router_z_loss": 0.0, |
| "step": 4780 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 24.5632, |
| "grad_norm": 0.2451171875, |
| "learning_rate": 0.003, |
| "loss": 2.28, |
| "router_z_loss": 0.0, |
| "step": 4790 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 24.6144, |
| "grad_norm": 0.291015625, |
| "learning_rate": 0.003, |
| "loss": 2.2707, |
| "router_z_loss": 0.0, |
| "step": 4800 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 24.6656, |
| "grad_norm": 0.1982421875, |
| "learning_rate": 0.003, |
| "loss": 2.2735, |
| "router_z_loss": 0.0, |
| "step": 4810 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 24.7168, |
| "grad_norm": 0.30078125, |
| "learning_rate": 0.003, |
| "loss": 2.2702, |
| "router_z_loss": 0.0, |
| "step": 4820 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 24.768, |
| "grad_norm": 0.2578125, |
| "learning_rate": 0.003, |
| "loss": 2.2765, |
| "router_z_loss": 0.0, |
| "step": 4830 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 24.8192, |
| "grad_norm": 0.353515625, |
| "learning_rate": 0.003, |
| "loss": 2.2724, |
| "router_z_loss": 0.0, |
| "step": 4840 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 24.8704, |
| "grad_norm": 0.220703125, |
| "learning_rate": 0.003, |
| "loss": 2.2781, |
| "router_z_loss": 0.0, |
| "step": 4850 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 24.9216, |
| "grad_norm": 0.2197265625, |
| "learning_rate": 0.003, |
| "loss": 2.2737, |
| "router_z_loss": 0.0, |
| "step": 4860 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 24.9728, |
| "grad_norm": 0.2275390625, |
| "learning_rate": 0.003, |
| "loss": 2.2763, |
| "router_z_loss": 0.0, |
| "step": 4870 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 25.0256, |
| "grad_norm": 0.30859375, |
| "learning_rate": 0.003, |
| "loss": 2.5145, |
| "router_z_loss": 0.0, |
| "step": 4880 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 25.0768, |
| "grad_norm": 0.2099609375, |
| "learning_rate": 0.003, |
| "loss": 2.273, |
| "router_z_loss": 0.0, |
| "step": 4890 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 25.128, |
| "grad_norm": 0.2392578125, |
| "learning_rate": 0.003, |
| "loss": 2.2639, |
| "router_z_loss": 0.0, |
| "step": 4900 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 25.1792, |
| "grad_norm": 0.265625, |
| "learning_rate": 0.003, |
| "loss": 2.2609, |
| "router_z_loss": 0.0, |
| "step": 4910 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 25.2304, |
| "grad_norm": 0.232421875, |
| "learning_rate": 0.003, |
| "loss": 2.2573, |
| "router_z_loss": 0.0, |
| "step": 4920 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 25.2816, |
| "grad_norm": 0.201171875, |
| "learning_rate": 0.003, |
| "loss": 2.2669, |
| "router_z_loss": 0.0, |
| "step": 4930 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 25.3328, |
| "grad_norm": 0.26953125, |
| "learning_rate": 0.003, |
| "loss": 2.2681, |
| "router_z_loss": 0.0, |
| "step": 4940 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 25.384, |
| "grad_norm": 0.248046875, |
| "learning_rate": 0.003, |
| "loss": 2.2659, |
| "router_z_loss": 0.0, |
| "step": 4950 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 25.4352, |
| "grad_norm": 0.3046875, |
| "learning_rate": 0.003, |
| "loss": 2.2657, |
| "router_z_loss": 0.0, |
| "step": 4960 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 25.4864, |
| "grad_norm": 0.32421875, |
| "learning_rate": 0.003, |
| "loss": 2.2681, |
| "router_z_loss": 0.0, |
| "step": 4970 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 25.5376, |
| "grad_norm": 0.2021484375, |
| "learning_rate": 0.003, |
| "loss": 2.2655, |
| "router_z_loss": 0.0, |
| "step": 4980 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 25.5888, |
| "grad_norm": 0.251953125, |
| "learning_rate": 0.003, |
| "loss": 2.2683, |
| "router_z_loss": 0.0, |
| "step": 4990 |
| }, |
| { |
| "balancing_entropy": 0.0, |
| "balancing_loss": 0.0, |
| "epoch": 25.64, |
| "grad_norm": 0.240234375, |
| "learning_rate": 0.003, |
| "loss": 2.2736, |
| "router_z_loss": 0.0, |
| "step": 5000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 19500, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 100, |
| "save_steps": 5000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.066322671894528e+17, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|