{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.01,
  "eval_steps": 500,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1e-05,
      "grad_norm": 1.085229352812366,
      "learning_rate": 3e-06,
      "loss": 10.849,
      "step": 1
    },
    {
      "epoch": 2e-05,
      "grad_norm": 1.0764689186661929,
      "learning_rate": 6e-06,
      "loss": 10.8489,
      "step": 2
    },
    {
      "epoch": 3e-05,
      "grad_norm": 1.0926036068515363,
      "learning_rate": 9e-06,
      "loss": 10.8486,
      "step": 3
    },
    {
      "epoch": 4e-05,
      "grad_norm": 1.0859011783792423,
      "learning_rate": 1.2e-05,
      "loss": 10.848,
      "step": 4
    },
    {
      "epoch": 5e-05,
      "grad_norm": 1.0906873388641662,
      "learning_rate": 1.5e-05,
      "loss": 10.8453,
      "step": 5
    },
    {
      "epoch": 6e-05,
      "grad_norm": 1.0895888734627917,
      "learning_rate": 1.8e-05,
      "loss": 10.8447,
      "step": 6
    },
    {
      "epoch": 7e-05,
      "grad_norm": 1.0913329404483254,
      "learning_rate": 2.1000000000000002e-05,
      "loss": 10.8355,
      "step": 7
    },
    {
      "epoch": 8e-05,
      "grad_norm": 1.0766237663279077,
      "learning_rate": 2.4e-05,
      "loss": 10.8141,
      "step": 8
    },
    {
      "epoch": 9e-05,
      "grad_norm": 1.0617425338278697,
      "learning_rate": 2.7e-05,
      "loss": 10.8099,
      "step": 9
    },
    {
      "epoch": 0.0001,
      "grad_norm": 1.080271312928173,
      "learning_rate": 3e-05,
      "loss": 10.7983,
      "step": 10
    },
    {
      "epoch": 0.00011,
      "grad_norm": 1.0663119503184246,
      "learning_rate": 3.2999999999999996e-05,
      "loss": 10.7844,
      "step": 11
    },
    {
      "epoch": 0.00012,
      "grad_norm": 1.0694505502575085,
      "learning_rate": 3.6e-05,
      "loss": 10.7739,
      "step": 12
    },
    {
      "epoch": 0.00013,
      "grad_norm": 1.0509206118753271,
      "learning_rate": 3.9e-05,
      "loss": 10.7554,
      "step": 13
    },
    {
      "epoch": 0.00014,
      "grad_norm": 1.0427932624919014,
      "learning_rate": 4.2000000000000004e-05,
      "loss": 10.7452,
      "step": 14
    },
    {
      "epoch": 0.00015,
      "grad_norm": 1.0323600599358198,
      "learning_rate": 4.4999999999999996e-05,
      "loss": 10.736,
      "step": 15
    },
    {
      "epoch": 0.00016,
      "grad_norm": 1.0039727604034705,
      "learning_rate": 4.8e-05,
      "loss": 10.7191,
      "step": 16
    },
    {
      "epoch": 0.00017,
      "grad_norm": 0.9736033767894897,
      "learning_rate": 5.1000000000000006e-05,
      "loss": 10.7049,
      "step": 17
    },
    {
      "epoch": 0.00018,
      "grad_norm": 0.9665971933165531,
      "learning_rate": 5.4e-05,
      "loss": 10.69,
      "step": 18
    },
    {
      "epoch": 0.00019,
      "grad_norm": 0.9428089164290921,
      "learning_rate": 5.7e-05,
      "loss": 10.6767,
      "step": 19
    },
    {
      "epoch": 0.0002,
      "grad_norm": 0.943192779381936,
      "learning_rate": 6e-05,
      "loss": 10.666,
      "step": 20
    },
    {
      "epoch": 0.00021,
      "grad_norm": 0.9246956411226879,
      "learning_rate": 6.3e-05,
      "loss": 10.6525,
      "step": 21
    },
    {
      "epoch": 0.00022,
      "grad_norm": 0.9206776841693647,
      "learning_rate": 6.599999999999999e-05,
      "loss": 10.6378,
      "step": 22
    },
    {
      "epoch": 0.00023,
      "grad_norm": 0.9152142474292139,
      "learning_rate": 6.9e-05,
      "loss": 10.626,
      "step": 23
    },
    {
      "epoch": 0.00024,
      "grad_norm": 0.9101286615991204,
      "learning_rate": 7.2e-05,
      "loss": 10.613,
      "step": 24
    },
    {
      "epoch": 0.00025,
      "grad_norm": 0.9100685942512898,
      "learning_rate": 7.500000000000001e-05,
      "loss": 10.5998,
      "step": 25
    },
    {
      "epoch": 0.00026,
      "grad_norm": 0.9093157110032968,
      "learning_rate": 7.8e-05,
      "loss": 10.5859,
      "step": 26
    },
    {
      "epoch": 0.00027,
      "grad_norm": 0.9036082412984342,
      "learning_rate": 8.1e-05,
      "loss": 10.5742,
      "step": 27
    },
    {
      "epoch": 0.00028,
      "grad_norm": 0.9033464449252919,
      "learning_rate": 8.400000000000001e-05,
      "loss": 10.5612,
      "step": 28
    },
    {
      "epoch": 0.00029,
      "grad_norm": 0.9067163029768021,
      "learning_rate": 8.7e-05,
      "loss": 10.5465,
      "step": 29
    },
    {
      "epoch": 0.0003,
      "grad_norm": 0.9152916501496612,
      "learning_rate": 8.999999999999999e-05,
      "loss": 10.5321,
      "step": 30
    },
    {
      "epoch": 0.00031,
      "grad_norm": 0.9122391999445313,
      "learning_rate": 9.3e-05,
      "loss": 10.5167,
      "step": 31
    },
    {
      "epoch": 0.00032,
      "grad_norm": 0.9090668181794543,
      "learning_rate": 9.6e-05,
      "loss": 10.5034,
      "step": 32
    },
    {
      "epoch": 0.00033,
      "grad_norm": 0.9095629132760054,
      "learning_rate": 9.900000000000001e-05,
      "loss": 10.4865,
      "step": 33
    },
    {
      "epoch": 0.00034,
      "grad_norm": 0.9107692784496799,
      "learning_rate": 0.00010200000000000001,
      "loss": 10.4706,
      "step": 34
    },
    {
      "epoch": 0.00035,
      "grad_norm": 0.9162740070354684,
      "learning_rate": 0.00010500000000000002,
      "loss": 10.4535,
      "step": 35
    },
    {
      "epoch": 0.00036,
      "grad_norm": 0.9114324016484531,
      "learning_rate": 0.000108,
      "loss": 10.4367,
      "step": 36
    },
    {
      "epoch": 0.00037,
      "grad_norm": 0.901331547883087,
      "learning_rate": 0.000111,
      "loss": 10.4196,
      "step": 37
    },
    {
      "epoch": 0.00038,
      "grad_norm": 0.9056352937834914,
      "learning_rate": 0.000114,
      "loss": 10.4003,
      "step": 38
    },
    {
      "epoch": 0.00039,
      "grad_norm": 0.9071824304759276,
      "learning_rate": 0.000117,
      "loss": 10.3815,
      "step": 39
    },
    {
      "epoch": 0.0004,
      "grad_norm": 0.911150856035614,
      "learning_rate": 0.00012,
      "loss": 10.3599,
      "step": 40
    },
    {
      "epoch": 0.00041,
      "grad_norm": 0.9001357892993758,
      "learning_rate": 0.000123,
      "loss": 10.3411,
      "step": 41
    },
    {
      "epoch": 0.00042,
      "grad_norm": 0.9057215347961051,
      "learning_rate": 0.000126,
      "loss": 10.3208,
      "step": 42
    },
    {
      "epoch": 0.00043,
      "grad_norm": 0.9101872556211666,
      "learning_rate": 0.000129,
      "loss": 10.2978,
      "step": 43
    },
    {
      "epoch": 0.00044,
      "grad_norm": 0.9097345366629672,
      "learning_rate": 0.00013199999999999998,
      "loss": 10.2768,
      "step": 44
    },
    {
      "epoch": 0.00045,
      "grad_norm": 0.9111670268341607,
      "learning_rate": 0.000135,
      "loss": 10.2539,
      "step": 45
    },
    {
      "epoch": 0.00046,
      "grad_norm": 0.9030473796809102,
      "learning_rate": 0.000138,
      "loss": 10.2322,
      "step": 46
    },
    {
      "epoch": 0.00047,
      "grad_norm": 0.9086055117133346,
      "learning_rate": 0.000141,
      "loss": 10.2064,
      "step": 47
    },
    {
      "epoch": 0.00048,
      "grad_norm": 0.9060414917101882,
      "learning_rate": 0.000144,
      "loss": 10.1837,
      "step": 48
    },
    {
      "epoch": 0.00049,
      "grad_norm": 0.9059366436676172,
      "learning_rate": 0.000147,
      "loss": 10.1599,
      "step": 49
    },
    {
      "epoch": 0.0005,
      "grad_norm": 0.9106165244124662,
      "learning_rate": 0.00015000000000000001,
      "loss": 10.1343,
      "step": 50
    },
    {
      "epoch": 0.00051,
      "grad_norm": 0.9110992341155927,
      "learning_rate": 0.000153,
      "loss": 10.1078,
      "step": 51
    },
    {
      "epoch": 0.00052,
      "grad_norm": 0.9077294523989683,
      "learning_rate": 0.000156,
      "loss": 10.0815,
      "step": 52
    },
    {
      "epoch": 0.00053,
      "grad_norm": 0.8995624264210066,
      "learning_rate": 0.000159,
      "loss": 10.0581,
      "step": 53
    },
    {
      "epoch": 0.00054,
      "grad_norm": 0.9187536344258231,
      "learning_rate": 0.000162,
      "loss": 10.026,
      "step": 54
    },
    {
      "epoch": 0.00055,
      "grad_norm": 0.9102867236672618,
      "learning_rate": 0.000165,
      "loss": 10.0019,
      "step": 55
    },
    {
      "epoch": 0.00056,
      "grad_norm": 0.9031170239719724,
      "learning_rate": 0.00016800000000000002,
      "loss": 9.9743,
      "step": 56
    },
    {
      "epoch": 0.00057,
      "grad_norm": 0.9090239107255728,
      "learning_rate": 0.000171,
      "loss": 9.9467,
      "step": 57
    },
    {
      "epoch": 0.00058,
      "grad_norm": 0.9070896749665766,
      "learning_rate": 0.000174,
      "loss": 9.9223,
      "step": 58
    },
    {
      "epoch": 0.00059,
      "grad_norm": 0.9116582783399498,
      "learning_rate": 0.000177,
      "loss": 9.8905,
      "step": 59
    },
    {
      "epoch": 0.0006,
      "grad_norm": 0.899071209460366,
      "learning_rate": 0.00017999999999999998,
      "loss": 9.8642,
      "step": 60
    },
    {
      "epoch": 0.00061,
      "grad_norm": 0.8984032767613607,
      "learning_rate": 0.000183,
      "loss": 9.8367,
      "step": 61
    },
    {
      "epoch": 0.00062,
      "grad_norm": 0.9043779152804675,
      "learning_rate": 0.000186,
      "loss": 9.8058,
      "step": 62
    },
    {
      "epoch": 0.00063,
      "grad_norm": 0.8969230453763369,
      "learning_rate": 0.000189,
      "loss": 9.7784,
      "step": 63
    },
    {
      "epoch": 0.00064,
      "grad_norm": 0.8929896633083918,
      "learning_rate": 0.000192,
      "loss": 9.7485,
      "step": 64
    },
    {
      "epoch": 0.00065,
      "grad_norm": 0.9016512171852502,
      "learning_rate": 0.00019500000000000002,
      "loss": 9.7165,
      "step": 65
    },
    {
      "epoch": 0.00066,
      "grad_norm": 0.8946267084059,
      "learning_rate": 0.00019800000000000002,
      "loss": 9.6927,
      "step": 66
    },
    {
      "epoch": 0.00067,
      "grad_norm": 0.8963872265737496,
      "learning_rate": 0.000201,
      "loss": 9.6633,
      "step": 67
    },
    {
      "epoch": 0.00068,
      "grad_norm": 0.9035635569767697,
      "learning_rate": 0.00020400000000000003,
      "loss": 9.6313,
      "step": 68
    },
    {
      "epoch": 0.00069,
      "grad_norm": 0.8884430485081615,
      "learning_rate": 0.00020700000000000002,
      "loss": 9.604,
      "step": 69
    },
    {
      "epoch": 0.0007,
      "grad_norm": 0.901710386427562,
      "learning_rate": 0.00021000000000000004,
      "loss": 9.5675,
      "step": 70
    },
    {
      "epoch": 0.00071,
      "grad_norm": 0.8913815692585527,
      "learning_rate": 0.00021299999999999997,
      "loss": 9.5403,
      "step": 71
    },
    {
      "epoch": 0.00072,
      "grad_norm": 0.8947982003050186,
      "learning_rate": 0.000216,
      "loss": 9.5138,
      "step": 72
    },
    {
      "epoch": 0.00073,
      "grad_norm": 0.8936046055705469,
      "learning_rate": 0.00021899999999999998,
      "loss": 9.4802,
      "step": 73
    },
    {
      "epoch": 0.00074,
      "grad_norm": 0.894403551075387,
      "learning_rate": 0.000222,
      "loss": 9.4539,
      "step": 74
    },
    {
      "epoch": 0.00075,
      "grad_norm": 0.8897139679621429,
      "learning_rate": 0.000225,
      "loss": 9.4187,
      "step": 75
    },
    {
      "epoch": 0.00076,
      "grad_norm": 0.89755782108852,
      "learning_rate": 0.000228,
      "loss": 9.3929,
      "step": 76
    },
    {
      "epoch": 0.00077,
      "grad_norm": 0.8917412779828411,
      "learning_rate": 0.000231,
      "loss": 9.3649,
      "step": 77
    },
    {
      "epoch": 0.00078,
      "grad_norm": 0.886930008239094,
      "learning_rate": 0.000234,
      "loss": 9.3351,
      "step": 78
    },
    {
      "epoch": 0.00079,
      "grad_norm": 0.8893115023720741,
      "learning_rate": 0.00023700000000000001,
      "loss": 9.2992,
      "step": 79
    },
    {
      "epoch": 0.0008,
      "grad_norm": 0.8917772276535281,
      "learning_rate": 0.00024,
      "loss": 9.2632,
      "step": 80
    },
    {
      "epoch": 0.00081,
      "grad_norm": 0.8971231461853929,
      "learning_rate": 0.00024300000000000002,
      "loss": 9.2316,
      "step": 81
    },
    {
      "epoch": 0.00082,
      "grad_norm": 0.9071155901294882,
      "learning_rate": 0.000246,
      "loss": 9.206,
      "step": 82
    },
    {
      "epoch": 0.00083,
      "grad_norm": 0.8963243651316662,
      "learning_rate": 0.00024900000000000004,
      "loss": 9.1708,
      "step": 83
    },
    {
      "epoch": 0.00084,
      "grad_norm": 0.9002519637093493,
      "learning_rate": 0.000252,
      "loss": 9.1395,
      "step": 84
    },
    {
      "epoch": 0.00085,
      "grad_norm": 0.8962870771351267,
      "learning_rate": 0.000255,
      "loss": 9.1159,
      "step": 85
    },
    {
      "epoch": 0.00086,
      "grad_norm": 0.8973951423301171,
      "learning_rate": 0.000258,
      "loss": 9.0799,
      "step": 86
    },
    {
      "epoch": 0.00087,
      "grad_norm": 0.8894399774297843,
      "learning_rate": 0.000261,
      "loss": 9.0571,
      "step": 87
    },
    {
      "epoch": 0.00088,
      "grad_norm": 0.8954974565265822,
      "learning_rate": 0.00026399999999999997,
      "loss": 9.0211,
      "step": 88
    },
    {
      "epoch": 0.00089,
      "grad_norm": 0.8903887799592864,
      "learning_rate": 0.000267,
      "loss": 8.9943,
      "step": 89
    },
    {
      "epoch": 0.0009,
      "grad_norm": 0.8896831425922581,
      "learning_rate": 0.00027,
      "loss": 8.9666,
      "step": 90
    },
    {
      "epoch": 0.00091,
      "grad_norm": 0.8841935030693385,
      "learning_rate": 0.000273,
      "loss": 8.9365,
      "step": 91
    },
    {
      "epoch": 0.00092,
      "grad_norm": 0.8862593694142118,
      "learning_rate": 0.000276,
      "loss": 8.9052,
      "step": 92
    },
    {
      "epoch": 0.00093,
      "grad_norm": 0.8881041222250594,
      "learning_rate": 0.000279,
      "loss": 8.8752,
      "step": 93
    },
    {
      "epoch": 0.00094,
      "grad_norm": 0.8868931156100198,
      "learning_rate": 0.000282,
      "loss": 8.8494,
      "step": 94
    },
    {
      "epoch": 0.00095,
      "grad_norm": 0.8826333164427848,
      "learning_rate": 0.000285,
      "loss": 8.8203,
      "step": 95
    },
    {
      "epoch": 0.00096,
      "grad_norm": 0.8910955494857569,
      "learning_rate": 0.000288,
      "loss": 8.7905,
      "step": 96
    },
    {
      "epoch": 0.00097,
      "grad_norm": 0.8809279978723125,
      "learning_rate": 0.000291,
      "loss": 8.7668,
      "step": 97
    },
    {
      "epoch": 0.00098,
      "grad_norm": 0.8829202734417477,
      "learning_rate": 0.000294,
      "loss": 8.73,
      "step": 98
    },
    {
      "epoch": 0.00099,
      "grad_norm": 0.8858154061322314,
      "learning_rate": 0.000297,
      "loss": 8.7072,
      "step": 99
    },
    {
      "epoch": 0.001,
      "grad_norm": 0.8861735275161908,
      "learning_rate": 0.00030000000000000003,
      "loss": 8.6806,
      "step": 100
    },
    {
      "epoch": 0.00101,
      "grad_norm": 0.8886468466177152,
      "learning_rate": 0.00030300000000000005,
      "loss": 8.6455,
      "step": 101
    },
    {
      "epoch": 0.00102,
      "grad_norm": 0.8755547614914917,
      "learning_rate": 0.000306,
      "loss": 8.625,
      "step": 102
    },
    {
      "epoch": 0.00103,
      "grad_norm": 0.879237266457317,
      "learning_rate": 0.000309,
      "loss": 8.6003,
      "step": 103
    },
    {
      "epoch": 0.00104,
      "grad_norm": 0.8697854479956653,
      "learning_rate": 0.000312,
      "loss": 8.5786,
      "step": 104
    },
    {
      "epoch": 0.00105,
      "grad_norm": 0.8733949043334459,
      "learning_rate": 0.000315,
      "loss": 8.552,
      "step": 105
    },
    {
      "epoch": 0.00106,
      "grad_norm": 0.8706148334292045,
      "learning_rate": 0.000318,
      "loss": 8.5224,
      "step": 106
    },
    {
      "epoch": 0.00107,
      "grad_norm": 0.8653683244965015,
      "learning_rate": 0.000321,
      "loss": 8.4981,
      "step": 107
    },
    {
      "epoch": 0.00108,
      "grad_norm": 0.8737239863866451,
      "learning_rate": 0.000324,
      "loss": 8.4698,
      "step": 108
    },
    {
      "epoch": 0.00109,
      "grad_norm": 0.8706512729216435,
      "learning_rate": 0.000327,
      "loss": 8.4501,
      "step": 109
    },
    {
      "epoch": 0.0011,
      "grad_norm": 0.8842127152624679,
      "learning_rate": 0.00033,
      "loss": 8.4274,
      "step": 110
    },
    {
      "epoch": 0.00111,
      "grad_norm": 0.9007754832304464,
      "learning_rate": 0.000333,
      "loss": 8.3985,
      "step": 111
    },
    {
      "epoch": 0.00112,
      "grad_norm": 0.9159855921530741,
      "learning_rate": 0.00033600000000000004,
      "loss": 8.3784,
      "step": 112
    },
    {
      "epoch": 0.00113,
      "grad_norm": 0.9063278036144603,
      "learning_rate": 0.000339,
      "loss": 8.3391,
      "step": 113
    },
    {
      "epoch": 0.00114,
      "grad_norm": 0.8437820836704115,
      "learning_rate": 0.000342,
      "loss": 8.3286,
      "step": 114
    },
    {
      "epoch": 0.00115,
      "grad_norm": 0.8612821674982505,
      "learning_rate": 0.00034500000000000004,
      "loss": 8.3022,
      "step": 115
    },
    {
      "epoch": 0.00116,
      "grad_norm": 0.8581012057508914,
      "learning_rate": 0.000348,
      "loss": 8.2786,
      "step": 116
    },
    {
      "epoch": 0.00117,
      "grad_norm": 0.834681162463853,
      "learning_rate": 0.000351,
      "loss": 8.2472,
      "step": 117
    },
    {
      "epoch": 0.00118,
      "grad_norm": 0.8526508674143746,
      "learning_rate": 0.000354,
      "loss": 8.2312,
      "step": 118
    },
    {
      "epoch": 0.00119,
      "grad_norm": 0.8471842679679056,
      "learning_rate": 0.000357,
      "loss": 8.2186,
      "step": 119
    },
    {
      "epoch": 0.0012,
      "grad_norm": 0.8238029079166322,
      "learning_rate": 0.00035999999999999997,
      "loss": 8.1917,
      "step": 120
    },
    {
      "epoch": 0.00121,
      "grad_norm": 0.832770075662114,
      "learning_rate": 0.000363,
      "loss": 8.1641,
      "step": 121
    },
    {
      "epoch": 0.00122,
      "grad_norm": 0.8176689288160716,
      "learning_rate": 0.000366,
      "loss": 8.1471,
      "step": 122
    },
    {
      "epoch": 0.00123,
      "grad_norm": 0.8121975389696077,
      "learning_rate": 0.000369,
      "loss": 8.1243,
      "step": 123
    },
    {
      "epoch": 0.00124,
      "grad_norm": 0.8287555396618358,
      "learning_rate": 0.000372,
      "loss": 8.1037,
      "step": 124
    },
    {
      "epoch": 0.00125,
      "grad_norm": 0.8387006564379252,
      "learning_rate": 0.000375,
      "loss": 8.0802,
      "step": 125
    },
    {
      "epoch": 0.00126,
      "grad_norm": 0.7985855201767323,
      "learning_rate": 0.000378,
      "loss": 8.0672,
      "step": 126
    },
    {
      "epoch": 0.00127,
      "grad_norm": 0.7830434817433392,
      "learning_rate": 0.000381,
      "loss": 8.0437,
      "step": 127
    },
    {
      "epoch": 0.00128,
      "grad_norm": 0.8466276209824322,
      "learning_rate": 0.000384,
      "loss": 8.0264,
      "step": 128
    },
    {
      "epoch": 0.00129,
      "grad_norm": 1.1259201196462498,
      "learning_rate": 0.00038700000000000003,
      "loss": 8.0199,
      "step": 129
    },
    {
      "epoch": 0.0013,
      "grad_norm": 1.356061683657538,
      "learning_rate": 0.00039000000000000005,
      "loss": 8.0054,
      "step": 130
    },
    {
      "epoch": 0.00131,
      "grad_norm": 0.781068809712636,
      "learning_rate": 0.000393,
      "loss": 7.9576,
      "step": 131
    },
    {
      "epoch": 0.00132,
      "grad_norm": 1.0367188815921278,
      "learning_rate": 0.00039600000000000003,
      "loss": 7.9591,
      "step": 132
    },
    {
      "epoch": 0.00133,
      "grad_norm": 0.7841067594214959,
      "learning_rate": 0.00039900000000000005,
      "loss": 7.9293,
      "step": 133
    },
    {
      "epoch": 0.00134,
      "grad_norm": 0.8594113418007541,
      "learning_rate": 0.000402,
      "loss": 7.9217,
      "step": 134
    },
    {
      "epoch": 0.00135,
      "grad_norm": 0.7468719464596628,
      "learning_rate": 0.00040500000000000003,
      "loss": 7.8883,
      "step": 135
    },
    {
      "epoch": 0.00136,
      "grad_norm": 0.8415432392576198,
      "learning_rate": 0.00040800000000000005,
      "loss": 7.8771,
      "step": 136
    },
    {
      "epoch": 0.00137,
      "grad_norm": 0.7386877288273068,
      "learning_rate": 0.000411,
      "loss": 7.8542,
      "step": 137
    },
    {
      "epoch": 0.00138,
      "grad_norm": 0.7450176106214967,
      "learning_rate": 0.00041400000000000003,
      "loss": 7.8322,
      "step": 138
    },
    {
      "epoch": 0.00139,
      "grad_norm": 0.7424698225823185,
      "learning_rate": 0.00041700000000000005,
      "loss": 7.8197,
      "step": 139
    },
    {
      "epoch": 0.0014,
      "grad_norm": 0.7268306219948636,
      "learning_rate": 0.00042000000000000007,
      "loss": 7.8018,
      "step": 140
    },
    {
      "epoch": 0.00141,
      "grad_norm": 0.7115037341291065,
      "learning_rate": 0.000423,
      "loss": 7.7905,
      "step": 141
    },
    {
      "epoch": 0.00142,
      "grad_norm": 0.6725409058271569,
      "learning_rate": 0.00042599999999999995,
      "loss": 7.7772,
      "step": 142
    },
    {
      "epoch": 0.00143,
      "grad_norm": 0.6830809074405504,
      "learning_rate": 0.00042899999999999997,
      "loss": 7.7496,
      "step": 143
    },
    {
      "epoch": 0.00144,
      "grad_norm": 0.6791849969278475,
      "learning_rate": 0.000432,
      "loss": 7.7318,
      "step": 144
    },
    {
      "epoch": 0.00145,
      "grad_norm": 0.6462817248800249,
      "learning_rate": 0.000435,
      "loss": 7.7251,
      "step": 145
    },
    {
      "epoch": 0.00146,
      "grad_norm": 0.6695930969912641,
      "learning_rate": 0.00043799999999999997,
      "loss": 7.6893,
      "step": 146
    },
    {
      "epoch": 0.00147,
      "grad_norm": 0.7012896651032599,
      "learning_rate": 0.000441,
      "loss": 7.6817,
      "step": 147
    },
    {
      "epoch": 0.00148,
      "grad_norm": 0.7755580877429182,
      "learning_rate": 0.000444,
      "loss": 7.6698,
      "step": 148
    },
    {
      "epoch": 0.00149,
      "grad_norm": 1.088947674236225,
      "learning_rate": 0.00044699999999999997,
      "loss": 7.6713,
      "step": 149
    },
    {
      "epoch": 0.0015,
      "grad_norm": 0.9396541473912592,
      "learning_rate": 0.00045,
      "loss": 7.6436,
      "step": 150
    },
    {
      "epoch": 0.00151,
      "grad_norm": 0.640217037835256,
      "learning_rate": 0.000453,
      "loss": 7.6133,
      "step": 151
    },
    {
      "epoch": 0.00152,
      "grad_norm": 0.9098496631236208,
      "learning_rate": 0.000456,
      "loss": 7.6149,
      "step": 152
    },
    {
      "epoch": 0.00153,
      "grad_norm": 0.6175759444520236,
      "learning_rate": 0.000459,
      "loss": 7.5962,
      "step": 153
    },
    {
      "epoch": 0.00154,
      "grad_norm": 0.6884616968083866,
      "learning_rate": 0.000462,
      "loss": 7.5772,
      "step": 154
    },
    {
      "epoch": 0.00155,
      "grad_norm": 0.6360938051960316,
      "learning_rate": 0.000465,
      "loss": 7.5598,
      "step": 155
    },
    {
      "epoch": 0.00156,
      "grad_norm": 0.6937352973699618,
      "learning_rate": 0.000468,
      "loss": 7.5366,
      "step": 156
    },
    {
      "epoch": 0.00157,
      "grad_norm": 0.6334556877551312,
      "learning_rate": 0.000471,
      "loss": 7.53,
      "step": 157
    },
    {
      "epoch": 0.00158,
      "grad_norm": 0.9707170671679272,
      "learning_rate": 0.00047400000000000003,
      "loss": 7.5153,
      "step": 158
    },
    {
      "epoch": 0.00159,
      "grad_norm": 1.0240750882994218,
      "learning_rate": 0.000477,
      "loss": 7.5076,
      "step": 159
    },
    {
      "epoch": 0.0016,
      "grad_norm": 0.9322894026235434,
      "learning_rate": 0.00048,
      "loss": 7.4836,
      "step": 160
    },
    {
      "epoch": 0.00161,
      "grad_norm": 0.5279037898518898,
      "learning_rate": 0.00048300000000000003,
      "loss": 7.4527,
      "step": 161
    },
    {
      "epoch": 0.00162,
      "grad_norm": 0.687356662308957,
      "learning_rate": 0.00048600000000000005,
      "loss": 7.4601,
      "step": 162
    },
    {
      "epoch": 0.00163,
      "grad_norm": 0.5623951705594973,
      "learning_rate": 0.0004890000000000001,
      "loss": 7.4388,
      "step": 163
    },
    {
      "epoch": 0.00164,
      "grad_norm": 0.5581337114560441,
      "learning_rate": 0.000492,
      "loss": 7.4399,
      "step": 164
    },
    {
      "epoch": 0.00165,
      "grad_norm": 0.5516159301488641,
      "learning_rate": 0.000495,
      "loss": 7.4126,
      "step": 165
    },
    {
      "epoch": 0.00166,
      "grad_norm": 0.5242244578051735,
      "learning_rate": 0.0004980000000000001,
      "loss": 7.3876,
      "step": 166
    },
    {
      "epoch": 0.00167,
      "grad_norm": 0.45997959649003123,
      "learning_rate": 0.000501,
      "loss": 7.3779,
      "step": 167
    },
    {
      "epoch": 0.00168,
      "grad_norm": 0.5436289820614866,
      "learning_rate": 0.000504,
      "loss": 7.3569,
      "step": 168
    },
    {
      "epoch": 0.00169,
      "grad_norm": 0.4983067598465849,
      "learning_rate": 0.0005070000000000001,
      "loss": 7.3495,
      "step": 169
    },
    {
      "epoch": 0.0017,
      "grad_norm": 0.4402852485923817,
      "learning_rate": 0.00051,
      "loss": 7.3316,
      "step": 170
    },
    {
      "epoch": 0.00171,
      "grad_norm": 0.5221521396945228,
      "learning_rate": 0.000513,
      "loss": 7.3138,
      "step": 171
    },
    {
      "epoch": 0.00172,
      "grad_norm": 0.45544347662440743,
      "learning_rate": 0.000516,
      "loss": 7.3129,
      "step": 172
    },
    {
      "epoch": 0.00173,
      "grad_norm": 0.4745602833877857,
      "learning_rate": 0.0005189999999999999,
      "loss": 7.2961,
      "step": 173
    },
    {
      "epoch": 0.00174,
      "grad_norm": 0.5121580158942174,
      "learning_rate": 0.000522,
      "loss": 7.311,
      "step": 174
    },
    {
      "epoch": 0.00175,
      "grad_norm": 0.680505499537256,
      "learning_rate": 0.000525,
      "loss": 7.2769,
      "step": 175
    },
    {
      "epoch": 0.00176,
      "grad_norm": 0.7210959926983863,
      "learning_rate": 0.0005279999999999999,
      "loss": 7.2549,
      "step": 176
    },
    {
      "epoch": 0.00177,
      "grad_norm": 0.7510526045152774,
      "learning_rate": 0.000531,
      "loss": 7.2572,
      "step": 177
    },
    {
      "epoch": 0.00178,
      "grad_norm": 0.7343359791017195,
      "learning_rate": 0.000534,
      "loss": 7.2506,
      "step": 178
    },
    {
      "epoch": 0.00179,
      "grad_norm": 0.6185909230661502,
      "learning_rate": 0.000537,
      "loss": 7.2371,
      "step": 179
    },
    {
      "epoch": 0.0018,
      "grad_norm": 0.42887581745789505,
      "learning_rate": 0.00054,
      "loss": 7.2041,
      "step": 180
    },
    {
      "epoch": 0.00181,
      "grad_norm": 0.5359194273155663,
      "learning_rate": 0.000543,
      "loss": 7.1961,
      "step": 181
    },
    {
      "epoch": 0.00182,
      "grad_norm": 0.4369454993609359,
      "learning_rate": 0.000546,
      "loss": 7.188,
      "step": 182
    },
    {
      "epoch": 0.00183,
      "grad_norm": 0.41001210068633426,
      "learning_rate": 0.000549,
      "loss": 7.1769,
      "step": 183
    },
    {
      "epoch": 0.00184,
      "grad_norm": 0.4396906853721559,
      "learning_rate": 0.000552,
      "loss": 7.1646,
      "step": 184
    },
    {
      "epoch": 0.00185,
      "grad_norm": 0.4311329351383538,
      "learning_rate": 0.000555,
      "loss": 7.1403,
      "step": 185
    },
    {
      "epoch": 0.00186,
      "grad_norm": 0.44013221474943204,
      "learning_rate": 0.000558,
      "loss": 7.1495,
      "step": 186
    },
    {
      "epoch": 0.00187,
      "grad_norm": 0.42719641993731927,
      "learning_rate": 0.000561,
      "loss": 7.1244,
      "step": 187
    },
    {
      "epoch": 0.00188,
      "grad_norm": 0.39192004433941763,
      "learning_rate": 0.000564,
      "loss": 7.1022,
      "step": 188
    },
    {
      "epoch": 0.00189,
      "grad_norm": 0.5474017259570552,
      "learning_rate": 0.000567,
      "loss": 7.1002,
      "step": 189
    },
    {
      "epoch": 0.0019,
      "grad_norm": 0.5320208949420774,
      "learning_rate": 0.00057,
      "loss": 7.0824,
      "step": 190
    },
    {
      "epoch": 0.00191,
      "grad_norm": 0.7661733453917681,
      "learning_rate": 0.000573,
      "loss": 7.0764,
      "step": 191
    },
    {
      "epoch": 0.00192,
      "grad_norm": 0.9858385620454592,
      "learning_rate": 0.000576,
      "loss": 7.0775,
      "step": 192
    },
    {
      "epoch": 0.00193,
      "grad_norm": 0.9591489873604585,
      "learning_rate": 0.000579,
      "loss": 7.0725,
      "step": 193
    },
    {
      "epoch": 0.00194,
      "grad_norm": 0.8774582305545432,
      "learning_rate": 0.000582,
      "loss": 7.0715,
      "step": 194
    },
    {
      "epoch": 0.00195,
      "grad_norm": 0.9015156284206135,
      "learning_rate": 0.000585,
      "loss": 7.0487,
      "step": 195
    },
    {
      "epoch": 0.00196,
      "grad_norm": 0.8000421121121074,
      "learning_rate": 0.000588,
      "loss": 7.0339,
      "step": 196
    },
    {
      "epoch": 0.00197,
      "grad_norm": 0.8468645119231468,
      "learning_rate": 0.000591,
      "loss": 7.0329,
      "step": 197
    },
    {
      "epoch": 0.00198,
      "grad_norm": 0.6924161058762034,
      "learning_rate": 0.000594,
      "loss": 7.0197,
      "step": 198
    },
    {
      "epoch": 0.00199,
      "grad_norm": 0.5671884633245193,
      "learning_rate": 0.0005970000000000001,
      "loss": 6.9992,
      "step": 199
    },
    {
      "epoch": 0.002,
      "grad_norm": 0.5557515599131739,
      "learning_rate": 0.0006000000000000001,
      "loss": 6.9982,
      "step": 200
    },
    {
      "epoch": 0.00201,
      "grad_norm": 0.4619645912916116,
      "learning_rate": 0.000603,
      "loss": 6.9694,
      "step": 201
    },
    {
      "epoch": 0.00202,
      "grad_norm": 0.4929222182059183,
      "learning_rate": 0.0006060000000000001,
      "loss": 6.9601,
      "step": 202
    },
    {
      "epoch": 0.00203,
      "grad_norm": 0.37702087689736097,
      "learning_rate": 0.0006090000000000001,
      "loss": 6.9599,
      "step": 203
    },
    {
      "epoch": 0.00204,
      "grad_norm": 0.45518892473332057,
      "learning_rate": 0.000612,
      "loss": 6.9476,
      "step": 204
    },
    {
      "epoch": 0.00205,
      "grad_norm": 0.384809834745988,
      "learning_rate": 0.000615,
      "loss": 6.9322,
      "step": 205
    },
    {
      "epoch": 0.00206,
      "grad_norm": 0.38976835520829006,
      "learning_rate": 0.000618,
      "loss": 6.9255,
      "step": 206
    },
    {
      "epoch": 0.00207,
      "grad_norm": 0.4375749283114229,
      "learning_rate": 0.000621,
      "loss": 6.9166,
      "step": 207
    },
    {
      "epoch": 0.00208,
      "grad_norm": 0.6317927736115376,
      "learning_rate": 0.000624,
      "loss": 6.9157,
      "step": 208
    },
    {
      "epoch": 0.00209,
      "grad_norm": 0.7772617938427908,
      "learning_rate": 0.000627,
      "loss": 6.9007,
      "step": 209
    },
    {
      "epoch": 0.0021,
      "grad_norm": 1.2298132092555871,
      "learning_rate": 0.00063,
      "loss": 6.9015,
      "step": 210
    },
    {
      "epoch": 0.00211,
      "grad_norm": 0.7435502579532725,
      "learning_rate": 0.000633,
      "loss": 6.8883,
      "step": 211
    },
    {
      "epoch": 0.00212,
      "grad_norm": 0.3794792840193541,
      "learning_rate": 0.000636,
      "loss": 6.8693,
      "step": 212
    },
    {
      "epoch": 0.00213,
      "grad_norm": 0.688114698891817,
      "learning_rate": 0.000639,
      "loss": 6.8589,
      "step": 213
    },
    {
      "epoch": 0.00214,
      "grad_norm": 0.49868168831557785,
      "learning_rate": 0.000642,
      "loss": 6.8504,
      "step": 214
    },
    {
      "epoch": 0.00215,
      "grad_norm": 0.6292678113208914,
      "learning_rate": 0.000645,
      "loss": 6.8506,
      "step": 215
    },
    {
      "epoch": 0.00216,
      "grad_norm": 0.3673676300147008,
      "learning_rate": 0.000648,
      "loss": 6.8383,
      "step": 216
    },
    {
      "epoch": 0.00217,
      "grad_norm": 0.4511278951279821,
      "learning_rate": 0.000651,
      "loss": 6.8131,
      "step": 217
    },
    {
      "epoch": 0.00218,
      "grad_norm": 0.3783399316849399,
      "learning_rate": 0.000654,
      "loss": 6.8146,
      "step": 218
    },
    {
      "epoch": 0.00219,
      "grad_norm": 0.3651267362185199,
      "learning_rate": 0.000657,
      "loss": 6.7867,
      "step": 219
    },
    {
      "epoch": 0.0022,
      "grad_norm": 0.4708567379601776,
      "learning_rate": 0.00066,
      "loss": 6.8074,
      "step": 220
    },
    {
      "epoch": 0.00221,
      "grad_norm": 0.5218809374007617,
      "learning_rate": 0.0006630000000000001,
      "loss": 6.7837,
      "step": 221
    },
    {
      "epoch": 0.00222,
      "grad_norm": 0.7518789100021657,
      "learning_rate": 0.000666,
      "loss": 6.7774,
      "step": 222
    },
    {
      "epoch": 0.00223,
      "grad_norm": 0.9964198530393009,
      "learning_rate": 0.000669,
      "loss": 6.7824,
      "step": 223
    },
    {
      "epoch": 0.00224,
      "grad_norm": 0.807522133159797,
      "learning_rate": 0.0006720000000000001,
      "loss": 6.7748,
      "step": 224
    },
    {
      "epoch": 0.00225,
      "grad_norm": 0.6803785939854445,
      "learning_rate": 0.000675,
      "loss": 6.7563,
      "step": 225
    },
    {
      "epoch": 0.00226,
      "grad_norm": 1.0762485305388094,
      "learning_rate": 0.000678,
      "loss": 6.7681,
      "step": 226
    },
    {
      "epoch": 0.00227,
      "grad_norm": 0.9129777590140102,
      "learning_rate": 0.0006810000000000001,
      "loss": 6.734,
      "step": 227
    },
    {
      "epoch": 0.00228,
      "grad_norm": 1.299087365220233,
      "learning_rate": 0.000684,
      "loss": 6.7517,
      "step": 228
    },
    {
      "epoch": 0.00229,
      "grad_norm": 0.7392729796935101,
      "learning_rate": 0.000687,
      "loss": 6.7331,
      "step": 229
    },
    {
      "epoch": 0.0023,
      "grad_norm": 0.6158971034672628,
      "learning_rate": 0.0006900000000000001,
      "loss": 6.7089,
      "step": 230
    },
    {
      "epoch": 0.00231,
      "grad_norm": 0.7016695647497186,
      "learning_rate": 0.000693,
      "loss": 6.6956,
      "step": 231
    },
    {
      "epoch": 0.00232,
      "grad_norm": 0.6434185638703606,
      "learning_rate": 0.000696,
      "loss": 6.7082,
      "step": 232
    },
    {
      "epoch": 0.00233,
      "grad_norm": 0.6044879418578446,
      "learning_rate": 0.0006990000000000001,
      "loss": 6.6921,
      "step": 233
    },
    {
      "epoch": 0.00234,
      "grad_norm": 0.4480061456613071,
      "learning_rate": 0.000702,
      "loss": 6.679,
      "step": 234
    },
    {
      "epoch": 0.00235,
      "grad_norm": 0.4492075259819563,
      "learning_rate": 0.000705,
      "loss": 6.6661,
      "step": 235
    },
    {
      "epoch": 0.00236,
      "grad_norm": 0.3889271878969786,
      "learning_rate": 0.000708,
      "loss": 6.6458,
      "step": 236
    },
    {
      "epoch": 0.00237,
      "grad_norm": 0.41607485698419117,
      "learning_rate": 0.0007109999999999999,
      "loss": 6.6575,
      "step": 237
    },
    {
      "epoch": 0.00238,
      "grad_norm": 0.3515958046168432,
      "learning_rate": 0.000714,
      "loss": 6.6432,
      "step": 238
    },
    {
      "epoch": 0.00239,
      "grad_norm": 0.4049983983930021,
      "learning_rate": 0.000717,
      "loss": 6.6274,
      "step": 239
    },
    {
      "epoch": 0.0024,
      "grad_norm": 0.3247748587680522,
      "learning_rate": 0.0007199999999999999,
      "loss": 6.6414,
      "step": 240
    },
    {
      "epoch": 0.00241,
      "grad_norm": 0.36262136655648425,
      "learning_rate": 0.000723,
      "loss": 6.6267,
      "step": 241
    },
    {
      "epoch": 0.00242,
      "grad_norm": 0.3238608381517686,
      "learning_rate": 0.000726,
      "loss": 6.598,
      "step": 242
    },
    {
      "epoch": 0.00243,
      "grad_norm": 0.31681258130419926,
      "learning_rate": 0.000729,
      "loss": 6.5877,
      "step": 243
    },
    {
      "epoch": 0.00244,
      "grad_norm": 0.3616370322868285,
      "learning_rate": 0.000732,
      "loss": 6.5911,
      "step": 244
    },
    {
      "epoch": 0.00245,
      "grad_norm": 0.4634076592576489,
      "learning_rate": 0.000735,
      "loss": 6.5815,
      "step": 245
    },
    {
      "epoch": 0.00246,
      "grad_norm": 0.7486687333799987,
      "learning_rate": 0.000738,
      "loss": 6.5693,
      "step": 246
    },
    {
      "epoch": 0.00247,
      "grad_norm": 1.662590620725261,
      "learning_rate": 0.000741,
      "loss": 6.5942,
      "step": 247
    },
    {
      "epoch": 0.00248,
      "grad_norm": 0.7962452736060305,
      "learning_rate": 0.000744,
      "loss": 6.5799,
      "step": 248
    },
    {
      "epoch": 0.00249,
      "grad_norm": 1.091558045553116,
      "learning_rate": 0.000747,
      "loss": 6.5681,
      "step": 249
    },
    {
      "epoch": 0.0025,
      "grad_norm": 1.1566118279124307,
      "learning_rate": 0.00075,
      "loss": 6.5821,
      "step": 250
    },
    {
      "epoch": 0.00251,
      "grad_norm": 0.7784764288643516,
      "learning_rate": 0.000753,
      "loss": 6.5311,
      "step": 251
    },
    {
      "epoch": 0.00252,
      "grad_norm": 0.8327004278614876,
      "learning_rate": 0.000756,
      "loss": 6.5476,
      "step": 252
    },
    {
      "epoch": 0.00253,
      "grad_norm": 0.8184368732684698,
      "learning_rate": 0.000759,
      "loss": 6.5327,
      "step": 253
    },
    {
      "epoch": 0.00254,
      "grad_norm": 1.1486609061260633,
      "learning_rate": 0.000762,
      "loss": 6.541,
      "step": 254
    },
    {
      "epoch": 0.00255,
      "grad_norm": 0.8085196503687528,
      "learning_rate": 0.0007650000000000001,
      "loss": 6.533,
      "step": 255
    },
    {
      "epoch": 0.00256,
      "grad_norm": 0.6227788989369496,
      "learning_rate": 0.000768,
      "loss": 6.5039,
      "step": 256
    },
    {
      "epoch": 0.00257,
      "grad_norm": 0.7880922816831363,
      "learning_rate": 0.000771,
      "loss": 6.5163,
      "step": 257
    },
    {
      "epoch": 0.00258,
      "grad_norm": 0.8888953354311911,
      "learning_rate": 0.0007740000000000001,
      "loss": 6.497,
      "step": 258
    },
    {
      "epoch": 0.00259,
      "grad_norm": 0.6556759438280303,
      "learning_rate": 0.000777,
      "loss": 6.491,
      "step": 259
    },
    {
      "epoch": 0.0026,
      "grad_norm": 0.5556045743581063,
      "learning_rate": 0.0007800000000000001,
      "loss": 6.4736,
      "step": 260
    },
    {
      "epoch": 0.00261,
      "grad_norm": 0.5757853625150707,
      "learning_rate": 0.0007830000000000001,
      "loss": 6.4835,
      "step": 261
    },
    {
      "epoch": 0.00262,
      "grad_norm": 0.4702991355160636,
      "learning_rate": 0.000786,
      "loss": 6.4605,
      "step": 262
    },
    {
      "epoch": 0.00263,
      "grad_norm": 0.5236270398277312,
      "learning_rate": 0.0007890000000000001,
      "loss": 6.4668,
      "step": 263
    },
    {
      "epoch": 0.00264,
      "grad_norm": 0.57918798311576,
      "learning_rate": 0.0007920000000000001,
      "loss": 6.4494,
      "step": 264
    },
    {
      "epoch": 0.00265,
      "grad_norm": 0.48386144230733374,
      "learning_rate": 0.000795,
      "loss": 6.4394,
      "step": 265
    },
    {
      "epoch": 0.00266,
      "grad_norm": 0.44170660781498655,
      "learning_rate": 0.0007980000000000001,
      "loss": 6.4372,
      "step": 266
    },
    {
      "epoch": 0.00267,
      "grad_norm": 0.35002993788441544,
      "learning_rate": 0.0008010000000000001,
      "loss": 6.4273,
      "step": 267
    },
    {
      "epoch": 0.00268,
      "grad_norm": 0.49450621569184094,
      "learning_rate": 0.000804,
      "loss": 6.426,
      "step": 268
    },
    {
      "epoch": 0.00269,
      "grad_norm": 0.5917094821390496,
      "learning_rate": 0.0008070000000000001,
      "loss": 6.4104,
      "step": 269
    },
    {
      "epoch": 0.0027,
      "grad_norm": 0.8610573273382283,
      "learning_rate": 0.0008100000000000001,
      "loss": 6.4165,
      "step": 270
    },
    {
      "epoch": 0.00271,
      "grad_norm": 0.9687752227819599,
      "learning_rate": 0.000813,
      "loss": 6.4342,
      "step": 271
    },
    {
      "epoch": 0.00272,
      "grad_norm": 0.8252297813091672,
      "learning_rate": 0.0008160000000000001,
      "loss": 6.4109,
      "step": 272
    },
    {
      "epoch": 0.00273,
      "grad_norm": 1.0509340869742279,
      "learning_rate": 0.0008190000000000001,
      "loss": 6.4091,
      "step": 273
    },
    {
      "epoch": 0.00274,
      "grad_norm": 1.2492890299831212,
      "learning_rate": 0.000822,
      "loss": 6.3984,
      "step": 274
    },
    {
      "epoch": 0.00275,
      "grad_norm": 0.7449833128353471,
      "learning_rate": 0.0008250000000000001,
      "loss": 6.3937,
      "step": 275
    },
    {
      "epoch": 0.00276,
      "grad_norm": 0.650217944416595,
      "learning_rate": 0.0008280000000000001,
      "loss": 6.383,
      "step": 276
    },
    {
      "epoch": 0.00277,
      "grad_norm": 0.6897831829588578,
      "learning_rate": 0.0008310000000000001,
      "loss": 6.3791,
      "step": 277
    },
    {
      "epoch": 0.00278,
      "grad_norm": 0.6845751052836879,
      "learning_rate": 0.0008340000000000001,
      "loss": 6.3625,
      "step": 278
    },
    {
      "epoch": 0.00279,
      "grad_norm": 0.6895680336476154,
      "learning_rate": 0.0008370000000000001,
      "loss": 6.3607,
      "step": 279
    },
    {
      "epoch": 0.0028,
      "grad_norm": 0.7962805776833526,
      "learning_rate": 0.0008400000000000001,
      "loss": 6.3497,
      "step": 280
    },
    {
      "epoch": 0.00281,
      "grad_norm": 0.7281805890288613,
      "learning_rate": 0.0008430000000000001,
      "loss": 6.3358,
      "step": 281
    },
    {
      "epoch": 0.00282,
      "grad_norm": 0.6077351466654652,
      "learning_rate": 0.000846,
      "loss": 6.3343,
      "step": 282
    },
    {
      "epoch": 0.00283,
      "grad_norm": 0.7021452115418912,
      "learning_rate": 0.0008489999999999999,
      "loss": 6.3268,
      "step": 283
    },
    {
      "epoch": 0.00284,
      "grad_norm": 0.8067589645945545,
      "learning_rate": 0.0008519999999999999,
      "loss": 6.3217,
      "step": 284
    },
    {
      "epoch": 0.00285,
      "grad_norm": 0.7238661685202905,
      "learning_rate": 0.000855,
      "loss": 6.3229,
      "step": 285
    },
    {
      "epoch": 0.00286,
      "grad_norm": 0.6566110557704579,
      "learning_rate": 0.0008579999999999999,
      "loss": 6.3002,
      "step": 286
    },
    {
      "epoch": 0.00287,
      "grad_norm": 0.5448339024770075,
      "learning_rate": 0.000861,
      "loss": 6.3016,
      "step": 287
    },
    {
      "epoch": 0.00288,
      "grad_norm": 0.4516947530472597,
      "learning_rate": 0.000864,
      "loss": 6.279,
      "step": 288
    },
    {
      "epoch": 0.00289,
      "grad_norm": 0.5415480649305122,
      "learning_rate": 0.0008669999999999999,
      "loss": 6.2929,
      "step": 289
    },
    {
      "epoch": 0.0029,
      "grad_norm": 0.5812113854990302,
      "learning_rate": 0.00087,
      "loss": 6.2743,
      "step": 290
    },
    {
      "epoch": 0.00291,
      "grad_norm": 0.5620937871349196,
      "learning_rate": 0.000873,
      "loss": 6.2679,
      "step": 291
    },
    {
      "epoch": 0.00292,
      "grad_norm": 0.43362286284353735,
      "learning_rate": 0.0008759999999999999,
      "loss": 6.2618,
      "step": 292
    },
    {
      "epoch": 0.00293,
      "grad_norm": 0.563953479113573,
      "learning_rate": 0.000879,
      "loss": 6.2491,
      "step": 293
    },
    {
      "epoch": 0.00294,
      "grad_norm": 0.5784107789505917,
      "learning_rate": 0.000882,
      "loss": 6.2636,
      "step": 294
    },
    {
      "epoch": 0.00295,
      "grad_norm": 0.5410292555611181,
      "learning_rate": 0.0008849999999999999,
      "loss": 6.2293,
      "step": 295
    },
    {
      "epoch": 0.00296,
      "grad_norm": 0.5413778994680785,
      "learning_rate": 0.000888,
      "loss": 6.2428,
      "step": 296
    },
    {
      "epoch": 0.00297,
      "grad_norm": 0.7587824592105622,
      "learning_rate": 0.000891,
      "loss": 6.2186,
      "step": 297
    },
    {
      "epoch": 0.00298,
      "grad_norm": 1.2390223277612324,
      "learning_rate": 0.0008939999999999999,
      "loss": 6.2454,
      "step": 298
    },
    {
      "epoch": 0.00299,
      "grad_norm": 1.0051162436627608,
      "learning_rate": 0.000897,
      "loss": 6.2195,
      "step": 299
    },
    {
      "epoch": 0.003,
      "grad_norm": 0.9301924220055711,
      "learning_rate": 0.0009,
      "loss": 6.2223,
      "step": 300
    },
    {
      "epoch": 0.00301,
      "grad_norm": 0.9092669241988458,
      "learning_rate": 0.0009029999999999999,
      "loss": 6.2133,
      "step": 301
    },
    {
      "epoch": 0.00302,
      "grad_norm": 1.0280476917286576,
      "learning_rate": 0.000906,
      "loss": 6.2045,
      "step": 302
    },
    {
      "epoch": 0.00303,
      "grad_norm": 0.955686909103224,
      "learning_rate": 0.000909,
      "loss": 6.2159,
      "step": 303
    },
    {
      "epoch": 0.00304,
      "grad_norm": 0.924153033838841,
      "learning_rate": 0.000912,
      "loss": 6.2068,
      "step": 304
    },
    {
      "epoch": 0.00305,
      "grad_norm": 1.0129569357421315,
      "learning_rate": 0.000915,
      "loss": 6.2162,
      "step": 305
    },
    {
      "epoch": 0.00306,
      "grad_norm": 0.908196615580651,
      "learning_rate": 0.000918,
      "loss": 6.1982,
      "step": 306
    },
    {
      "epoch": 0.00307,
      "grad_norm": 0.9412115799039344,
      "learning_rate": 0.000921,
      "loss": 6.1922,
      "step": 307
    },
    {
      "epoch": 0.00308,
      "grad_norm": 1.0133210251014897,
      "learning_rate": 0.000924,
      "loss": 6.1801,
      "step": 308
    },
    {
      "epoch": 0.00309,
      "grad_norm": 0.7885110954474147,
      "learning_rate": 0.000927,
      "loss": 6.1916,
      "step": 309
    },
    {
      "epoch": 0.0031,
      "grad_norm": 0.8502930513548597,
      "learning_rate": 0.00093,
      "loss": 6.1767,
      "step": 310
    },
    {
      "epoch": 0.00311,
      "grad_norm": 0.8830498260903274,
      "learning_rate": 0.000933,
      "loss": 6.1706,
      "step": 311
    },
    {
      "epoch": 0.00312,
      "grad_norm": 0.7731682808518218,
      "learning_rate": 0.000936,
      "loss": 6.1483,
      "step": 312
    },
    {
      "epoch": 0.00313,
      "grad_norm": 0.5034693382195244,
      "learning_rate": 0.0009390000000000001,
      "loss": 6.1541,
      "step": 313
    },
    {
      "epoch": 0.00314,
      "grad_norm": 0.5944206771214186,
      "learning_rate": 0.000942,
      "loss": 6.1615,
      "step": 314
    },
    {
      "epoch": 0.00315,
      "grad_norm": 0.6198949865082991,
      "learning_rate": 0.000945,
      "loss": 6.1382,
      "step": 315
    },
    {
      "epoch": 0.00316,
      "grad_norm": 0.6322301787116348,
      "learning_rate": 0.0009480000000000001,
      "loss": 6.112,
      "step": 316
    },
    {
      "epoch": 0.00317,
      "grad_norm": 0.6034799354522895,
      "learning_rate": 0.000951,
      "loss": 6.1294,
      "step": 317
    },
    {
      "epoch": 0.00318,
      "grad_norm": 0.594812791501401,
      "learning_rate": 0.000954,
      "loss": 6.1243,
      "step": 318
    },
    {
      "epoch": 0.00319,
      "grad_norm": 0.5121880287121718,
      "learning_rate": 0.0009570000000000001,
      "loss": 6.1104,
      "step": 319
    },
    {
      "epoch": 0.0032,
      "grad_norm": 0.4383869814323231,
      "learning_rate": 0.00096,
      "loss": 6.1078,
      "step": 320
    },
    {
      "epoch": 0.00321,
      "grad_norm": 0.46253996577569373,
      "learning_rate": 0.000963,
      "loss": 6.0957,
      "step": 321
    },
    {
      "epoch": 0.00322,
      "grad_norm": 0.5330974668531825,
      "learning_rate": 0.0009660000000000001,
      "loss": 6.0852,
      "step": 322
    },
    {
      "epoch": 0.00323,
      "grad_norm": 0.4269664457046623,
      "learning_rate": 0.000969,
      "loss": 6.0805,
      "step": 323
    },
    {
      "epoch": 0.00324,
      "grad_norm": 0.4120363869720864,
      "learning_rate": 0.0009720000000000001,
      "loss": 6.0833,
      "step": 324
    },
    {
      "epoch": 0.00325,
      "grad_norm": 0.43852524771556284,
      "learning_rate": 0.0009750000000000001,
      "loss": 6.0494,
      "step": 325
    },
    {
      "epoch": 0.00326,
      "grad_norm": 0.5390182954459198,
      "learning_rate": 0.0009780000000000001,
      "loss": 6.0675,
      "step": 326
    },
    {
      "epoch": 0.00327,
      "grad_norm": 0.7655676469366879,
      "learning_rate": 0.000981,
      "loss": 6.0729,
      "step": 327
    },
    {
      "epoch": 0.00328,
      "grad_norm": 1.3095809996152918,
      "learning_rate": 0.000984,
      "loss": 6.0661,
      "step": 328
    },
    {
      "epoch": 0.00329,
      "grad_norm": 0.9398963728555475,
      "learning_rate": 0.000987,
      "loss": 6.0593,
      "step": 329
    },
    {
      "epoch": 0.0033,
      "grad_norm": 0.9687365266328093,
      "learning_rate": 0.00099,
      "loss": 6.0595,
      "step": 330
    },
    {
      "epoch": 0.00331,
      "grad_norm": 1.0572725295338647,
      "learning_rate": 0.0009930000000000002,
      "loss": 6.0709,
      "step": 331
    },
    {
      "epoch": 0.00332,
      "grad_norm": 0.7804253243852415,
      "learning_rate": 0.0009960000000000001,
      "loss": 6.0449,
      "step": 332
    },
    {
      "epoch": 0.00333,
      "grad_norm": 0.9477444786898163,
      "learning_rate": 0.000999,
      "loss": 6.0352,
      "step": 333
    },
    {
      "epoch": 0.00334,
      "grad_norm": 1.119636024931192,
      "learning_rate": 0.001002,
      "loss": 6.0509,
      "step": 334
    },
    {
      "epoch": 0.00335,
      "grad_norm": 1.0398096390896505,
      "learning_rate": 0.001005,
      "loss": 6.0495,
      "step": 335
    },
    {
      "epoch": 0.00336,
      "grad_norm": 0.8946811171098662,
      "learning_rate": 0.001008,
      "loss": 6.0348,
      "step": 336
    },
    {
      "epoch": 0.00337,
      "grad_norm": 1.0240126692017184,
      "learning_rate": 0.0010110000000000002,
      "loss": 6.0215,
      "step": 337
    },
    {
      "epoch": 0.00338,
      "grad_norm": 1.0245564805630587,
      "learning_rate": 0.0010140000000000001,
      "loss": 6.0414,
      "step": 338
    },
    {
      "epoch": 0.00339,
      "grad_norm": 0.9606863659347135,
      "learning_rate": 0.0010170000000000001,
      "loss": 6.0439,
      "step": 339
    },
    {
      "epoch": 0.0034,
      "grad_norm": 0.7549695201896788,
      "learning_rate": 0.00102,
      "loss": 6.0067,
      "step": 340
    },
    {
      "epoch": 0.00341,
      "grad_norm": 0.7981881225838676,
      "learning_rate": 0.001023,
      "loss": 5.9935,
      "step": 341
    },
    {
      "epoch": 0.00342,
      "grad_norm": 0.6476806075689499,
      "learning_rate": 0.001026,
      "loss": 5.9919,
      "step": 342
    },
    {
      "epoch": 0.00343,
      "grad_norm": 0.6180530813345404,
      "learning_rate": 0.0010290000000000002,
      "loss": 5.9943,
      "step": 343
    },
    {
      "epoch": 0.00344,
      "grad_norm": 0.5904235085071222,
      "learning_rate": 0.001032,
      "loss": 5.9824,
      "step": 344
    },
    {
      "epoch": 0.00345,
      "grad_norm": 0.5662246103658737,
      "learning_rate": 0.001035,
      "loss": 5.9736,
      "step": 345
    },
    {
      "epoch": 0.00346,
      "grad_norm": 0.5815647759401746,
      "learning_rate": 0.0010379999999999999,
      "loss": 5.9787,
      "step": 346
    },
    {
      "epoch": 0.00347,
      "grad_norm": 0.5922585456492798,
      "learning_rate": 0.001041,
      "loss": 5.9505,
      "step": 347
    },
    {
      "epoch": 0.00348,
      "grad_norm": 0.546770754957902,
      "learning_rate": 0.001044,
      "loss": 5.9566,
      "step": 348
    },
    {
      "epoch": 0.00349,
      "grad_norm": 0.5259270224752429,
      "learning_rate": 0.001047,
      "loss": 5.9388,
      "step": 349
    },
    {
      "epoch": 0.0035,
      "grad_norm": 0.5639453872626701,
      "learning_rate": 0.00105,
      "loss": 5.96,
      "step": 350
    },
    {
      "epoch": 0.00351,
      "grad_norm": 0.562716119705,
      "learning_rate": 0.001053,
      "loss": 5.9575,
      "step": 351
    },
    {
      "epoch": 0.00352,
      "grad_norm": 0.6276157353012132,
      "learning_rate": 0.0010559999999999999,
      "loss": 5.9357,
      "step": 352
    },
    {
      "epoch": 0.00353,
      "grad_norm": 0.6899105072430818,
      "learning_rate": 0.001059,
      "loss": 5.939,
      "step": 353
    },
    {
      "epoch": 0.00354,
      "grad_norm": 0.7414855002893123,
      "learning_rate": 0.001062,
      "loss": 5.9142,
      "step": 354
    },
    {
      "epoch": 0.00355,
      "grad_norm": 0.8275245559939801,
      "learning_rate": 0.001065,
      "loss": 5.932,
      "step": 355
    },
    {
      "epoch": 0.00356,
      "grad_norm": 0.8334699202442165,
      "learning_rate": 0.001068,
      "loss": 5.925,
      "step": 356
    },
    {
      "epoch": 0.00357,
      "grad_norm": 0.733463396370079,
      "learning_rate": 0.001071,
      "loss": 5.9147,
      "step": 357
    },
    {
      "epoch": 0.00358,
      "grad_norm": 0.5898600387972718,
      "learning_rate": 0.001074,
      "loss": 5.9022,
      "step": 358
    },
    {
      "epoch": 0.00359,
      "grad_norm": 0.53807120274405,
      "learning_rate": 0.001077,
      "loss": 5.8933,
      "step": 359
    },
    {
      "epoch": 0.0036,
      "grad_norm": 0.5868526923580266,
      "learning_rate": 0.00108,
      "loss": 5.8984,
      "step": 360
    },
    {
      "epoch": 0.00361,
      "grad_norm": 0.6561371025439784,
      "learning_rate": 0.001083,
      "loss": 5.8927,
      "step": 361
    },
    {
      "epoch": 0.00362,
      "grad_norm": 1.003392418604553,
      "learning_rate": 0.001086,
      "loss": 5.8918,
      "step": 362
    },
    {
      "epoch": 0.00363,
      "grad_norm": 1.2318504963643775,
      "learning_rate": 0.001089,
      "loss": 5.899,
      "step": 363
    },
    {
      "epoch": 0.00364,
      "grad_norm": 0.7073595215993886,
      "learning_rate": 0.001092,
      "loss": 5.8956,
      "step": 364
    },
    {
      "epoch": 0.00365,
      "grad_norm": 0.6834408409212124,
      "learning_rate": 0.001095,
      "loss": 5.8924,
      "step": 365
    },
    {
      "epoch": 0.00366,
      "grad_norm": 0.808409762735137,
      "learning_rate": 0.001098,
      "loss": 5.8732,
      "step": 366
    },
    {
      "epoch": 0.00367,
      "grad_norm": 1.150363353399194,
      "learning_rate": 0.001101,
      "loss": 5.8871,
      "step": 367
    },
    {
      "epoch": 0.00368,
      "grad_norm": 1.5759048240116487,
      "learning_rate": 0.001104,
      "loss": 5.9113,
      "step": 368
    },
    {
      "epoch": 0.00369,
      "grad_norm": 0.6706600035773841,
      "learning_rate": 0.001107,
      "loss": 5.8826,
      "step": 369
    },
    {
      "epoch": 0.0037,
      "grad_norm": 1.0165372867603926,
      "learning_rate": 0.00111,
      "loss": 5.8773,
      "step": 370
    },
    {
      "epoch": 0.00371,
      "grad_norm": 1.199271508025637,
      "learning_rate": 0.001113,
      "loss": 5.8595,
      "step": 371
    },
    {
      "epoch": 0.00372,
      "grad_norm": 0.8890339964058547,
      "learning_rate": 0.001116,
      "loss": 5.8693,
      "step": 372
    },
    {
      "epoch": 0.00373,
      "grad_norm": 1.0853395614111072,
      "learning_rate": 0.001119,
      "loss": 5.8674,
      "step": 373
    },
    {
      "epoch": 0.00374,
      "grad_norm": 0.9530307319001724,
      "learning_rate": 0.001122,
      "loss": 5.8703,
      "step": 374
    },
    {
      "epoch": 0.00375,
      "grad_norm": 0.934750801302772,
      "learning_rate": 0.0011250000000000001,
      "loss": 5.8719,
      "step": 375
    },
    {
      "epoch": 0.00376,
      "grad_norm": 0.7986123477309184,
      "learning_rate": 0.001128,
      "loss": 5.863,
      "step": 376
    },
    {
      "epoch": 0.00377,
      "grad_norm": 1.034328210427756,
      "learning_rate": 0.001131,
      "loss": 5.8644,
      "step": 377
    },
    {
      "epoch": 0.00378,
      "grad_norm": 1.1394107219722458,
      "learning_rate": 0.001134,
      "loss": 5.8702,
      "step": 378
    },
    {
      "epoch": 0.00379,
      "grad_norm": 0.6778105205682995,
      "learning_rate": 0.001137,
      "loss": 5.8386,
      "step": 379
    },
    {
      "epoch": 0.0038,
      "grad_norm": 0.7735792074169382,
      "learning_rate": 0.00114,
      "loss": 5.8619,
      "step": 380
    },
    {
      "epoch": 0.00381,
      "grad_norm": 0.882590402307869,
      "learning_rate": 0.0011430000000000001,
      "loss": 5.83,
      "step": 381
    },
    {
      "epoch": 0.00382,
      "grad_norm": 0.8860425089624719,
      "learning_rate": 0.001146,
      "loss": 5.8197,
      "step": 382
    },
    {
      "epoch": 0.00383,
      "grad_norm": 1.0396684860073488,
      "learning_rate": 0.001149,
      "loss": 5.836,
      "step": 383
    },
    {
      "epoch": 0.00384,
      "grad_norm": 0.9683520339794217,
      "learning_rate": 0.001152,
      "loss": 5.8386,
      "step": 384
    },
    {
      "epoch": 0.00385,
      "grad_norm": 0.8210443052214895,
      "learning_rate": 0.001155,
      "loss": 5.8215,
      "step": 385
    },
    {
      "epoch": 0.00386,
      "grad_norm": 1.030920073327714,
      "learning_rate": 0.001158,
      "loss": 5.8336,
      "step": 386
    },
    {
      "epoch": 0.00387,
      "grad_norm": 0.8032114385497527,
      "learning_rate": 0.0011610000000000001,
      "loss": 5.8025,
      "step": 387
    },
    {
      "epoch": 0.00388,
      "grad_norm": 0.6803620347459473,
      "learning_rate": 0.001164,
      "loss": 5.7978,
      "step": 388
    },
    {
      "epoch": 0.00389,
      "grad_norm": 0.6752304208768743,
      "learning_rate": 0.001167,
      "loss": 5.8027,
      "step": 389
    },
    {
      "epoch": 0.0039,
      "grad_norm": 0.6054825081153106,
      "learning_rate": 0.00117,
      "loss": 5.7828,
      "step": 390
    },
    {
      "epoch": 0.00391,
      "grad_norm": 0.5156470046541872,
      "learning_rate": 0.001173,
      "loss": 5.7863,
      "step": 391
    },
    {
      "epoch": 0.00392,
      "grad_norm": 0.49609211852516366,
      "learning_rate": 0.001176,
      "loss": 5.7945,
      "step": 392
    },
    {
      "epoch": 0.00393,
      "grad_norm": 0.42817727304572534,
      "learning_rate": 0.0011790000000000001,
      "loss": 5.7664,
      "step": 393
    },
    {
      "epoch": 0.00394,
      "grad_norm": 0.43894767278563757,
      "learning_rate": 0.001182,
      "loss": 5.7539,
      "step": 394
    },
    {
      "epoch": 0.00395,
      "grad_norm": 0.41850660912289844,
      "learning_rate": 0.001185,
      "loss": 5.7366,
      "step": 395
    },
    {
      "epoch": 0.00396,
      "grad_norm": 0.41745423249833347,
      "learning_rate": 0.001188,
      "loss": 5.7516,
      "step": 396
    },
    {
      "epoch": 0.00397,
      "grad_norm": 0.40474070631964676,
      "learning_rate": 0.001191,
      "loss": 5.7433,
      "step": 397
    },
    {
      "epoch": 0.00398,
      "grad_norm": 0.3677722018443306,
      "learning_rate": 0.0011940000000000002,
      "loss": 5.7479,
      "step": 398
    },
    {
      "epoch": 0.00399,
      "grad_norm": 0.3480206547108819,
      "learning_rate": 0.0011970000000000001,
      "loss": 5.7478,
      "step": 399
    },
    {
      "epoch": 0.004,
      "grad_norm": 0.3434828622202681,
      "learning_rate": 0.0012000000000000001,
      "loss": 5.7345,
      "step": 400
    },
{ |
|
"epoch": 0.00401, |
|
"grad_norm": 0.34918136204349326, |
|
"learning_rate": 0.001203, |
|
"loss": 5.7155, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.00402, |
|
"grad_norm": 0.30554980038341767, |
|
"learning_rate": 0.001206, |
|
"loss": 5.7215, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.00403, |
|
"grad_norm": 0.38840665384838735, |
|
"learning_rate": 0.001209, |
|
"loss": 5.7184, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.00404, |
|
"grad_norm": 0.5409513056663879, |
|
"learning_rate": 0.0012120000000000002, |
|
"loss": 5.714, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.00405, |
|
"grad_norm": 0.845020924848713, |
|
"learning_rate": 0.0012150000000000002, |
|
"loss": 5.7181, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.00406, |
|
"grad_norm": 1.1911410915070972, |
|
"learning_rate": 0.0012180000000000001, |
|
"loss": 5.7205, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.00407, |
|
"grad_norm": 0.6577283880630926, |
|
"learning_rate": 0.0012209999999999999, |
|
"loss": 5.6994, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.00408, |
|
"grad_norm": 0.7475745975098248, |
|
"learning_rate": 0.001224, |
|
"loss": 5.7213, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.00409, |
|
"grad_norm": 0.9872391126413178, |
|
"learning_rate": 0.001227, |
|
"loss": 5.7126, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.0041, |
|
"grad_norm": 1.1359034721668335, |
|
"learning_rate": 0.00123, |
|
"loss": 5.7088, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.00411, |
|
"grad_norm": 1.3596329145222696, |
|
"learning_rate": 0.001233, |
|
"loss": 5.7402, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.00412, |
|
"grad_norm": 0.7538358474928969, |
|
"learning_rate": 0.001236, |
|
"loss": 5.7066, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.00413, |
|
"grad_norm": 0.9465320539051596, |
|
"learning_rate": 0.0012389999999999999, |
|
"loss": 5.7197, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.00414, |
|
"grad_norm": 0.9262933655624658, |
|
"learning_rate": 0.001242, |
|
"loss": 5.6978, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.00415, |
|
"grad_norm": 1.1564175286146172, |
|
"learning_rate": 0.001245, |
|
"loss": 5.7105, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.00416, |
|
"grad_norm": 1.1001247072345506, |
|
"learning_rate": 0.001248, |
|
"loss": 5.6929, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.00417, |
|
"grad_norm": 1.0416153435685582, |
|
"learning_rate": 0.001251, |
|
"loss": 5.7199, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.00418, |
|
"grad_norm": 1.0281555694116995, |
|
"learning_rate": 0.001254, |
|
"loss": 5.6999, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.00419, |
|
"grad_norm": 1.1154617103247704, |
|
"learning_rate": 0.0012569999999999999, |
|
"loss": 5.6876, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.0042, |
|
"grad_norm": 0.9999912825556322, |
|
"learning_rate": 0.00126, |
|
"loss": 5.7045, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.00421, |
|
"grad_norm": 0.9729638313238949, |
|
"learning_rate": 0.001263, |
|
"loss": 5.6933, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.00422, |
|
"grad_norm": 1.0297208241186608, |
|
"learning_rate": 0.001266, |
|
"loss": 5.6894, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.00423, |
|
"grad_norm": 0.8381062456476874, |
|
"learning_rate": 0.001269, |
|
"loss": 5.6811, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.00424, |
|
"grad_norm": 0.73825928337582, |
|
"learning_rate": 0.001272, |
|
"loss": 5.6773, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.00425, |
|
"grad_norm": 0.8151693610653118, |
|
"learning_rate": 0.001275, |
|
"loss": 5.6815, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.00426, |
|
"grad_norm": 0.9877723869544494, |
|
"learning_rate": 0.001278, |
|
"loss": 5.6881, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.00427, |
|
"grad_norm": 0.8875063218226354, |
|
"learning_rate": 0.001281, |
|
"loss": 5.6676, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.00428, |
|
"grad_norm": 0.8761493100474917, |
|
"learning_rate": 0.001284, |
|
"loss": 5.6437, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.00429, |
|
"grad_norm": 1.109439755296158, |
|
"learning_rate": 0.001287, |
|
"loss": 5.668, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.0043, |
|
"grad_norm": 0.8726361797071612, |
|
"learning_rate": 0.00129, |
|
"loss": 5.6611, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.00431, |
|
"grad_norm": 0.5601358924900582, |
|
"learning_rate": 0.001293, |
|
"loss": 5.6342, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.00432, |
|
"grad_norm": 0.5940988397687699, |
|
"learning_rate": 0.001296, |
|
"loss": 5.6448, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.00433, |
|
"grad_norm": 0.480538337754196, |
|
"learning_rate": 0.001299, |
|
"loss": 5.6336, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.00434, |
|
"grad_norm": 0.5069699406038057, |
|
"learning_rate": 0.001302, |
|
"loss": 5.6331, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.00435, |
|
"grad_norm": 0.417220569666801, |
|
"learning_rate": 0.001305, |
|
"loss": 5.6227, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.00436, |
|
"grad_norm": 0.46597565539240443, |
|
"learning_rate": 0.001308, |
|
"loss": 5.6039, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.00437, |
|
"grad_norm": 0.37605303006482044, |
|
"learning_rate": 0.001311, |
|
"loss": 5.6161, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.00438, |
|
"grad_norm": 0.3526339213940271, |
|
"learning_rate": 0.001314, |
|
"loss": 5.5977, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.00439, |
|
"grad_norm": 0.3618369277094543, |
|
"learning_rate": 0.001317, |
|
"loss": 5.6246, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.0044, |
|
"grad_norm": 0.3223158135938896, |
|
"learning_rate": 0.00132, |
|
"loss": 5.5938, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.00441, |
|
"grad_norm": 0.3386640445759432, |
|
"learning_rate": 0.001323, |
|
"loss": 5.5905, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.00442, |
|
"grad_norm": 0.3397360216396013, |
|
"learning_rate": 0.0013260000000000001, |
|
"loss": 5.5838, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.00443, |
|
"grad_norm": 0.34532354892574607, |
|
"learning_rate": 0.001329, |
|
"loss": 5.5832, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.00444, |
|
"grad_norm": 0.37928556611065656, |
|
"learning_rate": 0.001332, |
|
"loss": 5.5853, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.00445, |
|
"grad_norm": 0.44947894711961484, |
|
"learning_rate": 0.001335, |
|
"loss": 5.5723, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.00446, |
|
"grad_norm": 0.5239308309237933, |
|
"learning_rate": 0.001338, |
|
"loss": 5.5751, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.00447, |
|
"grad_norm": 0.60889528761374, |
|
"learning_rate": 0.001341, |
|
"loss": 5.5777, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.00448, |
|
"grad_norm": 0.6150556040535831, |
|
"learning_rate": 0.0013440000000000001, |
|
"loss": 5.561, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.00449, |
|
"grad_norm": 0.5444850551876294, |
|
"learning_rate": 0.001347, |
|
"loss": 5.5623, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.0045, |
|
"grad_norm": 0.6163688303714219, |
|
"learning_rate": 0.00135, |
|
"loss": 5.5617, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.00451, |
|
"grad_norm": 0.7972728836658292, |
|
"learning_rate": 0.001353, |
|
"loss": 5.5614, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.00452, |
|
"grad_norm": 0.7711764764543457, |
|
"learning_rate": 0.001356, |
|
"loss": 5.5454, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.00453, |
|
"grad_norm": 0.7702356986189732, |
|
"learning_rate": 0.001359, |
|
"loss": 5.5379, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.00454, |
|
"grad_norm": 1.0838160417982272, |
|
"learning_rate": 0.0013620000000000001, |
|
"loss": 5.5686, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.00455, |
|
"grad_norm": 1.152655585803101, |
|
"learning_rate": 0.0013650000000000001, |
|
"loss": 5.5757, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.00456, |
|
"grad_norm": 1.2776355160689266, |
|
"learning_rate": 0.001368, |
|
"loss": 5.5831, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.00457, |
|
"grad_norm": 1.091292995937963, |
|
"learning_rate": 0.001371, |
|
"loss": 5.5727, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.00458, |
|
"grad_norm": 0.8963405103823251, |
|
"learning_rate": 0.001374, |
|
"loss": 5.5735, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.00459, |
|
"grad_norm": 1.0168648046101516, |
|
"learning_rate": 0.0013770000000000002, |
|
"loss": 5.5669, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.0046, |
|
"grad_norm": 1.296131601782866, |
|
"learning_rate": 0.0013800000000000002, |
|
"loss": 5.5665, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.00461, |
|
"grad_norm": 0.9634420565591739, |
|
"learning_rate": 0.0013830000000000001, |
|
"loss": 5.556, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.00462, |
|
"grad_norm": 0.9383281224355017, |
|
"learning_rate": 0.001386, |
|
"loss": 5.5634, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.00463, |
|
"grad_norm": 1.2569585597421309, |
|
"learning_rate": 0.001389, |
|
"loss": 5.5542, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.00464, |
|
"grad_norm": 0.9874595595654581, |
|
"learning_rate": 0.001392, |
|
"loss": 5.5689, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.00465, |
|
"grad_norm": 1.085595749506429, |
|
"learning_rate": 0.0013950000000000002, |
|
"loss": 5.5385, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.00466, |
|
"grad_norm": 1.0673943770446899, |
|
"learning_rate": 0.0013980000000000002, |
|
"loss": 5.5603, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.00467, |
|
"grad_norm": 0.8139501043376736, |
|
"learning_rate": 0.0014010000000000001, |
|
"loss": 5.5432, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.00468, |
|
"grad_norm": 0.7494382793960519, |
|
"learning_rate": 0.001404, |
|
"loss": 5.5245, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.00469, |
|
"grad_norm": 0.7634992086588068, |
|
"learning_rate": 0.001407, |
|
"loss": 5.5282, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.0047, |
|
"grad_norm": 0.8018093758476836, |
|
"learning_rate": 0.00141, |
|
"loss": 5.5404, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.00471, |
|
"grad_norm": 0.7418690809708749, |
|
"learning_rate": 0.001413, |
|
"loss": 5.5115, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.00472, |
|
"grad_norm": 0.7355325431039438, |
|
"learning_rate": 0.001416, |
|
"loss": 5.5216, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.00473, |
|
"grad_norm": 0.709026539269664, |
|
"learning_rate": 0.001419, |
|
"loss": 5.5305, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.00474, |
|
"grad_norm": 0.5742329758009745, |
|
"learning_rate": 0.0014219999999999999, |
|
"loss": 5.5064, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.00475, |
|
"grad_norm": 0.5859758403725885, |
|
"learning_rate": 0.001425, |
|
"loss": 5.4971, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.00476, |
|
"grad_norm": 0.6365922795308678, |
|
"learning_rate": 0.001428, |
|
"loss": 5.5308, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.00477, |
|
"grad_norm": 0.6539516343537074, |
|
"learning_rate": 0.001431, |
|
"loss": 5.4846, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.00478, |
|
"grad_norm": 0.6446859909585969, |
|
"learning_rate": 0.001434, |
|
"loss": 5.5074, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.00479, |
|
"grad_norm": 0.7791938725908187, |
|
"learning_rate": 0.001437, |
|
"loss": 5.4897, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.0048, |
|
"grad_norm": 0.7608695568360718, |
|
"learning_rate": 0.0014399999999999999, |
|
"loss": 5.4855, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.00481, |
|
"grad_norm": 0.5435552532069989, |
|
"learning_rate": 0.001443, |
|
"loss": 5.4813, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.00482, |
|
"grad_norm": 0.4961185149512517, |
|
"learning_rate": 0.001446, |
|
"loss": 5.4538, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.00483, |
|
"grad_norm": 0.5120902463904886, |
|
"learning_rate": 0.001449, |
|
"loss": 5.4636, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.00484, |
|
"grad_norm": 0.418197369903841, |
|
"learning_rate": 0.001452, |
|
"loss": 5.464, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.00485, |
|
"grad_norm": 0.36311406822078424, |
|
"learning_rate": 0.001455, |
|
"loss": 5.4671, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.00486, |
|
"grad_norm": 0.3913777576995821, |
|
"learning_rate": 0.001458, |
|
"loss": 5.4393, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.00487, |
|
"grad_norm": 0.36874474197662527, |
|
"learning_rate": 0.001461, |
|
"loss": 5.4517, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.00488, |
|
"grad_norm": 0.41593519822402414, |
|
"learning_rate": 0.001464, |
|
"loss": 5.43, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.00489, |
|
"grad_norm": 0.4051383375955623, |
|
"learning_rate": 0.001467, |
|
"loss": 5.4204, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.0049, |
|
"grad_norm": 0.4948319932753325, |
|
"learning_rate": 0.00147, |
|
"loss": 5.434, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.00491, |
|
"grad_norm": 0.6887805434617323, |
|
"learning_rate": 0.001473, |
|
"loss": 5.4357, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.00492, |
|
"grad_norm": 0.912047432270828, |
|
"learning_rate": 0.001476, |
|
"loss": 5.4432, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.00493, |
|
"grad_norm": 0.8950009277905591, |
|
"learning_rate": 0.001479, |
|
"loss": 5.4415, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.00494, |
|
"grad_norm": 0.807683799932231, |
|
"learning_rate": 0.001482, |
|
"loss": 5.4427, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.00495, |
|
"grad_norm": 0.976722169908224, |
|
"learning_rate": 0.001485, |
|
"loss": 5.4601, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.00496, |
|
"grad_norm": 0.7224780529872387, |
|
"learning_rate": 0.001488, |
|
"loss": 5.4314, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.00497, |
|
"grad_norm": 0.7139613919522917, |
|
"learning_rate": 0.001491, |
|
"loss": 5.4172, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.00498, |
|
"grad_norm": 0.7556637282468179, |
|
"learning_rate": 0.001494, |
|
"loss": 5.4443, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.00499, |
|
"grad_norm": 0.8519321120342865, |
|
"learning_rate": 0.001497, |
|
"loss": 5.4223, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.005, |
|
"grad_norm": 0.9868888032280079, |
|
"learning_rate": 0.0015, |
|
"loss": 5.4308, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.00501, |
|
"grad_norm": 1.2028724714017198, |
|
"learning_rate": 0.001503, |
|
"loss": 5.4458, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.00502, |
|
"grad_norm": 0.9548534640519003, |
|
"learning_rate": 0.001506, |
|
"loss": 5.4405, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.00503, |
|
"grad_norm": 0.9423651398338494, |
|
"learning_rate": 0.0015090000000000001, |
|
"loss": 5.4484, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.00504, |
|
"grad_norm": 0.9672711526274779, |
|
"learning_rate": 0.001512, |
|
"loss": 5.4328, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.00505, |
|
"grad_norm": 0.9474889600476256, |
|
"learning_rate": 0.001515, |
|
"loss": 5.4283, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.00506, |
|
"grad_norm": 1.3039451405080307, |
|
"learning_rate": 0.001518, |
|
"loss": 5.4432, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.00507, |
|
"grad_norm": 1.2269782118632737, |
|
"learning_rate": 0.001521, |
|
"loss": 5.4452, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.00508, |
|
"grad_norm": 0.8597426683067237, |
|
"learning_rate": 0.001524, |
|
"loss": 5.4268, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.00509, |
|
"grad_norm": 0.9388657090373522, |
|
"learning_rate": 0.0015270000000000001, |
|
"loss": 5.4178, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.0051, |
|
"grad_norm": 0.9594427813189665, |
|
"learning_rate": 0.0015300000000000001, |
|
"loss": 5.4356, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.00511, |
|
"grad_norm": 1.04563577032056, |
|
"learning_rate": 0.001533, |
|
"loss": 5.4212, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.00512, |
|
"grad_norm": 0.733703407645156, |
|
"learning_rate": 0.001536, |
|
"loss": 5.4003, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.00513, |
|
"grad_norm": 0.8415210942026606, |
|
"learning_rate": 0.001539, |
|
"loss": 5.423, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.00514, |
|
"grad_norm": 0.8791751992621939, |
|
"learning_rate": 0.001542, |
|
"loss": 5.4064, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.00515, |
|
"grad_norm": 0.8161499995578689, |
|
"learning_rate": 0.0015450000000000001, |
|
"loss": 5.4094, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.00516, |
|
"grad_norm": 0.878383695319614, |
|
"learning_rate": 0.0015480000000000001, |
|
"loss": 5.4087, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.00517, |
|
"grad_norm": 0.9768725869756134, |
|
"learning_rate": 0.001551, |
|
"loss": 5.4055, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.00518, |
|
"grad_norm": 0.8865017723772849, |
|
"learning_rate": 0.001554, |
|
"loss": 5.3907, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.00519, |
|
"grad_norm": 0.8308797688973832, |
|
"learning_rate": 0.001557, |
|
"loss": 5.3905, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.0052, |
|
"grad_norm": 0.6978413162257922, |
|
"learning_rate": 0.0015600000000000002, |
|
"loss": 5.3938, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.00521, |
|
"grad_norm": 0.6562689530690187, |
|
"learning_rate": 0.0015630000000000002, |
|
"loss": 5.3676, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.00522, |
|
"grad_norm": 0.5577523148431155, |
|
"learning_rate": 0.0015660000000000001, |
|
"loss": 5.3673, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.00523, |
|
"grad_norm": 0.5298728018270966, |
|
"learning_rate": 0.001569, |
|
"loss": 5.3784, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.00524, |
|
"grad_norm": 0.44216623389663734, |
|
"learning_rate": 0.001572, |
|
"loss": 5.3811, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.00525, |
|
"grad_norm": 0.3702182111689363, |
|
"learning_rate": 0.001575, |
|
"loss": 5.369, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.00526, |
|
"grad_norm": 0.40289552198632295, |
|
"learning_rate": 0.0015780000000000002, |
|
"loss": 5.3444, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.00527, |
|
"grad_norm": 0.38284579814689895, |
|
"learning_rate": 0.0015810000000000002, |
|
"loss": 5.3551, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.00528, |
|
"grad_norm": 0.33950473031510653, |
|
"learning_rate": 0.0015840000000000001, |
|
"loss": 5.3339, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.00529, |
|
"grad_norm": 0.3777758983585419, |
|
"learning_rate": 0.001587, |
|
"loss": 5.343, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.0053, |
|
"grad_norm": 0.4257451161382566, |
|
"learning_rate": 0.00159, |
|
"loss": 5.3483, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.00531, |
|
"grad_norm": 0.5472045550610978, |
|
"learning_rate": 0.001593, |
|
"loss": 5.3387, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.00532, |
|
"grad_norm": 0.696856419507981, |
|
"learning_rate": 0.0015960000000000002, |
|
"loss": 5.3455, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.00533, |
|
"grad_norm": 0.7975941430607876, |
|
"learning_rate": 0.0015990000000000002, |
|
"loss": 5.3447, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.00534, |
|
"grad_norm": 0.6576622452461942, |
|
"learning_rate": 0.0016020000000000001, |
|
"loss": 5.3339, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.00535, |
|
"grad_norm": 0.5879483820814444, |
|
"learning_rate": 0.001605, |
|
"loss": 5.3347, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.00536, |
|
"grad_norm": 0.8212636469840171, |
|
"learning_rate": 0.001608, |
|
"loss": 5.3454, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.00537, |
|
"grad_norm": 0.7740232031924225, |
|
"learning_rate": 0.0016110000000000002, |
|
"loss": 5.3418, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.00538, |
|
"grad_norm": 0.683098129060214, |
|
"learning_rate": 0.0016140000000000002, |
|
"loss": 5.3143, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.00539, |
|
"grad_norm": 0.9170551975741953, |
|
"learning_rate": 0.0016170000000000002, |
|
"loss": 5.3245, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.0054, |
|
"grad_norm": 0.765080131514484, |
|
"learning_rate": 0.0016200000000000001, |
|
"loss": 5.3262, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.00541, |
|
"grad_norm": 0.658457026305436, |
|
"learning_rate": 0.001623, |
|
"loss": 5.3137, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.00542, |
|
"grad_norm": 0.601942869875084, |
|
"learning_rate": 0.001626, |
|
"loss": 5.3315, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.00543, |
|
"grad_norm": 0.6751097730454854, |
|
"learning_rate": 0.0016290000000000002, |
|
"loss": 5.2998, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.00544, |
|
"grad_norm": 0.6943293389301006, |
|
"learning_rate": 0.0016320000000000002, |
|
"loss": 5.3191, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.00545, |
|
"grad_norm": 0.757591621302123, |
|
"learning_rate": 0.0016350000000000002, |
|
"loss": 5.3293, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.00546, |
|
"grad_norm": 0.825685624372282, |
|
"learning_rate": 0.0016380000000000001, |
|
"loss": 5.3061, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.00547, |
|
"grad_norm": 0.8411824339962438, |
|
"learning_rate": 0.001641, |
|
"loss": 5.3051, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.00548, |
|
"grad_norm": 0.8225125547688507, |
|
"learning_rate": 0.001644, |
|
"loss": 5.303, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.00549, |
|
"grad_norm": 0.7692066087665821, |
|
"learning_rate": 0.0016470000000000002, |
|
"loss": 5.3016, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.0055, |
|
"grad_norm": 0.7541680263658305, |
|
"learning_rate": 0.0016500000000000002, |
|
"loss": 5.3099, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.00551, |
|
"grad_norm": 0.8632990593818363, |
|
"learning_rate": 0.0016530000000000002, |
|
"loss": 5.3123, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.00552, |
|
"grad_norm": 0.9083583396116485, |
|
"learning_rate": 0.0016560000000000001, |
|
"loss": 5.3063, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.00553, |
|
"grad_norm": 0.9125681148017897, |
|
"learning_rate": 0.001659, |
|
"loss": 5.304, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.00554, |
|
"grad_norm": 0.9360850923631763, |
|
"learning_rate": 0.0016620000000000003, |
|
"loss": 5.3021, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.00555, |
|
"grad_norm": 0.9519012966124738, |
|
"learning_rate": 0.0016650000000000002, |
|
"loss": 5.3154, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.00556, |
|
"grad_norm": 0.9297335237671114, |
|
"learning_rate": 0.0016680000000000002, |
|
"loss": 5.2962, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.00557, |
|
"grad_norm": 1.1187109821510643, |
|
"learning_rate": 0.0016710000000000002, |
|
"loss": 5.2935, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.00558, |
|
"grad_norm": 0.9868599839032521, |
|
"learning_rate": 0.0016740000000000001, |
|
"loss": 5.3219, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.00559, |
|
"grad_norm": 0.8786610310513777, |
|
"learning_rate": 0.001677, |
|
"loss": 5.2938, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.0056, |
|
"grad_norm": 0.9118163270219521, |
|
"learning_rate": 0.0016800000000000003, |
|
"loss": 5.2767, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.00561, |
|
"grad_norm": 0.8649386991384733, |
|
"learning_rate": 0.0016830000000000003, |
|
"loss": 5.3004, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.00562, |
|
"grad_norm": 0.5982058914442256, |
|
"learning_rate": 0.0016860000000000002, |
|
"loss": 5.279, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.00563, |
|
"grad_norm": 0.5830709316445766, |
|
"learning_rate": 0.001689, |
|
"loss": 5.2821, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 0.00564, |
|
"grad_norm": 0.5564672250442253, |
|
"learning_rate": 0.001692, |
|
"loss": 5.2582, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.00565, |
|
"grad_norm": 0.5903966881939692, |
|
"learning_rate": 0.001695, |
|
"loss": 5.2619, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.00566, |
|
"grad_norm": 0.5777761648359326, |
|
"learning_rate": 0.0016979999999999999, |
|
"loss": 5.2552, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.00567, |
|
"grad_norm": 0.6069030603134064, |
|
"learning_rate": 0.0017009999999999998, |
|
"loss": 5.2491, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.00568, |
|
"grad_norm": 0.5693132087018719, |
|
"learning_rate": 0.0017039999999999998, |
|
"loss": 5.2604, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.00569, |
|
"grad_norm": 0.47662406778838745, |
|
"learning_rate": 0.001707, |
|
"loss": 5.2359, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 0.0057, |
|
"grad_norm": 0.49231021705037487, |
|
"learning_rate": 0.00171, |
|
"loss": 5.253, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.00571, |
|
"grad_norm": 0.4167352661720621, |
|
"learning_rate": 0.001713, |
|
"loss": 5.2491, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 0.00572, |
|
"grad_norm": 0.39011854138215074, |
|
"learning_rate": 0.0017159999999999999, |
|
"loss": 5.2258, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.00573, |
|
"grad_norm": 0.45971389455263184, |
|
"learning_rate": 0.0017189999999999998, |
|
"loss": 5.2313, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 0.00574, |
|
"grad_norm": 0.4643694238461635, |
|
"learning_rate": 0.001722, |
|
"loss": 5.2399, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.00575, |
|
"grad_norm": 0.4439840100405838, |
|
"learning_rate": 0.001725, |
|
"loss": 5.2168, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.00576, |
|
"grad_norm": 0.4794795740566699, |
|
"learning_rate": 0.001728, |
|
"loss": 5.2109, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.00577, |
|
"grad_norm": 0.5445912068882687, |
|
"learning_rate": 0.001731, |
|
"loss": 5.2354, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 0.00578, |
|
"grad_norm": 0.5881619532039025, |
|
"learning_rate": 0.0017339999999999999, |
|
"loss": 5.2156, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.00579, |
|
"grad_norm": 0.6688964155004951, |
|
"learning_rate": 0.0017369999999999998, |
|
"loss": 5.2064, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 0.0058, |
|
"grad_norm": 0.6446644513052245, |
|
"learning_rate": 0.00174, |
|
"loss": 5.2223, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.00581, |
|
"grad_norm": 0.7385299343409043, |
|
"learning_rate": 0.001743, |
|
"loss": 5.2222, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.00582, |
|
"grad_norm": 1.0486135149054512, |
|
"learning_rate": 0.001746, |
|
"loss": 5.2114, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.00583, |
|
"grad_norm": 0.9184260585056472, |
|
"learning_rate": 0.001749, |
|
"loss": 5.2374, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 0.00584, |
|
"grad_norm": 0.7724931135788974, |
|
"learning_rate": 0.0017519999999999999, |
|
"loss": 5.2235, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.00585, |
|
"grad_norm": 0.903942587317279, |
|
"learning_rate": 0.0017549999999999998, |
|
"loss": 5.2218, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.00586, |
|
"grad_norm": 0.8496888678331875, |
|
"learning_rate": 0.001758, |
|
"loss": 5.2272, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.00587, |
|
"grad_norm": 0.8580070219006531, |
|
"learning_rate": 0.001761, |
|
"loss": 5.2094, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 0.00588, |
|
"grad_norm": 1.0100984711915582, |
|
"learning_rate": 0.001764, |
|
"loss": 5.233, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.00589, |
|
"grad_norm": 0.9311738464832717, |
|
"learning_rate": 0.001767, |
|
"loss": 5.2289, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 0.0059, |
|
"grad_norm": 0.9363758859588256, |
|
"learning_rate": 0.0017699999999999999, |
|
"loss": 5.2155, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.00591, |
|
"grad_norm": 0.9695025163620248, |
|
"learning_rate": 0.001773, |
|
"loss": 5.2165, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 0.00592, |
|
"grad_norm": 0.9805216711841677, |
|
"learning_rate": 0.001776, |
|
"loss": 5.2417, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.00593, |
|
"grad_norm": 0.9827241162755458, |
|
"learning_rate": 0.001779, |
|
"loss": 5.2392, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 0.00594, |
|
"grad_norm": 1.1445433178010636, |
|
"learning_rate": 0.001782, |
|
"loss": 5.2522, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.00595, |
|
"grad_norm": 0.950699335886284, |
|
"learning_rate": 0.001785, |
|
"loss": 5.2351, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.00596, |
|
"grad_norm": 0.8373647963925929, |
|
"learning_rate": 0.0017879999999999999, |
|
"loss": 5.2269, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.00597, |
|
"grad_norm": 0.9799263768822638, |
|
"learning_rate": 0.001791, |
|
"loss": 5.2118, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 0.00598, |
|
"grad_norm": 0.9413379648187816, |
|
"learning_rate": 0.001794, |
|
"loss": 5.2215, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.00599, |
|
"grad_norm": 0.8392983563516706, |
|
"learning_rate": 0.001797, |
|
"loss": 5.2191, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 0.006, |
|
"grad_norm": 0.8414286288934597, |
|
"learning_rate": 0.0018, |
|
"loss": 5.2134, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.00601, |
|
"grad_norm": 0.8993201270126511, |
|
"learning_rate": 0.001803, |
|
"loss": 5.2089, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 0.00602, |
|
"grad_norm": 0.8678008919316418, |
|
"learning_rate": 0.0018059999999999999, |
|
"loss": 5.1957, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.00603, |
|
"grad_norm": 0.6967382011990112, |
|
"learning_rate": 0.001809, |
|
"loss": 5.2122, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.00604, |
|
"grad_norm": 0.5312401482691863, |
|
"learning_rate": 0.001812, |
|
"loss": 5.1788, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.00605, |
|
"grad_norm": 0.5795052729398412, |
|
"learning_rate": 0.001815, |
|
"loss": 5.1915, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.00606, |
|
"grad_norm": 0.6345039250131549, |
|
"learning_rate": 0.001818, |
|
"loss": 5.1894, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.00607, |
|
"grad_norm": 0.6716049737606092, |
|
"learning_rate": 0.001821, |
|
"loss": 5.1916, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 0.00608, |
|
"grad_norm": 0.631863512626499, |
|
"learning_rate": 0.001824, |
|
"loss": 5.1762, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.00609, |
|
"grad_norm": 0.5345713642167306, |
|
"learning_rate": 0.001827, |
|
"loss": 5.177, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.0061, |
|
"grad_norm": 0.455626202663485, |
|
"learning_rate": 0.00183, |
|
"loss": 5.1434, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.00611, |
|
"grad_norm": 0.46197362206927406, |
|
"learning_rate": 0.001833, |
|
"loss": 5.1489, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 0.00612, |
|
"grad_norm": 0.3943653846553885, |
|
"learning_rate": 0.001836, |
|
"loss": 5.1488, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.00613, |
|
"grad_norm": 0.4053887102586243, |
|
"learning_rate": 0.001839, |
|
"loss": 5.1409, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 0.00614, |
|
"grad_norm": 0.44363743610311057, |
|
"learning_rate": 0.001842, |
|
"loss": 5.1611, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.00615, |
|
"grad_norm": 0.40025757691173514, |
|
"learning_rate": 0.001845, |
|
"loss": 5.145, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.00616, |
|
"grad_norm": 0.4067321452673224, |
|
"learning_rate": 0.001848, |
|
"loss": 5.1299, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.00617, |
|
"grad_norm": 0.42403873536126996, |
|
"learning_rate": 0.001851, |
|
"loss": 5.1397, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 0.00618, |
|
"grad_norm": 0.44192084147381183, |
|
"learning_rate": 0.001854, |
|
"loss": 5.1265, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.00619, |
|
"grad_norm": 0.5104655865853382, |
|
"learning_rate": 0.001857, |
|
"loss": 5.1299, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 0.0062, |
|
"grad_norm": 0.6226145292933671, |
|
"learning_rate": 0.00186, |
|
"loss": 5.118, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.00621, |
|
"grad_norm": 0.745237667035521, |
|
"learning_rate": 0.001863, |
|
"loss": 5.1237, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 0.00622, |
|
"grad_norm": 0.7986865810762169, |
|
"learning_rate": 0.001866, |
|
"loss": 5.1267, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.00623, |
|
"grad_norm": 0.7723208155309241, |
|
"learning_rate": 0.001869, |
|
"loss": 5.1084, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 0.00624, |
|
"grad_norm": 0.8171878766946273, |
|
"learning_rate": 0.001872, |
|
"loss": 5.1246, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.00625, |
|
"grad_norm": 0.7607424856276187, |
|
"learning_rate": 0.001875, |
|
"loss": 5.1359, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.00626, |
|
"grad_norm": 0.7005729458198662, |
|
"learning_rate": 0.0018780000000000001, |
|
"loss": 5.1104, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.00627, |
|
"grad_norm": 0.6735260479679158, |
|
"learning_rate": 0.001881, |
|
"loss": 5.1015, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 0.00628, |
|
"grad_norm": 0.780486998163919, |
|
"learning_rate": 0.001884, |
|
"loss": 5.1349, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.00629, |
|
"grad_norm": 0.8746830311438225, |
|
"learning_rate": 0.001887, |
|
"loss": 5.1097, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 0.0063, |
|
"grad_norm": 0.9536711950620466, |
|
"learning_rate": 0.00189, |
|
"loss": 5.1137, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.00631, |
|
"grad_norm": 0.8628106812141149, |
|
"learning_rate": 0.0018930000000000002, |
|
"loss": 5.1035, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 0.00632, |
|
"grad_norm": 0.9320890333045916, |
|
"learning_rate": 0.0018960000000000001, |
|
"loss": 5.1281, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.00633, |
|
"grad_norm": 1.0593968691082751, |
|
"learning_rate": 0.001899, |
|
"loss": 5.1321, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 0.00634, |
|
"grad_norm": 0.943443350801409, |
|
"learning_rate": 0.001902, |
|
"loss": 5.1182, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.00635, |
|
"grad_norm": 0.9628484504873114, |
|
"learning_rate": 0.001905, |
|
"loss": 5.1089, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.00636, |
|
"grad_norm": 1.1043383962750646, |
|
"learning_rate": 0.001908, |
|
"loss": 5.1289, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.00637, |
|
"grad_norm": 0.8987493866500654, |
|
"learning_rate": 0.0019110000000000002, |
|
"loss": 5.1351, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 0.00638, |
|
"grad_norm": 0.9251804377428581, |
|
"learning_rate": 0.0019140000000000001, |
|
"loss": 5.1288, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.00639, |
|
"grad_norm": 0.854418425044198, |
|
"learning_rate": 0.001917, |
|
"loss": 5.0998, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 0.0064, |
|
"grad_norm": 0.9324816679284724, |
|
"learning_rate": 0.00192, |
|
"loss": 5.1038, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.00641, |
|
"grad_norm": 0.9892889234371413, |
|
"learning_rate": 0.001923, |
|
"loss": 5.1163, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 0.00642, |
|
"grad_norm": 1.0346602459121752, |
|
"learning_rate": 0.001926, |
|
"loss": 5.106, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.00643, |
|
"grad_norm": 0.8661994645957561, |
|
"learning_rate": 0.0019290000000000002, |
|
"loss": 5.117, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 0.00644, |
|
"grad_norm": 0.8724056100423225, |
|
"learning_rate": 0.0019320000000000001, |
|
"loss": 5.0889, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.00645, |
|
"grad_norm": 0.8584186184200229, |
|
"learning_rate": 0.001935, |
|
"loss": 5.1004, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.00646, |
|
"grad_norm": 0.7360558672224548, |
|
"learning_rate": 0.001938, |
|
"loss": 5.0955, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.00647, |
|
"grad_norm": 0.7977702647925389, |
|
"learning_rate": 0.001941, |
|
"loss": 5.1058, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 0.00648, |
|
"grad_norm": 0.7872116543506851, |
|
"learning_rate": 0.0019440000000000002, |
|
"loss": 5.0908, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.00649, |
|
"grad_norm": 0.7104658813349117, |
|
"learning_rate": 0.0019470000000000002, |
|
"loss": 5.0718, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 0.0065, |
|
"grad_norm": 0.7453763255239747, |
|
"learning_rate": 0.0019500000000000001, |
|
"loss": 5.0953, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.00651, |
|
"grad_norm": 0.7781624388594444, |
|
"learning_rate": 0.001953, |
|
"loss": 5.0758, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 0.00652, |
|
"grad_norm": 0.7616046275009601, |
|
"learning_rate": 0.0019560000000000003, |
|
"loss": 5.0661, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.00653, |
|
"grad_norm": 0.5945469625366651, |
|
"learning_rate": 0.0019590000000000002, |
|
"loss": 5.0539, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 0.00654, |
|
"grad_norm": 0.6024408595794577, |
|
"learning_rate": 0.001962, |
|
"loss": 5.0374, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.00655, |
|
"grad_norm": 0.5905307565923603, |
|
"learning_rate": 0.001965, |
|
"loss": 5.048, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.00656, |
|
"grad_norm": 0.5236322372626927, |
|
"learning_rate": 0.001968, |
|
"loss": 5.04, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.00657, |
|
"grad_norm": 0.5283416618835216, |
|
"learning_rate": 0.001971, |
|
"loss": 5.0223, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 0.00658, |
|
"grad_norm": 0.5563146586062104, |
|
"learning_rate": 0.001974, |
|
"loss": 5.0415, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.00659, |
|
"grad_norm": 0.6297873363395704, |
|
"learning_rate": 0.001977, |
|
"loss": 5.0241, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 0.0066, |
|
"grad_norm": 0.5780538180580159, |
|
"learning_rate": 0.00198, |
|
"loss": 5.0197, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.00661, |
|
"grad_norm": 0.5505266007864265, |
|
"learning_rate": 0.001983, |
|
"loss": 4.9938, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 0.00662, |
|
"grad_norm": 0.6077995116545319, |
|
"learning_rate": 0.0019860000000000004, |
|
"loss": 4.9946, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 0.00663, |
|
"grad_norm": 0.693544741809416, |
|
"learning_rate": 0.0019890000000000003, |
|
"loss": 5.01, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 0.00664, |
|
"grad_norm": 0.801492715102265, |
|
"learning_rate": 0.0019920000000000003, |
|
"loss": 5.0017, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.00665, |
|
"grad_norm": 0.8803866066519176, |
|
"learning_rate": 0.0019950000000000002, |
|
"loss": 4.9845, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.00666, |
|
"grad_norm": 0.9133314823416234, |
|
"learning_rate": 0.001998, |
|
"loss": 5.0048, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.00667, |
|
"grad_norm": 0.9006055773427947, |
|
"learning_rate": 0.002001, |
|
"loss": 4.9967, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 0.00668, |
|
"grad_norm": 0.8268536663020751, |
|
"learning_rate": 0.002004, |
|
"loss": 5.0164, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.00669, |
|
"grad_norm": 1.1034218430158187, |
|
"learning_rate": 0.002007, |
|
"loss": 5.0358, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 0.0067, |
|
"grad_norm": 1.0710112933622913, |
|
"learning_rate": 0.00201, |
|
"loss": 5.0286, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.00671, |
|
"grad_norm": 1.0384170566197124, |
|
"learning_rate": 0.002013, |
|
"loss": 5.0085, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 0.00672, |
|
"grad_norm": 0.9247071512184438, |
|
"learning_rate": 0.002016, |
|
"loss": 4.9973, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.00673, |
|
"grad_norm": 0.9905174678816935, |
|
"learning_rate": 0.002019, |
|
"loss": 5.0151, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 0.00674, |
|
"grad_norm": 0.8930692065183647, |
|
"learning_rate": 0.0020220000000000004, |
|
"loss": 5.0014, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 0.00675, |
|
"grad_norm": 1.0067908289229996, |
|
"learning_rate": 0.0020250000000000003, |
|
"loss": 5.0182, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.00676, |
|
"grad_norm": 0.972209366764115, |
|
"learning_rate": 0.0020280000000000003, |
|
"loss": 5.0082, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.00677, |
|
"grad_norm": 1.0440090424594235, |
|
"learning_rate": 0.0020310000000000003, |
|
"loss": 5.0109, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 0.00678, |
|
"grad_norm": 1.1292207401563255, |
|
"learning_rate": 0.0020340000000000002, |
|
"loss": 5.0278, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.00679, |
|
"grad_norm": 0.9872089241398577, |
|
"learning_rate": 0.002037, |
|
"loss": 5.0054, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 0.0068, |
|
"grad_norm": 0.8748535789650627, |
|
"learning_rate": 0.00204, |
|
"loss": 5.0004, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.00681, |
|
"grad_norm": 1.0039141852392888, |
|
"learning_rate": 0.002043, |
|
"loss": 5.0018, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 0.00682, |
|
"grad_norm": 1.0414085118011525, |
|
"learning_rate": 0.002046, |
|
"loss": 5.0026, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 0.00683, |
|
"grad_norm": 0.8083819733286619, |
|
"learning_rate": 0.002049, |
|
"loss": 4.9746, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 0.00684, |
|
"grad_norm": 0.618682103661302, |
|
"learning_rate": 0.002052, |
|
"loss": 4.9803, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.00685, |
|
"grad_norm": 0.5775508276061283, |
|
"learning_rate": 0.0020550000000000004, |
|
"loss": 4.9792, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.00686, |
|
"grad_norm": 0.5359292821422027, |
|
"learning_rate": 0.0020580000000000004, |
|
"loss": 4.9508, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.00687, |
|
"grad_norm": 0.5439874489161504, |
|
"learning_rate": 0.0020610000000000003, |
|
"loss": 4.9456, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 0.00688, |
|
"grad_norm": 0.6521058673701751, |
|
"learning_rate": 0.002064, |
|
"loss": 4.9472, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.00689, |
|
"grad_norm": 0.7201992210148584, |
|
"learning_rate": 0.002067, |
|
"loss": 4.9514, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 0.0069, |
|
"grad_norm": 0.6329359289093167, |
|
"learning_rate": 0.00207, |
|
"loss": 4.9393, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.00691, |
|
"grad_norm": 0.5452341441548267, |
|
"learning_rate": 0.0020729999999999998, |
|
"loss": 4.9065, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 0.00692, |
|
"grad_norm": 0.524421270821715, |
|
"learning_rate": 0.0020759999999999997, |
|
"loss": 4.9264, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 0.00693, |
|
"grad_norm": 0.5237198583423548, |
|
"learning_rate": 0.0020789999999999997, |
|
"loss": 4.9275, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 0.00694, |
|
"grad_norm": 0.5943284166970615, |
|
"learning_rate": 0.002082, |
|
"loss": 4.9065, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 0.00695, |
|
"grad_norm": 0.6331066584123409, |
|
"learning_rate": 0.002085, |
|
"loss": 4.9096, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.00696, |
|
"grad_norm": 0.6552141599381052, |
|
"learning_rate": 0.002088, |
|
"loss": 4.9103, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.00697, |
|
"grad_norm": 0.7459258980378775, |
|
"learning_rate": 0.002091, |
|
"loss": 4.9188, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 0.00698, |
|
"grad_norm": 0.902413176094811, |
|
"learning_rate": 0.002094, |
|
"loss": 4.9109, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 0.00699, |
|
"grad_norm": 0.9488326760888935, |
|
"learning_rate": 0.002097, |
|
"loss": 4.9116, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 0.007, |
|
"grad_norm": 0.9189763967394823, |
|
"learning_rate": 0.0021, |
|
"loss": 4.929, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.00701, |
|
"grad_norm": 0.8924436046521577, |
|
"learning_rate": 0.002103, |
|
"loss": 4.9083, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 0.00702, |
|
"grad_norm": 0.8863614629240012, |
|
"learning_rate": 0.002106, |
|
"loss": 4.9092, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 0.00703, |
|
"grad_norm": 0.7940709541538681, |
|
"learning_rate": 0.0021089999999999998, |
|
"loss": 4.891, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 0.00704, |
|
"grad_norm": 0.7939787736751149, |
|
"learning_rate": 0.0021119999999999997, |
|
"loss": 4.8964, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.00705, |
|
"grad_norm": 1.0444656030359551, |
|
"learning_rate": 0.002115, |
|
"loss": 4.9151, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.00706, |
|
"grad_norm": 1.1431311909042268, |
|
"learning_rate": 0.002118, |
|
"loss": 4.9234, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 0.00707, |
|
"grad_norm": 0.8384635334186645, |
|
"learning_rate": 0.002121, |
|
"loss": 4.8902, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 0.00708, |
|
"grad_norm": 0.9594405079672866, |
|
"learning_rate": 0.002124, |
|
"loss": 4.8745, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.00709, |
|
"grad_norm": 0.8900382869322284, |
|
"learning_rate": 0.002127, |
|
"loss": 4.9161, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 0.0071, |
|
"grad_norm": 0.8868570415283396, |
|
"learning_rate": 0.00213, |
|
"loss": 4.9065, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.00711, |
|
"grad_norm": 0.8610490944817158, |
|
"learning_rate": 0.002133, |
|
"loss": 4.8804, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 0.00712, |
|
"grad_norm": 0.8727164938852855, |
|
"learning_rate": 0.002136, |
|
"loss": 4.9046, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.00713, |
|
"grad_norm": 0.8857525308493206, |
|
"learning_rate": 0.002139, |
|
"loss": 4.9135, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 0.00714, |
|
"grad_norm": 0.9495661806955594, |
|
"learning_rate": 0.002142, |
|
"loss": 4.918, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.00715, |
|
"grad_norm": 1.0263024097609161, |
|
"learning_rate": 0.0021449999999999998, |
|
"loss": 4.8857, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.00716, |
|
"grad_norm": 0.8876358680026493, |
|
"learning_rate": 0.002148, |
|
"loss": 4.8749, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.00717, |
|
"grad_norm": 0.8225498605776377, |
|
"learning_rate": 0.002151, |
|
"loss": 4.8925, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 0.00718, |
|
"grad_norm": 0.628552485265691, |
|
"learning_rate": 0.002154, |
|
"loss": 4.8659, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 0.00719, |
|
"grad_norm": 0.6584104654465238, |
|
"learning_rate": 0.002157, |
|
"loss": 4.8747, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 0.0072, |
|
"grad_norm": 0.6698592474865601, |
|
"learning_rate": 0.00216, |
|
"loss": 4.8635, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.00721, |
|
"grad_norm": 0.6673590176314685, |
|
"learning_rate": 0.002163, |
|
"loss": 4.8639, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 0.00722, |
|
"grad_norm": 0.6674098225397388, |
|
"learning_rate": 0.002166, |
|
"loss": 4.8386, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 0.00723, |
|
"grad_norm": 0.6090726175552883, |
|
"learning_rate": 0.002169, |
|
"loss": 4.8464, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 0.00724, |
|
"grad_norm": 0.6325507361418539, |
|
"learning_rate": 0.002172, |
|
"loss": 4.8403, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 0.00725, |
|
"grad_norm": 0.6927587431932604, |
|
"learning_rate": 0.002175, |
|
"loss": 4.8341, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.00726, |
|
"grad_norm": 0.7422551683158218, |
|
"learning_rate": 0.002178, |
|
"loss": 4.8448, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 0.00727, |
|
"grad_norm": 0.7946686392459241, |
|
"learning_rate": 0.0021809999999999998, |
|
"loss": 4.8264, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 0.00728, |
|
"grad_norm": 0.651194780867581, |
|
"learning_rate": 0.002184, |
|
"loss": 4.8373, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.00729, |
|
"grad_norm": 0.5507866158426874, |
|
"learning_rate": 0.002187, |
|
"loss": 4.8279, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 0.0073, |
|
"grad_norm": 0.5770531279665235, |
|
"learning_rate": 0.00219, |
|
"loss": 4.8256, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.00731, |
|
"grad_norm": 0.6604772562967653, |
|
"learning_rate": 0.002193, |
|
"loss": 4.8198, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 0.00732, |
|
"grad_norm": 0.7902754963422924, |
|
"learning_rate": 0.002196, |
|
"loss": 4.832, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.00733, |
|
"grad_norm": 0.8530754775219535, |
|
"learning_rate": 0.002199, |
|
"loss": 4.814, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 0.00734, |
|
"grad_norm": 0.808835939559151, |
|
"learning_rate": 0.002202, |
|
"loss": 4.8365, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 0.00735, |
|
"grad_norm": 0.7793455630355729, |
|
"learning_rate": 0.002205, |
|
"loss": 4.8484, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.00736, |
|
"grad_norm": 0.8899384636665534, |
|
"learning_rate": 0.002208, |
|
"loss": 4.8322, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.00737, |
|
"grad_norm": 1.0166382867407526, |
|
"learning_rate": 0.002211, |
|
"loss": 4.8145, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 0.00738, |
|
"grad_norm": 0.9857126416807526, |
|
"learning_rate": 0.002214, |
|
"loss": 4.8213, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 0.00739, |
|
"grad_norm": 0.8982016923721464, |
|
"learning_rate": 0.0022170000000000002, |
|
"loss": 4.809, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 0.0074, |
|
"grad_norm": 0.8517808259905928, |
|
"learning_rate": 0.00222, |
|
"loss": 4.8138, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.00741, |
|
"grad_norm": 0.6571636698582773, |
|
"learning_rate": 0.002223, |
|
"loss": 4.778, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 0.00742, |
|
"grad_norm": 0.5983530041008951, |
|
"learning_rate": 0.002226, |
|
"loss": 4.8043, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.00743, |
|
"grad_norm": 0.613767022264535, |
|
"learning_rate": 0.002229, |
|
"loss": 4.772, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 0.00744, |
|
"grad_norm": 0.5746332772613801, |
|
"learning_rate": 0.002232, |
|
"loss": 4.7755, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.00745, |
|
"grad_norm": 0.5491916195482714, |
|
"learning_rate": 0.002235, |
|
"loss": 4.7792, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.00746, |
|
"grad_norm": 0.6224704559098753, |
|
"learning_rate": 0.002238, |
|
"loss": 4.7785, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.00747, |
|
"grad_norm": 0.721423929849636, |
|
"learning_rate": 0.002241, |
|
"loss": 4.766, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 0.00748, |
|
"grad_norm": 0.8211263973312402, |
|
"learning_rate": 0.002244, |
|
"loss": 4.7825, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.00749, |
|
"grad_norm": 0.9630311220772746, |
|
"learning_rate": 0.002247, |
|
"loss": 4.7687, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 0.0075, |
|
"grad_norm": 1.0671208539128567, |
|
"learning_rate": 0.0022500000000000003, |
|
"loss": 4.8201, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.00751, |
|
"grad_norm": 1.0285377005887373, |
|
"learning_rate": 0.0022530000000000002, |
|
"loss": 4.8053, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 0.00752, |
|
"grad_norm": 0.9937387135055332, |
|
"learning_rate": 0.002256, |
|
"loss": 4.7635, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.00753, |
|
"grad_norm": 0.8939099354397223, |
|
"learning_rate": 0.002259, |
|
"loss": 4.8048, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 0.00754, |
|
"grad_norm": 0.958657384547811, |
|
"learning_rate": 0.002262, |
|
"loss": 4.816, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.00755, |
|
"grad_norm": 0.8579165829387244, |
|
"learning_rate": 0.002265, |
|
"loss": 4.7809, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.00756, |
|
"grad_norm": 0.8147493051985796, |
|
"learning_rate": 0.002268, |
|
"loss": 4.7687, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.00757, |
|
"grad_norm": 0.9899674342411924, |
|
"learning_rate": 0.002271, |
|
"loss": 4.7845, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 0.00758, |
|
"grad_norm": 1.2327208130150207, |
|
"learning_rate": 0.002274, |
|
"loss": 4.7967, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.00759, |
|
"grad_norm": 0.8760473410677282, |
|
"learning_rate": 0.002277, |
|
"loss": 4.7951, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 0.0076, |
|
"grad_norm": 0.9553393765090987, |
|
"learning_rate": 0.00228, |
|
"loss": 4.7842, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.00761, |
|
"grad_norm": 1.088855928225056, |
|
"learning_rate": 0.002283, |
|
"loss": 4.769, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 0.00762, |
|
"grad_norm": 0.9818480683263884, |
|
"learning_rate": 0.0022860000000000003, |
|
"loss": 4.7512, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.00763, |
|
"grad_norm": 0.9731540924632093, |
|
"learning_rate": 0.0022890000000000002, |
|
"loss": 4.7931, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 0.00764, |
|
"grad_norm": 1.0508884929557651, |
|
"learning_rate": 0.002292, |
|
"loss": 4.8167, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.00765, |
|
"grad_norm": 1.0020863769727308, |
|
"learning_rate": 0.002295, |
|
"loss": 4.7984, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.00766, |
|
"grad_norm": 1.1527463652354557, |
|
"learning_rate": 0.002298, |
|
"loss": 4.8085, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.00767, |
|
"grad_norm": 0.9657952239159258, |
|
"learning_rate": 0.002301, |
|
"loss": 4.7959, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 0.00768, |
|
"grad_norm": 1.0234917976922082, |
|
"learning_rate": 0.002304, |
|
"loss": 4.8012, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.00769, |
|
"grad_norm": 0.9850893067060651, |
|
"learning_rate": 0.002307, |
|
"loss": 4.8144, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 0.0077, |
|
"grad_norm": 0.9062134932024389, |
|
"learning_rate": 0.00231, |
|
"loss": 4.7653, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.00771, |
|
"grad_norm": 0.8476285286232204, |
|
"learning_rate": 0.002313, |
|
"loss": 4.7979, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 0.00772, |
|
"grad_norm": 0.9122213123018311, |
|
"learning_rate": 0.002316, |
|
"loss": 4.7851, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.00773, |
|
"grad_norm": 1.0718910624781612, |
|
"learning_rate": 0.0023190000000000003, |
|
"loss": 4.8052, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 0.00774, |
|
"grad_norm": 0.7792131883523417, |
|
"learning_rate": 0.0023220000000000003, |
|
"loss": 4.7945, |
|
"step": 774 |
|
}, |
|
    {
      "epoch": 0.00775,
      "grad_norm": 0.7995411986928386,
      "learning_rate": 0.0023250000000000002,
      "loss": 4.7914,
      "step": 775
    },
    {
      "epoch": 0.00776,
      "grad_norm": 0.7054590225014301,
      "learning_rate": 0.002328,
      "loss": 4.7883,
      "step": 776
    },
    {
      "epoch": 0.00777,
      "grad_norm": 0.6505869359405926,
      "learning_rate": 0.002331,
      "loss": 4.7585,
      "step": 777
    },
    {
      "epoch": 0.00778,
      "grad_norm": 0.6484695284206986,
      "learning_rate": 0.002334,
      "loss": 4.7652,
      "step": 778
    },
    {
      "epoch": 0.00779,
      "grad_norm": 0.6047799586124271,
      "learning_rate": 0.002337,
      "loss": 4.7239,
      "step": 779
    },
    {
      "epoch": 0.0078,
      "grad_norm": 0.5436502526586032,
      "learning_rate": 0.00234,
      "loss": 4.7364,
      "step": 780
    },
    {
      "epoch": 0.00781,
      "grad_norm": 0.5682167623371829,
      "learning_rate": 0.002343,
      "loss": 4.7355,
      "step": 781
    },
    {
      "epoch": 0.00782,
      "grad_norm": 0.628910610744215,
      "learning_rate": 0.002346,
      "loss": 4.7356,
      "step": 782
    },
    {
      "epoch": 0.00783,
      "grad_norm": 0.5845457532196663,
      "learning_rate": 0.002349,
      "loss": 4.7086,
      "step": 783
    },
    {
      "epoch": 0.00784,
      "grad_norm": 0.6345965495249546,
      "learning_rate": 0.002352,
      "loss": 4.723,
      "step": 784
    },
    {
      "epoch": 0.00785,
      "grad_norm": 0.4959017064429773,
      "learning_rate": 0.0023550000000000003,
      "loss": 4.7138,
      "step": 785
    },
    {
      "epoch": 0.00786,
      "grad_norm": 0.4358915945164792,
      "learning_rate": 0.0023580000000000003,
      "loss": 4.7131,
      "step": 786
    },
    {
      "epoch": 0.00787,
      "grad_norm": 0.4259891850743534,
      "learning_rate": 0.0023610000000000003,
      "loss": 4.7113,
      "step": 787
    },
    {
      "epoch": 0.00788,
      "grad_norm": 0.4529949912379573,
      "learning_rate": 0.002364,
      "loss": 4.7077,
      "step": 788
    },
    {
      "epoch": 0.00789,
      "grad_norm": 0.4944753699230628,
      "learning_rate": 0.002367,
      "loss": 4.7334,
      "step": 789
    },
    {
      "epoch": 0.0079,
      "grad_norm": 0.6295031827770177,
      "learning_rate": 0.00237,
      "loss": 4.7144,
      "step": 790
    },
    {
      "epoch": 0.00791,
      "grad_norm": 0.8474014146600959,
      "learning_rate": 0.002373,
      "loss": 4.6777,
      "step": 791
    },
    {
      "epoch": 0.00792,
      "grad_norm": 0.8040402155973354,
      "learning_rate": 0.002376,
      "loss": 4.7267,
      "step": 792
    },
    {
      "epoch": 0.00793,
      "grad_norm": 0.5568568781614048,
      "learning_rate": 0.002379,
      "loss": 4.7006,
      "step": 793
    },
    {
      "epoch": 0.00794,
      "grad_norm": 0.8220014797505664,
      "learning_rate": 0.002382,
      "loss": 4.7246,
      "step": 794
    },
    {
      "epoch": 0.00795,
      "grad_norm": 0.6551332917875898,
      "learning_rate": 0.002385,
      "loss": 4.6938,
      "step": 795
    },
    {
      "epoch": 0.00796,
      "grad_norm": 0.5666814801389223,
      "learning_rate": 0.0023880000000000004,
      "loss": 4.6826,
      "step": 796
    },
    {
      "epoch": 0.00797,
      "grad_norm": 0.654430341218369,
      "learning_rate": 0.0023910000000000003,
      "loss": 4.7248,
      "step": 797
    },
    {
      "epoch": 0.00798,
      "grad_norm": 0.5511512463730408,
      "learning_rate": 0.0023940000000000003,
      "loss": 4.6864,
      "step": 798
    },
    {
      "epoch": 0.00799,
      "grad_norm": 0.5084174359945534,
      "learning_rate": 0.0023970000000000003,
      "loss": 4.7161,
      "step": 799
    },
    {
      "epoch": 0.008,
      "grad_norm": 0.41968398472595075,
      "learning_rate": 0.0024000000000000002,
      "loss": 4.6652,
      "step": 800
    },
    {
      "epoch": 0.00801,
      "grad_norm": 0.4546309911468048,
      "learning_rate": 0.002403,
      "loss": 4.6967,
      "step": 801
    },
    {
      "epoch": 0.00802,
      "grad_norm": 0.430018994115786,
      "learning_rate": 0.002406,
      "loss": 4.6827,
      "step": 802
    },
    {
      "epoch": 0.00803,
      "grad_norm": 0.44614830716967085,
      "learning_rate": 0.002409,
      "loss": 4.6617,
      "step": 803
    },
    {
      "epoch": 0.00804,
      "grad_norm": 0.44722400844593674,
      "learning_rate": 0.002412,
      "loss": 4.6598,
      "step": 804
    },
    {
      "epoch": 0.00805,
      "grad_norm": 0.5179127215582825,
      "learning_rate": 0.002415,
      "loss": 4.6599,
      "step": 805
    },
    {
      "epoch": 0.00806,
      "grad_norm": 0.5610832008078775,
      "learning_rate": 0.002418,
      "loss": 4.677,
      "step": 806
    },
    {
      "epoch": 0.00807,
      "grad_norm": 0.5167453223410896,
      "learning_rate": 0.0024210000000000004,
      "loss": 4.6671,
      "step": 807
    },
    {
      "epoch": 0.00808,
      "grad_norm": 0.46468933196331563,
      "learning_rate": 0.0024240000000000004,
      "loss": 4.6511,
      "step": 808
    },
    {
      "epoch": 0.00809,
      "grad_norm": 0.5221883532574668,
      "learning_rate": 0.0024270000000000003,
      "loss": 4.6468,
      "step": 809
    },
    {
      "epoch": 0.0081,
      "grad_norm": 0.4992566900849729,
      "learning_rate": 0.0024300000000000003,
      "loss": 4.6744,
      "step": 810
    },
    {
      "epoch": 0.00811,
      "grad_norm": 0.4854147467055134,
      "learning_rate": 0.0024330000000000003,
      "loss": 4.646,
      "step": 811
    },
    {
      "epoch": 0.00812,
      "grad_norm": 0.650970729431075,
      "learning_rate": 0.0024360000000000002,
      "loss": 4.6307,
      "step": 812
    },
    {
      "epoch": 0.00813,
      "grad_norm": 0.8160691589494683,
      "learning_rate": 0.0024389999999999998,
      "loss": 4.6711,
      "step": 813
    },
    {
      "epoch": 0.00814,
      "grad_norm": 0.9918101747931352,
      "learning_rate": 0.0024419999999999997,
      "loss": 4.6946,
      "step": 814
    },
    {
      "epoch": 0.00815,
      "grad_norm": 1.247963175893729,
      "learning_rate": 0.0024449999999999997,
      "loss": 4.7226,
      "step": 815
    },
    {
      "epoch": 0.00816,
      "grad_norm": 0.8376200515557375,
      "learning_rate": 0.002448,
      "loss": 4.6777,
      "step": 816
    },
    {
      "epoch": 0.00817,
      "grad_norm": 0.9161032619759178,
      "learning_rate": 0.002451,
      "loss": 4.6939,
      "step": 817
    },
    {
      "epoch": 0.00818,
      "grad_norm": 1.0914649908014256,
      "learning_rate": 0.002454,
      "loss": 4.6886,
      "step": 818
    },
    {
      "epoch": 0.00819,
      "grad_norm": 0.9806171410774952,
      "learning_rate": 0.002457,
      "loss": 4.712,
      "step": 819
    },
    {
      "epoch": 0.0082,
      "grad_norm": 0.992236077471004,
      "learning_rate": 0.00246,
      "loss": 4.6918,
      "step": 820
    },
    {
      "epoch": 0.00821,
      "grad_norm": 1.0594557870263281,
      "learning_rate": 0.002463,
      "loss": 4.6759,
      "step": 821
    },
    {
      "epoch": 0.00822,
      "grad_norm": 1.0346800919438124,
      "learning_rate": 0.002466,
      "loss": 4.6853,
      "step": 822
    },
    {
      "epoch": 0.00823,
      "grad_norm": 0.9573573191186882,
      "learning_rate": 0.002469,
      "loss": 4.6833,
      "step": 823
    },
    {
      "epoch": 0.00824,
      "grad_norm": 1.1123514933123841,
      "learning_rate": 0.002472,
      "loss": 4.714,
      "step": 824
    },
    {
      "epoch": 0.00825,
      "grad_norm": 0.8463845700248506,
      "learning_rate": 0.0024749999999999998,
      "loss": 4.7191,
      "step": 825
    },
    {
      "epoch": 0.00826,
      "grad_norm": 0.8444785606085857,
      "learning_rate": 0.0024779999999999997,
      "loss": 4.672,
      "step": 826
    },
    {
      "epoch": 0.00827,
      "grad_norm": 0.9726341870117121,
      "learning_rate": 0.002481,
      "loss": 4.7078,
      "step": 827
    },
    {
      "epoch": 0.00828,
      "grad_norm": 0.9106448417621353,
      "learning_rate": 0.002484,
      "loss": 4.7003,
      "step": 828
    },
    {
      "epoch": 0.00829,
      "grad_norm": 0.7565680418878746,
      "learning_rate": 0.002487,
      "loss": 4.6856,
      "step": 829
    },
    {
      "epoch": 0.0083,
      "grad_norm": 0.8537774465977133,
      "learning_rate": 0.00249,
      "loss": 4.7017,
      "step": 830
    },
    {
      "epoch": 0.00831,
      "grad_norm": 0.9023323948099834,
      "learning_rate": 0.002493,
      "loss": 4.6871,
      "step": 831
    },
    {
      "epoch": 0.00832,
      "grad_norm": 0.8524529451127855,
      "learning_rate": 0.002496,
      "loss": 4.6815,
      "step": 832
    },
    {
      "epoch": 0.00833,
      "grad_norm": 0.9428655185832147,
      "learning_rate": 0.002499,
      "loss": 4.6808,
      "step": 833
    },
    {
      "epoch": 0.00834,
      "grad_norm": 0.9597220185428569,
      "learning_rate": 0.002502,
      "loss": 4.6879,
      "step": 834
    },
    {
      "epoch": 0.00835,
      "grad_norm": 0.7735101632354252,
      "learning_rate": 0.002505,
      "loss": 4.6724,
      "step": 835
    },
    {
      "epoch": 0.00836,
      "grad_norm": 0.9597202731139803,
      "learning_rate": 0.002508,
      "loss": 4.7061,
      "step": 836
    },
    {
      "epoch": 0.00837,
      "grad_norm": 0.9520863539431935,
      "learning_rate": 0.0025109999999999998,
      "loss": 4.6636,
      "step": 837
    },
    {
      "epoch": 0.00838,
      "grad_norm": 0.7800128524395746,
      "learning_rate": 0.0025139999999999997,
      "loss": 4.6721,
      "step": 838
    },
    {
      "epoch": 0.00839,
      "grad_norm": 0.8122589832425033,
      "learning_rate": 0.002517,
      "loss": 4.675,
      "step": 839
    },
    {
      "epoch": 0.0084,
      "grad_norm": 0.8183344402395425,
      "learning_rate": 0.00252,
      "loss": 4.6669,
      "step": 840
    },
    {
      "epoch": 0.00841,
      "grad_norm": 0.6918735110390536,
      "learning_rate": 0.002523,
      "loss": 4.6489,
      "step": 841
    },
    {
      "epoch": 0.00842,
      "grad_norm": 0.6201385747244391,
      "learning_rate": 0.002526,
      "loss": 4.6423,
      "step": 842
    },
    {
      "epoch": 0.00843,
      "grad_norm": 0.606127970479136,
      "learning_rate": 0.002529,
      "loss": 4.6465,
      "step": 843
    },
    {
      "epoch": 0.00844,
      "grad_norm": 0.5515773209874846,
      "learning_rate": 0.002532,
      "loss": 4.6607,
      "step": 844
    },
    {
      "epoch": 0.00845,
      "grad_norm": 0.6203742299859808,
      "learning_rate": 0.002535,
      "loss": 4.6293,
      "step": 845
    },
    {
      "epoch": 0.00846,
      "grad_norm": 0.5875832865020281,
      "learning_rate": 0.002538,
      "loss": 4.6474,
      "step": 846
    },
    {
      "epoch": 0.00847,
      "grad_norm": 0.5703256353430879,
      "learning_rate": 0.002541,
      "loss": 4.6282,
      "step": 847
    },
    {
      "epoch": 0.00848,
      "grad_norm": 0.602830367643936,
      "learning_rate": 0.002544,
      "loss": 4.6269,
      "step": 848
    },
    {
      "epoch": 0.00849,
      "grad_norm": 0.6741507039909044,
      "learning_rate": 0.002547,
      "loss": 4.6233,
      "step": 849
    },
    {
      "epoch": 0.0085,
      "grad_norm": 0.6288739006540759,
      "learning_rate": 0.00255,
      "loss": 4.6341,
      "step": 850
    },
    {
      "epoch": 0.00851,
      "grad_norm": 0.5820099008678455,
      "learning_rate": 0.002553,
      "loss": 4.644,
      "step": 851
    },
    {
      "epoch": 0.00852,
      "grad_norm": 0.586123912558797,
      "learning_rate": 0.002556,
      "loss": 4.6367,
      "step": 852
    },
    {
      "epoch": 0.00853,
      "grad_norm": 0.5127813487098001,
      "learning_rate": 0.002559,
      "loss": 4.6085,
      "step": 853
    },
    {
      "epoch": 0.00854,
      "grad_norm": 0.4730499644759358,
      "learning_rate": 0.002562,
      "loss": 4.6029,
      "step": 854
    },
    {
      "epoch": 0.00855,
      "grad_norm": 0.44708869980986227,
      "learning_rate": 0.002565,
      "loss": 4.5799,
      "step": 855
    },
    {
      "epoch": 0.00856,
      "grad_norm": 0.466044480858233,
      "learning_rate": 0.002568,
      "loss": 4.6142,
      "step": 856
    },
    {
      "epoch": 0.00857,
      "grad_norm": 0.5382201915945353,
      "learning_rate": 0.002571,
      "loss": 4.6036,
      "step": 857
    },
    {
      "epoch": 0.00858,
      "grad_norm": 0.6780662034295477,
      "learning_rate": 0.002574,
      "loss": 4.609,
      "step": 858
    },
    {
      "epoch": 0.00859,
      "grad_norm": 0.9086610382483981,
      "learning_rate": 0.002577,
      "loss": 4.6039,
      "step": 859
    },
    {
      "epoch": 0.0086,
      "grad_norm": 0.8563688949272525,
      "learning_rate": 0.00258,
      "loss": 4.6531,
      "step": 860
    },
    {
      "epoch": 0.00861,
      "grad_norm": 0.5965670098126366,
      "learning_rate": 0.0025830000000000002,
      "loss": 4.6461,
      "step": 861
    },
    {
      "epoch": 0.00862,
      "grad_norm": 0.7975052365958228,
      "learning_rate": 0.002586,
      "loss": 4.611,
      "step": 862
    },
    {
      "epoch": 0.00863,
      "grad_norm": 0.650099032572018,
      "learning_rate": 0.002589,
      "loss": 4.6284,
      "step": 863
    },
    {
      "epoch": 0.00864,
      "grad_norm": 0.6277114763068243,
      "learning_rate": 0.002592,
      "loss": 4.5809,
      "step": 864
    },
    {
      "epoch": 0.00865,
      "grad_norm": 0.7499269309750987,
      "learning_rate": 0.002595,
      "loss": 4.6024,
      "step": 865
    },
    {
      "epoch": 0.00866,
      "grad_norm": 0.7085973518403954,
      "learning_rate": 0.002598,
      "loss": 4.6233,
      "step": 866
    },
    {
      "epoch": 0.00867,
      "grad_norm": 0.605538925445329,
      "learning_rate": 0.002601,
      "loss": 4.6101,
      "step": 867
    },
    {
      "epoch": 0.00868,
      "grad_norm": 0.48200776305054654,
      "learning_rate": 0.002604,
      "loss": 4.586,
      "step": 868
    },
    {
      "epoch": 0.00869,
      "grad_norm": 0.5266950965425763,
      "learning_rate": 0.002607,
      "loss": 4.5966,
      "step": 869
    },
    {
      "epoch": 0.0087,
      "grad_norm": 0.48953699231607295,
      "learning_rate": 0.00261,
      "loss": 4.5832,
      "step": 870
    },
    {
      "epoch": 0.00871,
      "grad_norm": 0.5478274928438833,
      "learning_rate": 0.002613,
      "loss": 4.5992,
      "step": 871
    },
    {
      "epoch": 0.00872,
      "grad_norm": 0.6263670684952429,
      "learning_rate": 0.002616,
      "loss": 4.5864,
      "step": 872
    },
    {
      "epoch": 0.00873,
      "grad_norm": 0.6672951132542,
      "learning_rate": 0.0026190000000000002,
      "loss": 4.5977,
      "step": 873
    },
    {
      "epoch": 0.00874,
      "grad_norm": 0.677096167715366,
      "learning_rate": 0.002622,
      "loss": 4.617,
      "step": 874
    },
    {
      "epoch": 0.00875,
      "grad_norm": 0.6959913524482387,
      "learning_rate": 0.002625,
      "loss": 4.5696,
      "step": 875
    },
    {
      "epoch": 0.00876,
      "grad_norm": 0.6682762743495083,
      "learning_rate": 0.002628,
      "loss": 4.5958,
      "step": 876
    },
    {
      "epoch": 0.00877,
      "grad_norm": 0.6375266502117092,
      "learning_rate": 0.002631,
      "loss": 4.612,
      "step": 877
    },
    {
      "epoch": 0.00878,
      "grad_norm": 0.7079418482290942,
      "learning_rate": 0.002634,
      "loss": 4.5486,
      "step": 878
    },
    {
      "epoch": 0.00879,
      "grad_norm": 0.6282689223941402,
      "learning_rate": 0.002637,
      "loss": 4.578,
      "step": 879
    },
    {
      "epoch": 0.0088,
      "grad_norm": 0.5460943252882049,
      "learning_rate": 0.00264,
      "loss": 4.5852,
      "step": 880
    },
    {
      "epoch": 0.00881,
      "grad_norm": 0.5723972494402886,
      "learning_rate": 0.002643,
      "loss": 4.5869,
      "step": 881
    },
    {
      "epoch": 0.00882,
      "grad_norm": 0.647818443655113,
      "learning_rate": 0.002646,
      "loss": 4.588,
      "step": 882
    },
    {
      "epoch": 0.00883,
      "grad_norm": 0.8827068805337381,
      "learning_rate": 0.002649,
      "loss": 4.5935,
      "step": 883
    },
    {
      "epoch": 0.00884,
      "grad_norm": 1.3000201706023533,
      "learning_rate": 0.0026520000000000003,
      "loss": 4.6052,
      "step": 884
    },
    {
      "epoch": 0.00885,
      "grad_norm": 0.7527768384442359,
      "learning_rate": 0.0026550000000000002,
      "loss": 4.5797,
      "step": 885
    },
    {
      "epoch": 0.00886,
      "grad_norm": 0.7313595200920677,
      "learning_rate": 0.002658,
      "loss": 4.6019,
      "step": 886
    },
    {
      "epoch": 0.00887,
      "grad_norm": 0.553327654847044,
      "learning_rate": 0.002661,
      "loss": 4.5828,
      "step": 887
    },
    {
      "epoch": 0.00888,
      "grad_norm": 0.6064219625843388,
      "learning_rate": 0.002664,
      "loss": 4.5894,
      "step": 888
    },
    {
      "epoch": 0.00889,
      "grad_norm": 0.6392357596846293,
      "learning_rate": 0.002667,
      "loss": 4.5422,
      "step": 889
    },
    {
      "epoch": 0.0089,
      "grad_norm": 0.6860123914477424,
      "learning_rate": 0.00267,
      "loss": 4.5989,
      "step": 890
    },
    {
      "epoch": 0.00891,
      "grad_norm": 0.7088960904364014,
      "learning_rate": 0.002673,
      "loss": 4.5822,
      "step": 891
    },
    {
      "epoch": 0.00892,
      "grad_norm": 0.7157207147763361,
      "learning_rate": 0.002676,
      "loss": 4.5934,
      "step": 892
    },
    {
      "epoch": 0.00893,
      "grad_norm": 0.7412527752908875,
      "learning_rate": 0.002679,
      "loss": 4.5709,
      "step": 893
    },
    {
      "epoch": 0.00894,
      "grad_norm": 0.8084836835989728,
      "learning_rate": 0.002682,
      "loss": 4.5639,
      "step": 894
    },
    {
      "epoch": 0.00895,
      "grad_norm": 0.9923307111818513,
      "learning_rate": 0.0026850000000000003,
      "loss": 4.5864,
      "step": 895
    },
    {
      "epoch": 0.00896,
      "grad_norm": 1.2171682577354312,
      "learning_rate": 0.0026880000000000003,
      "loss": 4.6057,
      "step": 896
    },
    {
      "epoch": 0.00897,
      "grad_norm": 0.797478427208377,
      "learning_rate": 0.0026910000000000002,
      "loss": 4.5989,
      "step": 897
    },
    {
      "epoch": 0.00898,
      "grad_norm": 0.7928728804117916,
      "learning_rate": 0.002694,
      "loss": 4.594,
      "step": 898
    },
    {
      "epoch": 0.00899,
      "grad_norm": 0.8357403035452178,
      "learning_rate": 0.002697,
      "loss": 4.5983,
      "step": 899
    },
    {
      "epoch": 0.009,
      "grad_norm": 0.8448290091163538,
      "learning_rate": 0.0027,
      "loss": 4.6292,
      "step": 900
    },
    {
      "epoch": 0.00901,
      "grad_norm": 0.9488092229670547,
      "learning_rate": 0.002703,
      "loss": 4.5868,
      "step": 901
    },
    {
      "epoch": 0.00902,
      "grad_norm": 0.9434404658743749,
      "learning_rate": 0.002706,
      "loss": 4.5999,
      "step": 902
    },
    {
      "epoch": 0.00903,
      "grad_norm": 1.0122099567822476,
      "learning_rate": 0.002709,
      "loss": 4.6102,
      "step": 903
    },
    {
      "epoch": 0.00904,
      "grad_norm": 0.9358691681287052,
      "learning_rate": 0.002712,
      "loss": 4.5848,
      "step": 904
    },
    {
      "epoch": 0.00905,
      "grad_norm": 0.8321510442485943,
      "learning_rate": 0.002715,
      "loss": 4.5984,
      "step": 905
    },
    {
      "epoch": 0.00906,
      "grad_norm": 0.8914473393947665,
      "learning_rate": 0.002718,
      "loss": 4.6112,
      "step": 906
    },
    {
      "epoch": 0.00907,
      "grad_norm": 0.9883982303638487,
      "learning_rate": 0.0027210000000000003,
      "loss": 4.6386,
      "step": 907
    },
    {
      "epoch": 0.00908,
      "grad_norm": 0.86073203349026,
      "learning_rate": 0.0027240000000000003,
      "loss": 4.6116,
      "step": 908
    },
    {
      "epoch": 0.00909,
      "grad_norm": 0.7773747412069614,
      "learning_rate": 0.0027270000000000003,
      "loss": 4.6163,
      "step": 909
    },
    {
      "epoch": 0.0091,
      "grad_norm": 0.7370585718531062,
      "learning_rate": 0.0027300000000000002,
      "loss": 4.6234,
      "step": 910
    },
    {
      "epoch": 0.00911,
      "grad_norm": 0.6906269071273593,
      "learning_rate": 0.002733,
      "loss": 4.5785,
      "step": 911
    },
    {
      "epoch": 0.00912,
      "grad_norm": 0.6578032292778252,
      "learning_rate": 0.002736,
      "loss": 4.5778,
      "step": 912
    },
    {
      "epoch": 0.00913,
      "grad_norm": 0.6528626059582382,
      "learning_rate": 0.002739,
      "loss": 4.5704,
      "step": 913
    },
    {
      "epoch": 0.00914,
      "grad_norm": 0.599731896856576,
      "learning_rate": 0.002742,
      "loss": 4.595,
      "step": 914
    },
    {
      "epoch": 0.00915,
      "grad_norm": 0.5922054086035364,
      "learning_rate": 0.002745,
      "loss": 4.5555,
      "step": 915
    },
    {
      "epoch": 0.00916,
      "grad_norm": 0.528646140228931,
      "learning_rate": 0.002748,
      "loss": 4.5304,
      "step": 916
    },
    {
      "epoch": 0.00917,
      "grad_norm": 0.5305158198561161,
      "learning_rate": 0.002751,
      "loss": 4.5419,
      "step": 917
    },
    {
      "epoch": 0.00918,
      "grad_norm": 0.4736382884122071,
      "learning_rate": 0.0027540000000000004,
      "loss": 4.5569,
      "step": 918
    },
    {
      "epoch": 0.00919,
      "grad_norm": 0.45838817911808083,
      "learning_rate": 0.0027570000000000003,
      "loss": 4.5357,
      "step": 919
    },
    {
      "epoch": 0.0092,
      "grad_norm": 0.4361472184478695,
      "learning_rate": 0.0027600000000000003,
      "loss": 4.5493,
      "step": 920
    },
    {
      "epoch": 0.00921,
      "grad_norm": 0.43931126757575867,
      "learning_rate": 0.0027630000000000003,
      "loss": 4.5336,
      "step": 921
    },
    {
      "epoch": 0.00922,
      "grad_norm": 0.46920285200840567,
      "learning_rate": 0.0027660000000000002,
      "loss": 4.5412,
      "step": 922
    },
    {
      "epoch": 0.00923,
      "grad_norm": 0.5996209745093354,
      "learning_rate": 0.002769,
      "loss": 4.5218,
      "step": 923
    },
    {
      "epoch": 0.00924,
      "grad_norm": 0.702528530852281,
      "learning_rate": 0.002772,
      "loss": 4.5564,
      "step": 924
    },
    {
      "epoch": 0.00925,
      "grad_norm": 0.6651779892024297,
      "learning_rate": 0.002775,
      "loss": 4.5288,
      "step": 925
    },
    {
      "epoch": 0.00926,
      "grad_norm": 0.5723855300218565,
      "learning_rate": 0.002778,
      "loss": 4.5426,
      "step": 926
    },
    {
      "epoch": 0.00927,
      "grad_norm": 0.6080148945846373,
      "learning_rate": 0.002781,
      "loss": 4.5043,
      "step": 927
    },
    {
      "epoch": 0.00928,
      "grad_norm": 0.6957035289314476,
      "learning_rate": 0.002784,
      "loss": 4.5502,
      "step": 928
    },
    {
      "epoch": 0.00929,
      "grad_norm": 0.7366634335117053,
      "learning_rate": 0.0027870000000000004,
      "loss": 4.5403,
      "step": 929
    },
    {
      "epoch": 0.0093,
      "grad_norm": 0.7796102692352119,
      "learning_rate": 0.0027900000000000004,
      "loss": 4.5265,
      "step": 930
    },
    {
      "epoch": 0.00931,
      "grad_norm": 0.7116909602884058,
      "learning_rate": 0.0027930000000000003,
      "loss": 4.5333,
      "step": 931
    },
    {
      "epoch": 0.00932,
      "grad_norm": 0.8319854024158545,
      "learning_rate": 0.0027960000000000003,
      "loss": 4.5481,
      "step": 932
    },
    {
      "epoch": 0.00933,
      "grad_norm": 0.8001418279766108,
      "learning_rate": 0.0027990000000000003,
      "loss": 4.5286,
      "step": 933
    },
    {
      "epoch": 0.00934,
      "grad_norm": 0.6519619232143173,
      "learning_rate": 0.0028020000000000002,
      "loss": 4.5422,
      "step": 934
    },
    {
      "epoch": 0.00935,
      "grad_norm": 0.7580737482550882,
      "learning_rate": 0.002805,
      "loss": 4.5559,
      "step": 935
    },
    {
      "epoch": 0.00936,
      "grad_norm": 0.8499038627491867,
      "learning_rate": 0.002808,
      "loss": 4.5679,
      "step": 936
    },
    {
      "epoch": 0.00937,
      "grad_norm": 0.7597302495348821,
      "learning_rate": 0.002811,
      "loss": 4.5708,
      "step": 937
    },
    {
      "epoch": 0.00938,
      "grad_norm": 0.9878821641788273,
      "learning_rate": 0.002814,
      "loss": 4.5609,
      "step": 938
    },
    {
      "epoch": 0.00939,
      "grad_norm": 0.9691729918808772,
      "learning_rate": 0.002817,
      "loss": 4.563,
      "step": 939
    },
    {
      "epoch": 0.0094,
      "grad_norm": 0.8937843559478598,
      "learning_rate": 0.00282,
      "loss": 4.56,
      "step": 940
    },
    {
      "epoch": 0.00941,
      "grad_norm": 0.9477839045288606,
      "learning_rate": 0.002823,
      "loss": 4.5409,
      "step": 941
    },
    {
      "epoch": 0.00942,
      "grad_norm": 1.088560613057821,
      "learning_rate": 0.002826,
      "loss": 4.5819,
      "step": 942
    },
    {
      "epoch": 0.00943,
      "grad_norm": 0.8020128186220904,
      "learning_rate": 0.002829,
      "loss": 4.556,
      "step": 943
    },
    {
      "epoch": 0.00944,
      "grad_norm": 0.7970499406732843,
      "learning_rate": 0.002832,
      "loss": 4.5652,
      "step": 944
    },
    {
      "epoch": 0.00945,
      "grad_norm": 0.760430287307007,
      "learning_rate": 0.002835,
      "loss": 4.567,
      "step": 945
    },
    {
      "epoch": 0.00946,
      "grad_norm": 0.8410168172764453,
      "learning_rate": 0.002838,
      "loss": 4.5808,
      "step": 946
    },
    {
      "epoch": 0.00947,
      "grad_norm": 0.8502364092306604,
      "learning_rate": 0.0028409999999999998,
      "loss": 4.5581,
      "step": 947
    },
    {
      "epoch": 0.00948,
      "grad_norm": 0.7534324730199542,
      "learning_rate": 0.0028439999999999997,
      "loss": 4.533,
      "step": 948
    },
    {
      "epoch": 0.00949,
      "grad_norm": 0.8075715283027973,
      "learning_rate": 0.002847,
      "loss": 4.5789,
      "step": 949
    },
    {
      "epoch": 0.0095,
      "grad_norm": 0.8790685187514339,
      "learning_rate": 0.00285,
      "loss": 4.5764,
      "step": 950
    },
    {
      "epoch": 0.00951,
      "grad_norm": 0.8527621336415785,
      "learning_rate": 0.002853,
      "loss": 4.552,
      "step": 951
    },
    {
      "epoch": 0.00952,
      "grad_norm": 0.793648162843131,
      "learning_rate": 0.002856,
      "loss": 4.5771,
      "step": 952
    },
    {
      "epoch": 0.00953,
      "grad_norm": 0.7100823051409002,
      "learning_rate": 0.002859,
      "loss": 4.5151,
      "step": 953
    },
    {
      "epoch": 0.00954,
      "grad_norm": 0.776086010454581,
      "learning_rate": 0.002862,
      "loss": 4.5748,
      "step": 954
    },
    {
      "epoch": 0.00955,
      "grad_norm": 0.7357834745016256,
      "learning_rate": 0.002865,
      "loss": 4.5651,
      "step": 955
    },
    {
      "epoch": 0.00956,
      "grad_norm": 0.6871788604084053,
      "learning_rate": 0.002868,
      "loss": 4.5378,
      "step": 956
    },
    {
      "epoch": 0.00957,
      "grad_norm": 0.6293704920093642,
      "learning_rate": 0.002871,
      "loss": 4.5585,
      "step": 957
    },
    {
      "epoch": 0.00958,
      "grad_norm": 0.6933721545151298,
      "learning_rate": 0.002874,
      "loss": 4.5402,
      "step": 958
    },
    {
      "epoch": 0.00959,
      "grad_norm": 0.6216290945191316,
      "learning_rate": 0.002877,
      "loss": 4.5294,
      "step": 959
    },
    {
      "epoch": 0.0096,
      "grad_norm": 0.44090482568449035,
      "learning_rate": 0.0028799999999999997,
      "loss": 4.5205,
      "step": 960
    },
    {
      "epoch": 0.00961,
      "grad_norm": 0.5026549244936088,
      "learning_rate": 0.002883,
      "loss": 4.4973,
      "step": 961
    },
    {
      "epoch": 0.00962,
      "grad_norm": 0.46550744372429714,
      "learning_rate": 0.002886,
      "loss": 4.5199,
      "step": 962
    },
    {
      "epoch": 0.00963,
      "grad_norm": 0.4817462883709995,
      "learning_rate": 0.002889,
      "loss": 4.5204,
      "step": 963
    },
    {
      "epoch": 0.00964,
      "grad_norm": 0.5021989893794454,
      "learning_rate": 0.002892,
      "loss": 4.5105,
      "step": 964
    },
    {
      "epoch": 0.00965,
      "grad_norm": 0.6331237702649058,
      "learning_rate": 0.002895,
      "loss": 4.4888,
      "step": 965
    },
    {
      "epoch": 0.00966,
      "grad_norm": 0.7186463225121739,
      "learning_rate": 0.002898,
      "loss": 4.5122,
      "step": 966
    },
    {
      "epoch": 0.00967,
      "grad_norm": 0.835541291398658,
      "learning_rate": 0.002901,
      "loss": 4.5497,
      "step": 967
    },
    {
      "epoch": 0.00968,
      "grad_norm": 0.7770950591910699,
      "learning_rate": 0.002904,
      "loss": 4.5246,
      "step": 968
    },
    {
      "epoch": 0.00969,
      "grad_norm": 0.6421972738290654,
      "learning_rate": 0.002907,
      "loss": 4.5465,
      "step": 969
    },
    {
      "epoch": 0.0097,
      "grad_norm": 0.6170493579190435,
      "learning_rate": 0.00291,
      "loss": 4.5014,
      "step": 970
    },
    {
      "epoch": 0.00971,
      "grad_norm": 0.671893763395282,
      "learning_rate": 0.002913,
      "loss": 4.5134,
      "step": 971
    },
    {
      "epoch": 0.00972,
      "grad_norm": 0.5384349268117217,
      "learning_rate": 0.002916,
      "loss": 4.51,
      "step": 972
    },
    {
      "epoch": 0.00973,
      "grad_norm": 0.6570052261370841,
      "learning_rate": 0.002919,
      "loss": 4.5075,
      "step": 973
    },
    {
      "epoch": 0.00974,
      "grad_norm": 0.6469437996214488,
      "learning_rate": 0.002922,
      "loss": 4.5042,
      "step": 974
    },
    {
      "epoch": 0.00975,
      "grad_norm": 0.5139434995269291,
      "learning_rate": 0.002925,
      "loss": 4.5141,
      "step": 975
    },
    {
      "epoch": 0.00976,
      "grad_norm": 0.540350404123188,
      "learning_rate": 0.002928,
      "loss": 4.4984,
      "step": 976
    },
    {
      "epoch": 0.00977,
      "grad_norm": 0.5640158884340003,
      "learning_rate": 0.002931,
      "loss": 4.5359,
      "step": 977
    },
    {
      "epoch": 0.00978,
      "grad_norm": 0.5471232322596488,
      "learning_rate": 0.002934,
      "loss": 4.5069,
      "step": 978
    },
    {
      "epoch": 0.00979,
      "grad_norm": 0.6391692840311302,
      "learning_rate": 0.002937,
      "loss": 4.4972,
      "step": 979
    },
    {
      "epoch": 0.0098,
      "grad_norm": 0.7065424241899814,
      "learning_rate": 0.00294,
      "loss": 4.4963,
      "step": 980
    },
    {
      "epoch": 0.00981,
      "grad_norm": 0.6903724510426201,
      "learning_rate": 0.002943,
      "loss": 4.5078,
      "step": 981
    },
    {
      "epoch": 0.00982,
      "grad_norm": 0.653302049548968,
      "learning_rate": 0.002946,
      "loss": 4.4991,
      "step": 982
    },
    {
      "epoch": 0.00983,
      "grad_norm": 0.7786836590229197,
      "learning_rate": 0.0029490000000000002,
      "loss": 4.5061,
      "step": 983
    },
    {
      "epoch": 0.00984,
      "grad_norm": 0.8944946184941699,
      "learning_rate": 0.002952,
      "loss": 4.5043,
      "step": 984
    },
    {
      "epoch": 0.00985,
      "grad_norm": 0.9434362337974719,
      "learning_rate": 0.002955,
      "loss": 4.5335,
      "step": 985
    },
    {
      "epoch": 0.00986,
      "grad_norm": 0.9312010498282314,
      "learning_rate": 0.002958,
      "loss": 4.4908,
      "step": 986
    },
    {
      "epoch": 0.00987,
      "grad_norm": 0.7600059379317153,
      "learning_rate": 0.002961,
      "loss": 4.5038,
      "step": 987
    },
    {
      "epoch": 0.00988,
      "grad_norm": 0.6949983731085813,
      "learning_rate": 0.002964,
      "loss": 4.533,
      "step": 988
    },
    {
      "epoch": 0.00989,
      "grad_norm": 0.6958207764457225,
      "learning_rate": 0.002967,
      "loss": 4.4744,
      "step": 989
    },
    {
      "epoch": 0.0099,
      "grad_norm": 0.8314159874496235,
      "learning_rate": 0.00297,
      "loss": 4.5218,
      "step": 990
    },
    {
      "epoch": 0.00991,
      "grad_norm": 0.90109982550625,
      "learning_rate": 0.002973,
      "loss": 4.5224,
      "step": 991
    },
    {
      "epoch": 0.00992,
      "grad_norm": 0.951208720196979,
      "learning_rate": 0.002976,
      "loss": 4.5281,
      "step": 992
    },
    {
      "epoch": 0.00993,
      "grad_norm": 1.034654706401041,
      "learning_rate": 0.002979,
      "loss": 4.539,
      "step": 993
    },
    {
      "epoch": 0.00994,
      "grad_norm": 1.0078120294806783,
      "learning_rate": 0.002982,
      "loss": 4.5459,
      "step": 994
    },
    {
      "epoch": 0.00995,
      "grad_norm": 0.9905056664043064,
      "learning_rate": 0.0029850000000000002,
      "loss": 4.5461,
      "step": 995
    },
    {
      "epoch": 0.00996,
      "grad_norm": 1.1603429750232241,
      "learning_rate": 0.002988,
      "loss": 4.5605,
      "step": 996
    },
    {
      "epoch": 0.00997,
      "grad_norm": 0.8595677567729485,
      "learning_rate": 0.002991,
      "loss": 4.5366,
      "step": 997
    },
    {
      "epoch": 0.00998,
      "grad_norm": 0.9672785170741734,
      "learning_rate": 0.002994,
      "loss": 4.5467,
      "step": 998
    },
    {
      "epoch": 0.00999,
      "grad_norm": 0.8855476122040005,
      "learning_rate": 0.002997,
      "loss": 4.5306,
      "step": 999
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.7479993027173912,
      "learning_rate": 0.003,
      "loss": 4.5436,
      "step": 1000
    }
  ],
  "logging_steps": 1,
  "max_steps": 100000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.9643642855424e+16,
  "train_batch_size": 1024,
  "trial_name": null,
  "trial_params": null
}