diff --git "a/checkpoint-2000/trainer_state.json" "b/checkpoint-2000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2000/trainer_state.json" @@ -0,0 +1,14033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.02, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1e-05, + "grad_norm": 1.085229352812366, + "learning_rate": 3e-06, + "loss": 10.849, + "step": 1 + }, + { + "epoch": 2e-05, + "grad_norm": 1.0764689186661929, + "learning_rate": 6e-06, + "loss": 10.8489, + "step": 2 + }, + { + "epoch": 3e-05, + "grad_norm": 1.0926036068515363, + "learning_rate": 9e-06, + "loss": 10.8486, + "step": 3 + }, + { + "epoch": 4e-05, + "grad_norm": 1.0859011783792423, + "learning_rate": 1.2e-05, + "loss": 10.848, + "step": 4 + }, + { + "epoch": 5e-05, + "grad_norm": 1.0906873388641662, + "learning_rate": 1.5e-05, + "loss": 10.8453, + "step": 5 + }, + { + "epoch": 6e-05, + "grad_norm": 1.0895888734627917, + "learning_rate": 1.8e-05, + "loss": 10.8447, + "step": 6 + }, + { + "epoch": 7e-05, + "grad_norm": 1.0913329404483254, + "learning_rate": 2.1000000000000002e-05, + "loss": 10.8355, + "step": 7 + }, + { + "epoch": 8e-05, + "grad_norm": 1.0766237663279077, + "learning_rate": 2.4e-05, + "loss": 10.8141, + "step": 8 + }, + { + "epoch": 9e-05, + "grad_norm": 1.0617425338278697, + "learning_rate": 2.7e-05, + "loss": 10.8099, + "step": 9 + }, + { + "epoch": 0.0001, + "grad_norm": 1.080271312928173, + "learning_rate": 3e-05, + "loss": 10.7983, + "step": 10 + }, + { + "epoch": 0.00011, + "grad_norm": 1.0663119503184246, + "learning_rate": 3.2999999999999996e-05, + "loss": 10.7844, + "step": 11 + }, + { + "epoch": 0.00012, + "grad_norm": 1.0694505502575085, + "learning_rate": 3.6e-05, + "loss": 10.7739, + "step": 12 + }, + { + "epoch": 0.00013, + "grad_norm": 1.0509206118753271, + "learning_rate": 3.9e-05, + "loss": 10.7554, + "step": 13 + }, + { + "epoch": 0.00014, + "grad_norm": 1.0427932624919014, + "learning_rate": 4.2000000000000004e-05, + "loss": 10.7452, + "step": 14 + }, + { + "epoch": 0.00015, + "grad_norm": 1.0323600599358198, + "learning_rate": 4.4999999999999996e-05, + "loss": 10.736, + "step": 15 + }, + { + "epoch": 0.00016, + "grad_norm": 1.0039727604034705, + "learning_rate": 4.8e-05, + "loss": 10.7191, + "step": 16 + }, + { + "epoch": 0.00017, + "grad_norm": 0.9736033767894897, + "learning_rate": 5.1000000000000006e-05, + "loss": 10.7049, + "step": 17 + }, + { + "epoch": 0.00018, + "grad_norm": 0.9665971933165531, + "learning_rate": 5.4e-05, + "loss": 10.69, + "step": 18 + }, + { + "epoch": 0.00019, + "grad_norm": 0.9428089164290921, + "learning_rate": 5.7e-05, + "loss": 10.6767, + "step": 19 + }, + { + "epoch": 0.0002, + "grad_norm": 0.943192779381936, + "learning_rate": 6e-05, + "loss": 10.666, + "step": 20 + }, + { + "epoch": 0.00021, + "grad_norm": 0.9246956411226879, + "learning_rate": 6.3e-05, + "loss": 10.6525, + "step": 21 + }, + { + "epoch": 0.00022, + "grad_norm": 0.9206776841693647, + "learning_rate": 6.599999999999999e-05, + "loss": 10.6378, + "step": 22 + }, + { + "epoch": 0.00023, + "grad_norm": 0.9152142474292139, + "learning_rate": 6.9e-05, + "loss": 10.626, + "step": 23 + }, + { + "epoch": 0.00024, + "grad_norm": 0.9101286615991204, + "learning_rate": 7.2e-05, + "loss": 10.613, + "step": 24 + }, + { + "epoch": 0.00025, + "grad_norm": 0.9100685942512898, + "learning_rate": 7.500000000000001e-05, + "loss": 10.5998, + "step": 25 + }, + { + "epoch": 0.00026, + "grad_norm": 0.9093157110032968, + "learning_rate": 7.8e-05, + "loss": 10.5859, + "step": 26 + }, + { + "epoch": 0.00027, + "grad_norm": 0.9036082412984342, + "learning_rate": 8.1e-05, + "loss": 10.5742, + "step": 27 + }, + { + "epoch": 0.00028, + "grad_norm": 0.9033464449252919, + "learning_rate": 8.400000000000001e-05, + "loss": 10.5612, + "step": 28 + }, + { + "epoch": 0.00029, + "grad_norm": 0.9067163029768021, + "learning_rate": 8.7e-05, + "loss": 10.5465, + "step": 29 + }, + { + "epoch": 0.0003, + "grad_norm": 0.9152916501496612, + "learning_rate": 8.999999999999999e-05, + "loss": 10.5321, + "step": 30 + }, + { + "epoch": 0.00031, + "grad_norm": 0.9122391999445313, + "learning_rate": 9.3e-05, + "loss": 10.5167, + "step": 31 + }, + { + "epoch": 0.00032, + "grad_norm": 0.9090668181794543, + "learning_rate": 9.6e-05, + "loss": 10.5034, + "step": 32 + }, + { + "epoch": 0.00033, + "grad_norm": 0.9095629132760054, + "learning_rate": 9.900000000000001e-05, + "loss": 10.4865, + "step": 33 + }, + { + "epoch": 0.00034, + "grad_norm": 0.9107692784496799, + "learning_rate": 0.00010200000000000001, + "loss": 10.4706, + "step": 34 + }, + { + "epoch": 0.00035, + "grad_norm": 0.9162740070354684, + "learning_rate": 0.00010500000000000002, + "loss": 10.4535, + "step": 35 + }, + { + "epoch": 0.00036, + "grad_norm": 0.9114324016484531, + "learning_rate": 0.000108, + "loss": 10.4367, + "step": 36 + }, + { + "epoch": 0.00037, + "grad_norm": 0.901331547883087, + "learning_rate": 0.000111, + "loss": 10.4196, + "step": 37 + }, + { + "epoch": 0.00038, + "grad_norm": 0.9056352937834914, + "learning_rate": 0.000114, + "loss": 10.4003, + "step": 38 + }, + { + "epoch": 0.00039, + "grad_norm": 0.9071824304759276, + "learning_rate": 0.000117, + "loss": 10.3815, + "step": 39 + }, + { + "epoch": 0.0004, + "grad_norm": 0.911150856035614, + "learning_rate": 0.00012, + "loss": 10.3599, + "step": 40 + }, + { + "epoch": 0.00041, + "grad_norm": 0.9001357892993758, + "learning_rate": 0.000123, + "loss": 10.3411, + "step": 41 + }, + { + "epoch": 0.00042, + "grad_norm": 0.9057215347961051, + "learning_rate": 0.000126, + "loss": 10.3208, + "step": 42 + }, + { + "epoch": 0.00043, + "grad_norm": 0.9101872556211666, + "learning_rate": 0.000129, + "loss": 10.2978, + "step": 43 + }, + { + "epoch": 0.00044, + "grad_norm": 0.9097345366629672, + "learning_rate": 0.00013199999999999998, + "loss": 10.2768, + "step": 44 + }, + { + "epoch": 0.00045, + "grad_norm": 0.9111670268341607, + "learning_rate": 0.000135, + "loss": 10.2539, + "step": 45 + }, + { + "epoch": 0.00046, + "grad_norm": 0.9030473796809102, + "learning_rate": 0.000138, + "loss": 10.2322, + "step": 46 + }, + { + "epoch": 0.00047, + "grad_norm": 0.9086055117133346, + "learning_rate": 0.000141, + "loss": 10.2064, + "step": 47 + }, + { + "epoch": 0.00048, + "grad_norm": 0.9060414917101882, + "learning_rate": 0.000144, + "loss": 10.1837, + "step": 48 + }, + { + "epoch": 0.00049, + "grad_norm": 0.9059366436676172, + "learning_rate": 0.000147, + "loss": 10.1599, + "step": 49 + }, + { + "epoch": 0.0005, + "grad_norm": 0.9106165244124662, + "learning_rate": 0.00015000000000000001, + "loss": 10.1343, + "step": 50 + }, + { + "epoch": 0.00051, + "grad_norm": 0.9110992341155927, + "learning_rate": 0.000153, + "loss": 10.1078, + "step": 51 + }, + { + "epoch": 0.00052, + "grad_norm": 0.9077294523989683, + "learning_rate": 0.000156, + "loss": 10.0815, + "step": 52 + }, + { + "epoch": 0.00053, + "grad_norm": 0.8995624264210066, + "learning_rate": 0.000159, + "loss": 10.0581, + "step": 53 + }, + { + "epoch": 0.00054, + "grad_norm": 0.9187536344258231, + "learning_rate": 0.000162, + "loss": 10.026, + "step": 54 + }, + { + "epoch": 0.00055, + "grad_norm": 0.9102867236672618, + "learning_rate": 0.000165, + "loss": 10.0019, + "step": 55 + }, + { + "epoch": 0.00056, + "grad_norm": 0.9031170239719724, + "learning_rate": 0.00016800000000000002, + "loss": 9.9743, + "step": 56 + }, + { + "epoch": 0.00057, + "grad_norm": 0.9090239107255728, + "learning_rate": 0.000171, + "loss": 9.9467, + "step": 57 + }, + { + "epoch": 0.00058, + "grad_norm": 0.9070896749665766, + "learning_rate": 0.000174, + "loss": 9.9223, + "step": 58 + }, + { + "epoch": 0.00059, + "grad_norm": 0.9116582783399498, + "learning_rate": 0.000177, + "loss": 9.8905, + "step": 59 + }, + { + "epoch": 0.0006, + "grad_norm": 0.899071209460366, + "learning_rate": 0.00017999999999999998, + "loss": 9.8642, + "step": 60 + }, + { + "epoch": 0.00061, + "grad_norm": 0.8984032767613607, + "learning_rate": 0.000183, + "loss": 9.8367, + "step": 61 + }, + { + "epoch": 0.00062, + "grad_norm": 0.9043779152804675, + "learning_rate": 0.000186, + "loss": 9.8058, + "step": 62 + }, + { + "epoch": 0.00063, + "grad_norm": 0.8969230453763369, + "learning_rate": 0.000189, + "loss": 9.7784, + "step": 63 + }, + { + "epoch": 0.00064, + "grad_norm": 0.8929896633083918, + "learning_rate": 0.000192, + "loss": 9.7485, + "step": 64 + }, + { + "epoch": 0.00065, + "grad_norm": 0.9016512171852502, + "learning_rate": 0.00019500000000000002, + "loss": 9.7165, + "step": 65 + }, + { + "epoch": 0.00066, + "grad_norm": 0.8946267084059, + "learning_rate": 0.00019800000000000002, + "loss": 9.6927, + "step": 66 + }, + { + "epoch": 0.00067, + "grad_norm": 0.8963872265737496, + "learning_rate": 0.000201, + "loss": 9.6633, + "step": 67 + }, + { + "epoch": 0.00068, + "grad_norm": 0.9035635569767697, + "learning_rate": 0.00020400000000000003, + "loss": 9.6313, + "step": 68 + }, + { + "epoch": 0.00069, + "grad_norm": 0.8884430485081615, + "learning_rate": 0.00020700000000000002, + "loss": 9.604, + "step": 69 + }, + { + "epoch": 0.0007, + "grad_norm": 0.901710386427562, + "learning_rate": 0.00021000000000000004, + "loss": 9.5675, + "step": 70 + }, + { + "epoch": 0.00071, + "grad_norm": 0.8913815692585527, + "learning_rate": 0.00021299999999999997, + "loss": 9.5403, + "step": 71 + }, + { + "epoch": 0.00072, + "grad_norm": 0.8947982003050186, + "learning_rate": 0.000216, + "loss": 9.5138, + "step": 72 + }, + { + "epoch": 0.00073, + "grad_norm": 0.8936046055705469, + "learning_rate": 0.00021899999999999998, + "loss": 9.4802, + "step": 73 + }, + { + "epoch": 0.00074, + "grad_norm": 0.894403551075387, + "learning_rate": 0.000222, + "loss": 9.4539, + "step": 74 + }, + { + "epoch": 0.00075, + "grad_norm": 0.8897139679621429, + "learning_rate": 0.000225, + "loss": 9.4187, + "step": 75 + }, + { + "epoch": 0.00076, + "grad_norm": 0.89755782108852, + "learning_rate": 0.000228, + "loss": 9.3929, + "step": 76 + }, + { + "epoch": 0.00077, + "grad_norm": 0.8917412779828411, + "learning_rate": 0.000231, + "loss": 9.3649, + "step": 77 + }, + { + "epoch": 0.00078, + "grad_norm": 0.886930008239094, + "learning_rate": 0.000234, + "loss": 9.3351, + "step": 78 + }, + { + "epoch": 0.00079, + "grad_norm": 0.8893115023720741, + "learning_rate": 0.00023700000000000001, + "loss": 9.2992, + "step": 79 + }, + { + "epoch": 0.0008, + "grad_norm": 0.8917772276535281, + "learning_rate": 0.00024, + "loss": 9.2632, + "step": 80 + }, + { + "epoch": 0.00081, + "grad_norm": 0.8971231461853929, + "learning_rate": 0.00024300000000000002, + "loss": 9.2316, + "step": 81 + }, + { + "epoch": 0.00082, + "grad_norm": 0.9071155901294882, + "learning_rate": 0.000246, + "loss": 9.206, + "step": 82 + }, + { + "epoch": 0.00083, + "grad_norm": 0.8963243651316662, + "learning_rate": 0.00024900000000000004, + "loss": 9.1708, + "step": 83 + }, + { + "epoch": 0.00084, + "grad_norm": 0.9002519637093493, + "learning_rate": 0.000252, + "loss": 9.1395, + "step": 84 + }, + { + "epoch": 0.00085, + "grad_norm": 0.8962870771351267, + "learning_rate": 0.000255, + "loss": 9.1159, + "step": 85 + }, + { + "epoch": 0.00086, + "grad_norm": 0.8973951423301171, + "learning_rate": 0.000258, + "loss": 9.0799, + "step": 86 + }, + { + "epoch": 0.00087, + "grad_norm": 0.8894399774297843, + "learning_rate": 0.000261, + "loss": 9.0571, + "step": 87 + }, + { + "epoch": 0.00088, + "grad_norm": 0.8954974565265822, + "learning_rate": 0.00026399999999999997, + "loss": 9.0211, + "step": 88 + }, + { + "epoch": 0.00089, + "grad_norm": 0.8903887799592864, + "learning_rate": 0.000267, + "loss": 8.9943, + "step": 89 + }, + { + "epoch": 0.0009, + "grad_norm": 0.8896831425922581, + "learning_rate": 0.00027, + "loss": 8.9666, + "step": 90 + }, + { + "epoch": 0.00091, + "grad_norm": 0.8841935030693385, + "learning_rate": 0.000273, + "loss": 8.9365, + "step": 91 + }, + { + "epoch": 0.00092, + "grad_norm": 0.8862593694142118, + "learning_rate": 0.000276, + "loss": 8.9052, + "step": 92 + }, + { + "epoch": 0.00093, + "grad_norm": 0.8881041222250594, + "learning_rate": 0.000279, + "loss": 8.8752, + "step": 93 + }, + { + "epoch": 0.00094, + "grad_norm": 0.8868931156100198, + "learning_rate": 0.000282, + "loss": 8.8494, + "step": 94 + }, + { + "epoch": 0.00095, + "grad_norm": 0.8826333164427848, + "learning_rate": 0.000285, + "loss": 8.8203, + "step": 95 + }, + { + "epoch": 0.00096, + "grad_norm": 0.8910955494857569, + "learning_rate": 0.000288, + "loss": 8.7905, + "step": 96 + }, + { + "epoch": 0.00097, + "grad_norm": 0.8809279978723125, + "learning_rate": 0.000291, + "loss": 8.7668, + "step": 97 + }, + { + "epoch": 0.00098, + "grad_norm": 0.8829202734417477, + "learning_rate": 0.000294, + "loss": 8.73, + "step": 98 + }, + { + "epoch": 0.00099, + "grad_norm": 0.8858154061322314, + "learning_rate": 0.000297, + "loss": 8.7072, + "step": 99 + }, + { + "epoch": 0.001, + "grad_norm": 0.8861735275161908, + "learning_rate": 0.00030000000000000003, + "loss": 8.6806, + "step": 100 + }, + { + "epoch": 0.00101, + "grad_norm": 0.8886468466177152, + "learning_rate": 0.00030300000000000005, + "loss": 8.6455, + "step": 101 + }, + { + "epoch": 0.00102, + "grad_norm": 0.8755547614914917, + "learning_rate": 0.000306, + "loss": 8.625, + "step": 102 + }, + { + "epoch": 0.00103, + "grad_norm": 0.879237266457317, + "learning_rate": 0.000309, + "loss": 8.6003, + "step": 103 + }, + { + "epoch": 0.00104, + "grad_norm": 0.8697854479956653, + "learning_rate": 0.000312, + "loss": 8.5786, + "step": 104 + }, + { + "epoch": 0.00105, + "grad_norm": 0.8733949043334459, + "learning_rate": 0.000315, + "loss": 8.552, + "step": 105 + }, + { + "epoch": 0.00106, + "grad_norm": 0.8706148334292045, + "learning_rate": 0.000318, + "loss": 8.5224, + "step": 106 + }, + { + "epoch": 0.00107, + "grad_norm": 0.8653683244965015, + "learning_rate": 0.000321, + "loss": 8.4981, + "step": 107 + }, + { + "epoch": 0.00108, + "grad_norm": 0.8737239863866451, + "learning_rate": 0.000324, + "loss": 8.4698, + "step": 108 + }, + { + "epoch": 0.00109, + "grad_norm": 0.8706512729216435, + "learning_rate": 0.000327, + "loss": 8.4501, + "step": 109 + }, + { + "epoch": 0.0011, + "grad_norm": 0.8842127152624679, + "learning_rate": 0.00033, + "loss": 8.4274, + "step": 110 + }, + { + "epoch": 0.00111, + "grad_norm": 0.9007754832304464, + "learning_rate": 0.000333, + "loss": 8.3985, + "step": 111 + }, + { + "epoch": 0.00112, + "grad_norm": 0.9159855921530741, + "learning_rate": 0.00033600000000000004, + "loss": 8.3784, + "step": 112 + }, + { + "epoch": 0.00113, + "grad_norm": 0.9063278036144603, + "learning_rate": 0.000339, + "loss": 8.3391, + "step": 113 + }, + { + "epoch": 0.00114, + "grad_norm": 0.8437820836704115, + "learning_rate": 0.000342, + "loss": 8.3286, + "step": 114 + }, + { + "epoch": 0.00115, + "grad_norm": 0.8612821674982505, + "learning_rate": 0.00034500000000000004, + "loss": 8.3022, + "step": 115 + }, + { + "epoch": 0.00116, + "grad_norm": 0.8581012057508914, + "learning_rate": 0.000348, + "loss": 8.2786, + "step": 116 + }, + { + "epoch": 0.00117, + "grad_norm": 0.834681162463853, + "learning_rate": 0.000351, + "loss": 8.2472, + "step": 117 + }, + { + "epoch": 0.00118, + "grad_norm": 0.8526508674143746, + "learning_rate": 0.000354, + "loss": 8.2312, + "step": 118 + }, + { + "epoch": 0.00119, + "grad_norm": 0.8471842679679056, + "learning_rate": 0.000357, + "loss": 8.2186, + "step": 119 + }, + { + "epoch": 0.0012, + "grad_norm": 0.8238029079166322, + "learning_rate": 0.00035999999999999997, + "loss": 8.1917, + "step": 120 + }, + { + "epoch": 0.00121, + "grad_norm": 0.832770075662114, + "learning_rate": 0.000363, + "loss": 8.1641, + "step": 121 + }, + { + "epoch": 0.00122, + "grad_norm": 0.8176689288160716, + "learning_rate": 0.000366, + "loss": 8.1471, + "step": 122 + }, + { + "epoch": 0.00123, + "grad_norm": 0.8121975389696077, + "learning_rate": 0.000369, + "loss": 8.1243, + "step": 123 + }, + { + "epoch": 0.00124, + "grad_norm": 0.8287555396618358, + "learning_rate": 0.000372, + "loss": 8.1037, + "step": 124 + }, + { + "epoch": 0.00125, + "grad_norm": 0.8387006564379252, + "learning_rate": 0.000375, + "loss": 8.0802, + "step": 125 + }, + { + "epoch": 0.00126, + "grad_norm": 0.7985855201767323, + "learning_rate": 0.000378, + "loss": 8.0672, + "step": 126 + }, + { + "epoch": 0.00127, + "grad_norm": 0.7830434817433392, + "learning_rate": 0.000381, + "loss": 8.0437, + "step": 127 + }, + { + "epoch": 0.00128, + "grad_norm": 0.8466276209824322, + "learning_rate": 0.000384, + "loss": 8.0264, + "step": 128 + }, + { + "epoch": 0.00129, + "grad_norm": 1.1259201196462498, + "learning_rate": 0.00038700000000000003, + "loss": 8.0199, + "step": 129 + }, + { + "epoch": 0.0013, + "grad_norm": 1.356061683657538, + "learning_rate": 0.00039000000000000005, + "loss": 8.0054, + "step": 130 + }, + { + "epoch": 0.00131, + "grad_norm": 0.781068809712636, + "learning_rate": 0.000393, + "loss": 7.9576, + "step": 131 + }, + { + "epoch": 0.00132, + "grad_norm": 1.0367188815921278, + "learning_rate": 0.00039600000000000003, + "loss": 7.9591, + "step": 132 + }, + { + "epoch": 0.00133, + "grad_norm": 0.7841067594214959, + "learning_rate": 0.00039900000000000005, + "loss": 7.9293, + "step": 133 + }, + { + "epoch": 0.00134, + "grad_norm": 0.8594113418007541, + "learning_rate": 0.000402, + "loss": 7.9217, + "step": 134 + }, + { + "epoch": 0.00135, + "grad_norm": 0.7468719464596628, + "learning_rate": 0.00040500000000000003, + "loss": 7.8883, + "step": 135 + }, + { + "epoch": 0.00136, + "grad_norm": 0.8415432392576198, + "learning_rate": 0.00040800000000000005, + "loss": 7.8771, + "step": 136 + }, + { + "epoch": 0.00137, + "grad_norm": 0.7386877288273068, + "learning_rate": 0.000411, + "loss": 7.8542, + "step": 137 + }, + { + "epoch": 0.00138, + "grad_norm": 0.7450176106214967, + "learning_rate": 0.00041400000000000003, + "loss": 7.8322, + "step": 138 + }, + { + "epoch": 0.00139, + "grad_norm": 0.7424698225823185, + "learning_rate": 0.00041700000000000005, + "loss": 7.8197, + "step": 139 + }, + { + "epoch": 0.0014, + "grad_norm": 0.7268306219948636, + "learning_rate": 0.00042000000000000007, + "loss": 7.8018, + "step": 140 + }, + { + "epoch": 0.00141, + "grad_norm": 0.7115037341291065, + "learning_rate": 0.000423, + "loss": 7.7905, + "step": 141 + }, + { + "epoch": 0.00142, + "grad_norm": 0.6725409058271569, + "learning_rate": 0.00042599999999999995, + "loss": 7.7772, + "step": 142 + }, + { + "epoch": 0.00143, + "grad_norm": 0.6830809074405504, + "learning_rate": 0.00042899999999999997, + "loss": 7.7496, + "step": 143 + }, + { + "epoch": 0.00144, + "grad_norm": 0.6791849969278475, + "learning_rate": 0.000432, + "loss": 7.7318, + "step": 144 + }, + { + "epoch": 0.00145, + "grad_norm": 0.6462817248800249, + "learning_rate": 0.000435, + "loss": 7.7251, + "step": 145 + }, + { + "epoch": 0.00146, + "grad_norm": 0.6695930969912641, + "learning_rate": 0.00043799999999999997, + "loss": 7.6893, + "step": 146 + }, + { + "epoch": 0.00147, + "grad_norm": 0.7012896651032599, + "learning_rate": 0.000441, + "loss": 7.6817, + "step": 147 + }, + { + "epoch": 0.00148, + "grad_norm": 0.7755580877429182, + "learning_rate": 0.000444, + "loss": 7.6698, + "step": 148 + }, + { + "epoch": 0.00149, + "grad_norm": 1.088947674236225, + "learning_rate": 0.00044699999999999997, + "loss": 7.6713, + "step": 149 + }, + { + "epoch": 0.0015, + "grad_norm": 0.9396541473912592, + "learning_rate": 0.00045, + "loss": 7.6436, + "step": 150 + }, + { + "epoch": 0.00151, + "grad_norm": 0.640217037835256, + "learning_rate": 0.000453, + "loss": 7.6133, + "step": 151 + }, + { + "epoch": 0.00152, + "grad_norm": 0.9098496631236208, + "learning_rate": 0.000456, + "loss": 7.6149, + "step": 152 + }, + { + "epoch": 0.00153, + "grad_norm": 0.6175759444520236, + "learning_rate": 0.000459, + "loss": 7.5962, + "step": 153 + }, + { + "epoch": 0.00154, + "grad_norm": 0.6884616968083866, + "learning_rate": 0.000462, + "loss": 7.5772, + "step": 154 + }, + { + "epoch": 0.00155, + "grad_norm": 0.6360938051960316, + "learning_rate": 0.000465, + "loss": 7.5598, + "step": 155 + }, + { + "epoch": 0.00156, + "grad_norm": 0.6937352973699618, + "learning_rate": 0.000468, + "loss": 7.5366, + "step": 156 + }, + { + "epoch": 0.00157, + "grad_norm": 0.6334556877551312, + "learning_rate": 0.000471, + "loss": 7.53, + "step": 157 + }, + { + "epoch": 0.00158, + "grad_norm": 0.9707170671679272, + "learning_rate": 0.00047400000000000003, + "loss": 7.5153, + "step": 158 + }, + { + "epoch": 0.00159, + "grad_norm": 1.0240750882994218, + "learning_rate": 0.000477, + "loss": 7.5076, + "step": 159 + }, + { + "epoch": 0.0016, + "grad_norm": 0.9322894026235434, + "learning_rate": 0.00048, + "loss": 7.4836, + "step": 160 + }, + { + "epoch": 0.00161, + "grad_norm": 0.5279037898518898, + "learning_rate": 0.00048300000000000003, + "loss": 7.4527, + "step": 161 + }, + { + "epoch": 0.00162, + "grad_norm": 0.687356662308957, + "learning_rate": 0.00048600000000000005, + "loss": 7.4601, + "step": 162 + }, + { + "epoch": 0.00163, + "grad_norm": 0.5623951705594973, + "learning_rate": 0.0004890000000000001, + "loss": 7.4388, + "step": 163 + }, + { + "epoch": 0.00164, + "grad_norm": 0.5581337114560441, + "learning_rate": 0.000492, + "loss": 7.4399, + "step": 164 + }, + { + "epoch": 0.00165, + "grad_norm": 0.5516159301488641, + "learning_rate": 0.000495, + "loss": 7.4126, + "step": 165 + }, + { + "epoch": 0.00166, + "grad_norm": 0.5242244578051735, + "learning_rate": 0.0004980000000000001, + "loss": 7.3876, + "step": 166 + }, + { + "epoch": 0.00167, + "grad_norm": 0.45997959649003123, + "learning_rate": 0.000501, + "loss": 7.3779, + "step": 167 + }, + { + "epoch": 0.00168, + "grad_norm": 0.5436289820614866, + "learning_rate": 0.000504, + "loss": 7.3569, + "step": 168 + }, + { + "epoch": 0.00169, + "grad_norm": 0.4983067598465849, + "learning_rate": 0.0005070000000000001, + "loss": 7.3495, + "step": 169 + }, + { + "epoch": 0.0017, + "grad_norm": 0.4402852485923817, + "learning_rate": 0.00051, + "loss": 7.3316, + "step": 170 + }, + { + "epoch": 0.00171, + "grad_norm": 0.5221521396945228, + "learning_rate": 0.000513, + "loss": 7.3138, + "step": 171 + }, + { + "epoch": 0.00172, + "grad_norm": 0.45544347662440743, + "learning_rate": 0.000516, + "loss": 7.3129, + "step": 172 + }, + { + "epoch": 0.00173, + "grad_norm": 0.4745602833877857, + "learning_rate": 0.0005189999999999999, + "loss": 7.2961, + "step": 173 + }, + { + "epoch": 0.00174, + "grad_norm": 0.5121580158942174, + "learning_rate": 0.000522, + "loss": 7.311, + "step": 174 + }, + { + "epoch": 0.00175, + "grad_norm": 0.680505499537256, + "learning_rate": 0.000525, + "loss": 7.2769, + "step": 175 + }, + { + "epoch": 0.00176, + "grad_norm": 0.7210959926983863, + "learning_rate": 0.0005279999999999999, + "loss": 7.2549, + "step": 176 + }, + { + "epoch": 0.00177, + "grad_norm": 0.7510526045152774, + "learning_rate": 0.000531, + "loss": 7.2572, + "step": 177 + }, + { + "epoch": 0.00178, + "grad_norm": 0.7343359791017195, + "learning_rate": 0.000534, + "loss": 7.2506, + "step": 178 + }, + { + "epoch": 0.00179, + "grad_norm": 0.6185909230661502, + "learning_rate": 0.000537, + "loss": 7.2371, + "step": 179 + }, + { + "epoch": 0.0018, + "grad_norm": 0.42887581745789505, + "learning_rate": 0.00054, + "loss": 7.2041, + "step": 180 + }, + { + "epoch": 0.00181, + "grad_norm": 0.5359194273155663, + "learning_rate": 0.000543, + "loss": 7.1961, + "step": 181 + }, + { + "epoch": 0.00182, + "grad_norm": 0.4369454993609359, + "learning_rate": 0.000546, + "loss": 7.188, + "step": 182 + }, + { + "epoch": 0.00183, + "grad_norm": 0.41001210068633426, + "learning_rate": 0.000549, + "loss": 7.1769, + "step": 183 + }, + { + "epoch": 0.00184, + "grad_norm": 0.4396906853721559, + "learning_rate": 0.000552, + "loss": 7.1646, + "step": 184 + }, + { + "epoch": 0.00185, + "grad_norm": 0.4311329351383538, + "learning_rate": 0.000555, + "loss": 7.1403, + "step": 185 + }, + { + "epoch": 0.00186, + "grad_norm": 0.44013221474943204, + "learning_rate": 0.000558, + "loss": 7.1495, + "step": 186 + }, + { + "epoch": 0.00187, + "grad_norm": 0.42719641993731927, + "learning_rate": 0.000561, + "loss": 7.1244, + "step": 187 + }, + { + "epoch": 0.00188, + "grad_norm": 0.39192004433941763, + "learning_rate": 0.000564, + "loss": 7.1022, + "step": 188 + }, + { + "epoch": 0.00189, + "grad_norm": 0.5474017259570552, + "learning_rate": 0.000567, + "loss": 7.1002, + "step": 189 + }, + { + "epoch": 0.0019, + "grad_norm": 0.5320208949420774, + "learning_rate": 0.00057, + "loss": 7.0824, + "step": 190 + }, + { + "epoch": 0.00191, + "grad_norm": 0.7661733453917681, + "learning_rate": 0.000573, + "loss": 7.0764, + "step": 191 + }, + { + "epoch": 0.00192, + "grad_norm": 0.9858385620454592, + "learning_rate": 0.000576, + "loss": 7.0775, + "step": 192 + }, + { + "epoch": 0.00193, + "grad_norm": 0.9591489873604585, + "learning_rate": 0.000579, + "loss": 7.0725, + "step": 193 + }, + { + "epoch": 0.00194, + "grad_norm": 0.8774582305545432, + "learning_rate": 0.000582, + "loss": 7.0715, + "step": 194 + }, + { + "epoch": 0.00195, + "grad_norm": 0.9015156284206135, + "learning_rate": 0.000585, + "loss": 7.0487, + "step": 195 + }, + { + "epoch": 0.00196, + "grad_norm": 0.8000421121121074, + "learning_rate": 0.000588, + "loss": 7.0339, + "step": 196 + }, + { + "epoch": 0.00197, + "grad_norm": 0.8468645119231468, + "learning_rate": 0.000591, + "loss": 7.0329, + "step": 197 + }, + { + "epoch": 0.00198, + "grad_norm": 0.6924161058762034, + "learning_rate": 0.000594, + "loss": 7.0197, + "step": 198 + }, + { + "epoch": 0.00199, + "grad_norm": 0.5671884633245193, + "learning_rate": 0.0005970000000000001, + "loss": 6.9992, + "step": 199 + }, + { + "epoch": 0.002, + "grad_norm": 0.5557515599131739, + "learning_rate": 0.0006000000000000001, + "loss": 6.9982, + "step": 200 + }, + { + "epoch": 0.00201, + "grad_norm": 0.4619645912916116, + "learning_rate": 0.000603, + "loss": 6.9694, + "step": 201 + }, + { + "epoch": 0.00202, + "grad_norm": 0.4929222182059183, + "learning_rate": 0.0006060000000000001, + "loss": 6.9601, + "step": 202 + }, + { + "epoch": 0.00203, + "grad_norm": 0.37702087689736097, + "learning_rate": 0.0006090000000000001, + "loss": 6.9599, + "step": 203 + }, + { + "epoch": 0.00204, + "grad_norm": 0.45518892473332057, + "learning_rate": 0.000612, + "loss": 6.9476, + "step": 204 + }, + { + "epoch": 0.00205, + "grad_norm": 0.384809834745988, + "learning_rate": 0.000615, + "loss": 6.9322, + "step": 205 + }, + { + "epoch": 0.00206, + "grad_norm": 0.38976835520829006, + "learning_rate": 0.000618, + "loss": 6.9255, + "step": 206 + }, + { + "epoch": 0.00207, + "grad_norm": 0.4375749283114229, + "learning_rate": 0.000621, + "loss": 6.9166, + "step": 207 + }, + { + "epoch": 0.00208, + "grad_norm": 0.6317927736115376, + "learning_rate": 0.000624, + "loss": 6.9157, + "step": 208 + }, + { + "epoch": 0.00209, + "grad_norm": 0.7772617938427908, + "learning_rate": 0.000627, + "loss": 6.9007, + "step": 209 + }, + { + "epoch": 0.0021, + "grad_norm": 1.2298132092555871, + "learning_rate": 0.00063, + "loss": 6.9015, + "step": 210 + }, + { + "epoch": 0.00211, + "grad_norm": 0.7435502579532725, + "learning_rate": 0.000633, + "loss": 6.8883, + "step": 211 + }, + { + "epoch": 0.00212, + "grad_norm": 0.3794792840193541, + "learning_rate": 0.000636, + "loss": 6.8693, + "step": 212 + }, + { + "epoch": 0.00213, + "grad_norm": 0.688114698891817, + "learning_rate": 0.000639, + "loss": 6.8589, + "step": 213 + }, + { + "epoch": 0.00214, + "grad_norm": 0.49868168831557785, + "learning_rate": 0.000642, + "loss": 6.8504, + "step": 214 + }, + { + "epoch": 0.00215, + "grad_norm": 0.6292678113208914, + "learning_rate": 0.000645, + "loss": 6.8506, + "step": 215 + }, + { + "epoch": 0.00216, + "grad_norm": 0.3673676300147008, + "learning_rate": 0.000648, + "loss": 6.8383, + "step": 216 + }, + { + "epoch": 0.00217, + "grad_norm": 0.4511278951279821, + "learning_rate": 0.000651, + "loss": 6.8131, + "step": 217 + }, + { + "epoch": 0.00218, + "grad_norm": 0.3783399316849399, + "learning_rate": 0.000654, + "loss": 6.8146, + "step": 218 + }, + { + "epoch": 0.00219, + "grad_norm": 0.3651267362185199, + "learning_rate": 0.000657, + "loss": 6.7867, + "step": 219 + }, + { + "epoch": 0.0022, + "grad_norm": 0.4708567379601776, + "learning_rate": 0.00066, + "loss": 6.8074, + "step": 220 + }, + { + "epoch": 0.00221, + "grad_norm": 0.5218809374007617, + "learning_rate": 0.0006630000000000001, + "loss": 6.7837, + "step": 221 + }, + { + "epoch": 0.00222, + "grad_norm": 0.7518789100021657, + "learning_rate": 0.000666, + "loss": 6.7774, + "step": 222 + }, + { + "epoch": 0.00223, + "grad_norm": 0.9964198530393009, + "learning_rate": 0.000669, + "loss": 6.7824, + "step": 223 + }, + { + "epoch": 0.00224, + "grad_norm": 0.807522133159797, + "learning_rate": 0.0006720000000000001, + "loss": 6.7748, + "step": 224 + }, + { + "epoch": 0.00225, + "grad_norm": 0.6803785939854445, + "learning_rate": 0.000675, + "loss": 6.7563, + "step": 225 + }, + { + "epoch": 0.00226, + "grad_norm": 1.0762485305388094, + "learning_rate": 0.000678, + "loss": 6.7681, + "step": 226 + }, + { + "epoch": 0.00227, + "grad_norm": 0.9129777590140102, + "learning_rate": 0.0006810000000000001, + "loss": 6.734, + "step": 227 + }, + { + "epoch": 0.00228, + "grad_norm": 1.299087365220233, + "learning_rate": 0.000684, + "loss": 6.7517, + "step": 228 + }, + { + "epoch": 0.00229, + "grad_norm": 0.7392729796935101, + "learning_rate": 0.000687, + "loss": 6.7331, + "step": 229 + }, + { + "epoch": 0.0023, + "grad_norm": 0.6158971034672628, + "learning_rate": 0.0006900000000000001, + "loss": 6.7089, + "step": 230 + }, + { + "epoch": 0.00231, + "grad_norm": 0.7016695647497186, + "learning_rate": 0.000693, + "loss": 6.6956, + "step": 231 + }, + { + "epoch": 0.00232, + "grad_norm": 0.6434185638703606, + "learning_rate": 0.000696, + "loss": 6.7082, + "step": 232 + }, + { + "epoch": 0.00233, + "grad_norm": 0.6044879418578446, + "learning_rate": 0.0006990000000000001, + "loss": 6.6921, + "step": 233 + }, + { + "epoch": 0.00234, + "grad_norm": 0.4480061456613071, + "learning_rate": 0.000702, + "loss": 6.679, + "step": 234 + }, + { + "epoch": 0.00235, + "grad_norm": 0.4492075259819563, + "learning_rate": 0.000705, + "loss": 6.6661, + "step": 235 + }, + { + "epoch": 0.00236, + "grad_norm": 0.3889271878969786, + "learning_rate": 0.000708, + "loss": 6.6458, + "step": 236 + }, + { + "epoch": 0.00237, + "grad_norm": 0.41607485698419117, + "learning_rate": 0.0007109999999999999, + "loss": 6.6575, + "step": 237 + }, + { + "epoch": 0.00238, + "grad_norm": 0.3515958046168432, + "learning_rate": 0.000714, + "loss": 6.6432, + "step": 238 + }, + { + "epoch": 0.00239, + "grad_norm": 0.4049983983930021, + "learning_rate": 0.000717, + "loss": 6.6274, + "step": 239 + }, + { + "epoch": 0.0024, + "grad_norm": 0.3247748587680522, + "learning_rate": 0.0007199999999999999, + "loss": 6.6414, + "step": 240 + }, + { + "epoch": 0.00241, + "grad_norm": 0.36262136655648425, + "learning_rate": 0.000723, + "loss": 6.6267, + "step": 241 + }, + { + "epoch": 0.00242, + "grad_norm": 0.3238608381517686, + "learning_rate": 0.000726, + "loss": 6.598, + "step": 242 + }, + { + "epoch": 0.00243, + "grad_norm": 0.31681258130419926, + "learning_rate": 0.000729, + "loss": 6.5877, + "step": 243 + }, + { + "epoch": 0.00244, + "grad_norm": 0.3616370322868285, + "learning_rate": 0.000732, + "loss": 6.5911, + "step": 244 + }, + { + "epoch": 0.00245, + "grad_norm": 0.4634076592576489, + "learning_rate": 0.000735, + "loss": 6.5815, + "step": 245 + }, + { + "epoch": 0.00246, + "grad_norm": 0.7486687333799987, + "learning_rate": 0.000738, + "loss": 6.5693, + "step": 246 + }, + { + "epoch": 0.00247, + "grad_norm": 1.662590620725261, + "learning_rate": 0.000741, + "loss": 6.5942, + "step": 247 + }, + { + "epoch": 0.00248, + "grad_norm": 0.7962452736060305, + "learning_rate": 0.000744, + "loss": 6.5799, + "step": 248 + }, + { + "epoch": 0.00249, + "grad_norm": 1.091558045553116, + "learning_rate": 0.000747, + "loss": 6.5681, + "step": 249 + }, + { + "epoch": 0.0025, + "grad_norm": 1.1566118279124307, + "learning_rate": 0.00075, + "loss": 6.5821, + "step": 250 + }, + { + "epoch": 0.00251, + "grad_norm": 0.7784764288643516, + "learning_rate": 0.000753, + "loss": 6.5311, + "step": 251 + }, + { + "epoch": 0.00252, + "grad_norm": 0.8327004278614876, + "learning_rate": 0.000756, + "loss": 6.5476, + "step": 252 + }, + { + "epoch": 0.00253, + "grad_norm": 0.8184368732684698, + "learning_rate": 0.000759, + "loss": 6.5327, + "step": 253 + }, + { + "epoch": 0.00254, + "grad_norm": 1.1486609061260633, + "learning_rate": 0.000762, + "loss": 6.541, + "step": 254 + }, + { + "epoch": 0.00255, + "grad_norm": 0.8085196503687528, + "learning_rate": 0.0007650000000000001, + "loss": 6.533, + "step": 255 + }, + { + "epoch": 0.00256, + "grad_norm": 0.6227788989369496, + "learning_rate": 0.000768, + "loss": 6.5039, + "step": 256 + }, + { + "epoch": 0.00257, + "grad_norm": 0.7880922816831363, + "learning_rate": 0.000771, + "loss": 6.5163, + "step": 257 + }, + { + "epoch": 0.00258, + "grad_norm": 0.8888953354311911, + "learning_rate": 0.0007740000000000001, + "loss": 6.497, + "step": 258 + }, + { + "epoch": 0.00259, + "grad_norm": 0.6556759438280303, + "learning_rate": 0.000777, + "loss": 6.491, + "step": 259 + }, + { + "epoch": 0.0026, + "grad_norm": 0.5556045743581063, + "learning_rate": 0.0007800000000000001, + "loss": 6.4736, + "step": 260 + }, + { + "epoch": 0.00261, + "grad_norm": 0.5757853625150707, + "learning_rate": 0.0007830000000000001, + "loss": 6.4835, + "step": 261 + }, + { + "epoch": 0.00262, + "grad_norm": 0.4702991355160636, + "learning_rate": 0.000786, + "loss": 6.4605, + "step": 262 + }, + { + "epoch": 0.00263, + "grad_norm": 0.5236270398277312, + "learning_rate": 0.0007890000000000001, + "loss": 6.4668, + "step": 263 + }, + { + "epoch": 0.00264, + "grad_norm": 0.57918798311576, + "learning_rate": 0.0007920000000000001, + "loss": 6.4494, + "step": 264 + }, + { + "epoch": 0.00265, + "grad_norm": 0.48386144230733374, + "learning_rate": 0.000795, + "loss": 6.4394, + "step": 265 + }, + { + "epoch": 0.00266, + "grad_norm": 0.44170660781498655, + "learning_rate": 0.0007980000000000001, + "loss": 6.4372, + "step": 266 + }, + { + "epoch": 0.00267, + "grad_norm": 0.35002993788441544, + "learning_rate": 0.0008010000000000001, + "loss": 6.4273, + "step": 267 + }, + { + "epoch": 0.00268, + "grad_norm": 0.49450621569184094, + "learning_rate": 0.000804, + "loss": 6.426, + "step": 268 + }, + { + "epoch": 0.00269, + "grad_norm": 0.5917094821390496, + "learning_rate": 0.0008070000000000001, + "loss": 6.4104, + "step": 269 + }, + { + "epoch": 0.0027, + "grad_norm": 0.8610573273382283, + "learning_rate": 0.0008100000000000001, + "loss": 6.4165, + "step": 270 + }, + { + "epoch": 0.00271, + "grad_norm": 0.9687752227819599, + "learning_rate": 0.000813, + "loss": 6.4342, + "step": 271 + }, + { + "epoch": 0.00272, + "grad_norm": 0.8252297813091672, + "learning_rate": 0.0008160000000000001, + "loss": 6.4109, + "step": 272 + }, + { + "epoch": 0.00273, + "grad_norm": 1.0509340869742279, + "learning_rate": 0.0008190000000000001, + "loss": 6.4091, + "step": 273 + }, + { + "epoch": 0.00274, + "grad_norm": 1.2492890299831212, + "learning_rate": 0.000822, + "loss": 6.3984, + "step": 274 + }, + { + "epoch": 0.00275, + "grad_norm": 0.7449833128353471, + "learning_rate": 0.0008250000000000001, + "loss": 6.3937, + "step": 275 + }, + { + "epoch": 0.00276, + "grad_norm": 0.650217944416595, + "learning_rate": 0.0008280000000000001, + "loss": 6.383, + "step": 276 + }, + { + "epoch": 0.00277, + "grad_norm": 0.6897831829588578, + "learning_rate": 0.0008310000000000001, + "loss": 6.3791, + "step": 277 + }, + { + "epoch": 0.00278, + "grad_norm": 0.6845751052836879, + "learning_rate": 0.0008340000000000001, + "loss": 6.3625, + "step": 278 + }, + { + "epoch": 0.00279, + "grad_norm": 0.6895680336476154, + "learning_rate": 0.0008370000000000001, + "loss": 6.3607, + "step": 279 + }, + { + "epoch": 0.0028, + "grad_norm": 0.7962805776833526, + "learning_rate": 0.0008400000000000001, + "loss": 6.3497, + "step": 280 + }, + { + "epoch": 0.00281, + "grad_norm": 0.7281805890288613, + "learning_rate": 0.0008430000000000001, + "loss": 6.3358, + "step": 281 + }, + { + "epoch": 0.00282, + "grad_norm": 0.6077351466654652, + "learning_rate": 0.000846, + "loss": 6.3343, + "step": 282 + }, + { + "epoch": 0.00283, + "grad_norm": 0.7021452115418912, + "learning_rate": 0.0008489999999999999, + "loss": 6.3268, + "step": 283 + }, + { + "epoch": 0.00284, + "grad_norm": 0.8067589645945545, + "learning_rate": 0.0008519999999999999, + "loss": 6.3217, + "step": 284 + }, + { + "epoch": 0.00285, + "grad_norm": 0.7238661685202905, + "learning_rate": 0.000855, + "loss": 6.3229, + "step": 285 + }, + { + "epoch": 0.00286, + "grad_norm": 0.6566110557704579, + "learning_rate": 0.0008579999999999999, + "loss": 6.3002, + "step": 286 + }, + { + "epoch": 0.00287, + "grad_norm": 0.5448339024770075, + "learning_rate": 0.000861, + "loss": 6.3016, + "step": 287 + }, + { + "epoch": 0.00288, + "grad_norm": 0.4516947530472597, + "learning_rate": 0.000864, + "loss": 6.279, + "step": 288 + }, + { + "epoch": 0.00289, + "grad_norm": 0.5415480649305122, + "learning_rate": 0.0008669999999999999, + "loss": 6.2929, + "step": 289 + }, + { + "epoch": 0.0029, + "grad_norm": 0.5812113854990302, + "learning_rate": 0.00087, + "loss": 6.2743, + "step": 290 + }, + { + "epoch": 0.00291, + "grad_norm": 0.5620937871349196, + "learning_rate": 0.000873, + "loss": 6.2679, + "step": 291 + }, + { + "epoch": 0.00292, + "grad_norm": 0.43362286284353735, + "learning_rate": 0.0008759999999999999, + "loss": 6.2618, + "step": 292 + }, + { + "epoch": 0.00293, + "grad_norm": 0.563953479113573, + "learning_rate": 0.000879, + "loss": 6.2491, + "step": 293 + }, + { + "epoch": 0.00294, + "grad_norm": 0.5784107789505917, + "learning_rate": 0.000882, + "loss": 6.2636, + "step": 294 + }, + { + "epoch": 0.00295, + "grad_norm": 0.5410292555611181, + "learning_rate": 0.0008849999999999999, + "loss": 6.2293, + "step": 295 + }, + { + "epoch": 0.00296, + "grad_norm": 0.5413778994680785, + "learning_rate": 0.000888, + "loss": 6.2428, + "step": 296 + }, + { + "epoch": 0.00297, + "grad_norm": 0.7587824592105622, + "learning_rate": 0.000891, + "loss": 6.2186, + "step": 297 + }, + { + "epoch": 0.00298, + "grad_norm": 1.2390223277612324, + "learning_rate": 0.0008939999999999999, + "loss": 6.2454, + "step": 298 + }, + { + "epoch": 0.00299, + "grad_norm": 1.0051162436627608, + "learning_rate": 0.000897, + "loss": 6.2195, + "step": 299 + }, + { + "epoch": 0.003, + "grad_norm": 0.9301924220055711, + "learning_rate": 0.0009, + "loss": 6.2223, + "step": 300 + }, + { + "epoch": 0.00301, + "grad_norm": 0.9092669241988458, + "learning_rate": 0.0009029999999999999, + "loss": 6.2133, + "step": 301 + }, + { + "epoch": 0.00302, + "grad_norm": 1.0280476917286576, + "learning_rate": 0.000906, + "loss": 6.2045, + "step": 302 + }, + { + "epoch": 0.00303, + "grad_norm": 0.955686909103224, + "learning_rate": 0.000909, + "loss": 6.2159, + "step": 303 + }, + { + "epoch": 0.00304, + "grad_norm": 0.924153033838841, + "learning_rate": 0.000912, + "loss": 6.2068, + "step": 304 + }, + { + "epoch": 0.00305, + "grad_norm": 1.0129569357421315, + "learning_rate": 0.000915, + "loss": 6.2162, + "step": 305 + }, + { + "epoch": 0.00306, + "grad_norm": 0.908196615580651, + "learning_rate": 0.000918, + "loss": 6.1982, + "step": 306 + }, + { + "epoch": 0.00307, + "grad_norm": 0.9412115799039344, + "learning_rate": 0.000921, + "loss": 6.1922, + "step": 307 + }, + { + "epoch": 0.00308, + "grad_norm": 1.0133210251014897, + "learning_rate": 0.000924, + "loss": 6.1801, + "step": 308 + }, + { + "epoch": 0.00309, + "grad_norm": 0.7885110954474147, + "learning_rate": 0.000927, + "loss": 6.1916, + "step": 309 + }, + { + "epoch": 0.0031, + "grad_norm": 0.8502930513548597, + "learning_rate": 0.00093, + "loss": 6.1767, + "step": 310 + }, + { + "epoch": 0.00311, + "grad_norm": 0.8830498260903274, + "learning_rate": 0.000933, + "loss": 6.1706, + "step": 311 + }, + { + "epoch": 0.00312, + "grad_norm": 0.7731682808518218, + "learning_rate": 0.000936, + "loss": 6.1483, + "step": 312 + }, + { + "epoch": 0.00313, + "grad_norm": 0.5034693382195244, + "learning_rate": 0.0009390000000000001, + "loss": 6.1541, + "step": 313 + }, + { + "epoch": 0.00314, + "grad_norm": 0.5944206771214186, + "learning_rate": 0.000942, + "loss": 6.1615, + "step": 314 + }, + { + "epoch": 0.00315, + "grad_norm": 0.6198949865082991, + "learning_rate": 0.000945, + "loss": 6.1382, + "step": 315 + }, + { + "epoch": 0.00316, + "grad_norm": 0.6322301787116348, + "learning_rate": 0.0009480000000000001, + "loss": 6.112, + "step": 316 + }, + { + "epoch": 0.00317, + "grad_norm": 0.6034799354522895, + "learning_rate": 0.000951, + "loss": 6.1294, + "step": 317 + }, + { + "epoch": 0.00318, + "grad_norm": 0.594812791501401, + "learning_rate": 0.000954, + "loss": 6.1243, + "step": 318 + }, + { + "epoch": 0.00319, + "grad_norm": 0.5121880287121718, + "learning_rate": 0.0009570000000000001, + "loss": 6.1104, + "step": 319 + }, + { + "epoch": 0.0032, + "grad_norm": 0.4383869814323231, + "learning_rate": 0.00096, + "loss": 6.1078, + "step": 320 + }, + { + "epoch": 0.00321, + "grad_norm": 0.46253996577569373, + "learning_rate": 0.000963, + "loss": 6.0957, + "step": 321 + }, + { + "epoch": 0.00322, + "grad_norm": 0.5330974668531825, + "learning_rate": 0.0009660000000000001, + "loss": 6.0852, + "step": 322 + }, + { + "epoch": 0.00323, + "grad_norm": 0.4269664457046623, + "learning_rate": 0.000969, + "loss": 6.0805, + "step": 323 + }, + { + "epoch": 0.00324, + "grad_norm": 0.4120363869720864, + "learning_rate": 0.0009720000000000001, + "loss": 6.0833, + "step": 324 + }, + { + "epoch": 0.00325, + "grad_norm": 0.43852524771556284, + "learning_rate": 0.0009750000000000001, + "loss": 6.0494, + "step": 325 + }, + { + "epoch": 0.00326, + "grad_norm": 0.5390182954459198, + "learning_rate": 0.0009780000000000001, + "loss": 6.0675, + "step": 326 + }, + { + "epoch": 0.00327, + "grad_norm": 0.7655676469366879, + "learning_rate": 0.000981, + "loss": 6.0729, + "step": 327 + }, + { + "epoch": 0.00328, + "grad_norm": 1.3095809996152918, + "learning_rate": 0.000984, + "loss": 6.0661, + "step": 328 + }, + { + "epoch": 0.00329, + "grad_norm": 0.9398963728555475, + "learning_rate": 0.000987, + "loss": 6.0593, + "step": 329 + }, + { + "epoch": 0.0033, + "grad_norm": 0.9687365266328093, + "learning_rate": 0.00099, + "loss": 6.0595, + "step": 330 + }, + { + "epoch": 0.00331, + "grad_norm": 1.0572725295338647, + "learning_rate": 0.0009930000000000002, + "loss": 6.0709, + "step": 331 + }, + { + "epoch": 0.00332, + "grad_norm": 0.7804253243852415, + "learning_rate": 0.0009960000000000001, + "loss": 6.0449, + "step": 332 + }, + { + "epoch": 0.00333, + "grad_norm": 0.9477444786898163, + "learning_rate": 0.000999, + "loss": 6.0352, + "step": 333 + }, + { + "epoch": 0.00334, + "grad_norm": 1.119636024931192, + "learning_rate": 0.001002, + "loss": 6.0509, + "step": 334 + }, + { + "epoch": 0.00335, + "grad_norm": 1.0398096390896505, + "learning_rate": 0.001005, + "loss": 6.0495, + "step": 335 + }, + { + "epoch": 0.00336, + "grad_norm": 0.8946811171098662, + "learning_rate": 0.001008, + "loss": 6.0348, + "step": 336 + }, + { + "epoch": 0.00337, + "grad_norm": 1.0240126692017184, + "learning_rate": 0.0010110000000000002, + "loss": 6.0215, + "step": 337 + }, + { + "epoch": 0.00338, + "grad_norm": 1.0245564805630587, + "learning_rate": 0.0010140000000000001, + "loss": 6.0414, + "step": 338 + }, + { + "epoch": 0.00339, + "grad_norm": 0.9606863659347135, + "learning_rate": 0.0010170000000000001, + "loss": 6.0439, + "step": 339 + }, + { + "epoch": 0.0034, + "grad_norm": 0.7549695201896788, + "learning_rate": 0.00102, + "loss": 6.0067, + "step": 340 + }, + { + "epoch": 0.00341, + "grad_norm": 0.7981881225838676, + "learning_rate": 0.001023, + "loss": 5.9935, + "step": 341 + }, + { + "epoch": 0.00342, + "grad_norm": 0.6476806075689499, + "learning_rate": 0.001026, + "loss": 5.9919, + "step": 342 + }, + { + "epoch": 0.00343, + "grad_norm": 0.6180530813345404, + "learning_rate": 0.0010290000000000002, + "loss": 5.9943, + "step": 343 + }, + { + "epoch": 0.00344, + "grad_norm": 0.5904235085071222, + "learning_rate": 0.001032, + "loss": 5.9824, + "step": 344 + }, + { + "epoch": 0.00345, + "grad_norm": 0.5662246103658737, + "learning_rate": 0.001035, + "loss": 5.9736, + "step": 345 + }, + { + "epoch": 0.00346, + "grad_norm": 0.5815647759401746, + "learning_rate": 0.0010379999999999999, + "loss": 5.9787, + "step": 346 + }, + { + "epoch": 0.00347, + "grad_norm": 0.5922585456492798, + "learning_rate": 0.001041, + "loss": 5.9505, + "step": 347 + }, + { + "epoch": 0.00348, + "grad_norm": 0.546770754957902, + "learning_rate": 0.001044, + "loss": 5.9566, + "step": 348 + }, + { + "epoch": 0.00349, + "grad_norm": 0.5259270224752429, + "learning_rate": 0.001047, + "loss": 5.9388, + "step": 349 + }, + { + "epoch": 0.0035, + "grad_norm": 0.5639453872626701, + "learning_rate": 0.00105, + "loss": 5.96, + "step": 350 + }, + { + "epoch": 0.00351, + "grad_norm": 0.562716119705, + "learning_rate": 0.001053, + "loss": 5.9575, + "step": 351 + }, + { + "epoch": 0.00352, + "grad_norm": 0.6276157353012132, + "learning_rate": 0.0010559999999999999, + "loss": 5.9357, + "step": 352 + }, + { + "epoch": 0.00353, + "grad_norm": 0.6899105072430818, + "learning_rate": 0.001059, + "loss": 5.939, + "step": 353 + }, + { + "epoch": 0.00354, + "grad_norm": 0.7414855002893123, + "learning_rate": 0.001062, + "loss": 5.9142, + "step": 354 + }, + { + "epoch": 0.00355, + "grad_norm": 0.8275245559939801, + "learning_rate": 0.001065, + "loss": 5.932, + "step": 355 + }, + { + "epoch": 0.00356, + "grad_norm": 0.8334699202442165, + "learning_rate": 0.001068, + "loss": 5.925, + "step": 356 + }, + { + "epoch": 0.00357, + "grad_norm": 0.733463396370079, + "learning_rate": 0.001071, + "loss": 5.9147, + "step": 357 + }, + { + "epoch": 0.00358, + "grad_norm": 0.5898600387972718, + "learning_rate": 0.001074, + "loss": 5.9022, + "step": 358 + }, + { + "epoch": 0.00359, + "grad_norm": 0.53807120274405, + "learning_rate": 0.001077, + "loss": 5.8933, + "step": 359 + }, + { + "epoch": 0.0036, + "grad_norm": 0.5868526923580266, + "learning_rate": 0.00108, + "loss": 5.8984, + "step": 360 + }, + { + "epoch": 0.00361, + "grad_norm": 0.6561371025439784, + "learning_rate": 0.001083, + "loss": 5.8927, + "step": 361 + }, + { + "epoch": 0.00362, + "grad_norm": 1.003392418604553, + "learning_rate": 0.001086, + "loss": 5.8918, + "step": 362 + }, + { + "epoch": 0.00363, + "grad_norm": 1.2318504963643775, + "learning_rate": 0.001089, + "loss": 5.899, + "step": 363 + }, + { + "epoch": 0.00364, + "grad_norm": 0.7073595215993886, + "learning_rate": 0.001092, + "loss": 5.8956, + "step": 364 + }, + { + "epoch": 0.00365, + "grad_norm": 0.6834408409212124, + "learning_rate": 0.001095, + "loss": 5.8924, + "step": 365 + }, + { + "epoch": 0.00366, + "grad_norm": 0.808409762735137, + "learning_rate": 0.001098, + "loss": 5.8732, + "step": 366 + }, + { + "epoch": 0.00367, + "grad_norm": 1.150363353399194, + "learning_rate": 0.001101, + "loss": 5.8871, + "step": 367 + }, + { + "epoch": 0.00368, + "grad_norm": 1.5759048240116487, + "learning_rate": 0.001104, + "loss": 5.9113, + "step": 368 + }, + { + "epoch": 0.00369, + "grad_norm": 0.6706600035773841, + "learning_rate": 0.001107, + "loss": 5.8826, + "step": 369 + }, + { + "epoch": 0.0037, + "grad_norm": 1.0165372867603926, + "learning_rate": 0.00111, + "loss": 5.8773, + "step": 370 + }, + { + "epoch": 0.00371, + "grad_norm": 1.199271508025637, + "learning_rate": 0.001113, + "loss": 5.8595, + "step": 371 + }, + { + "epoch": 0.00372, + "grad_norm": 0.8890339964058547, + "learning_rate": 0.001116, + "loss": 5.8693, + "step": 372 + }, + { + "epoch": 0.00373, + "grad_norm": 1.0853395614111072, + "learning_rate": 0.001119, + "loss": 5.8674, + "step": 373 + }, + { + "epoch": 0.00374, + "grad_norm": 0.9530307319001724, + "learning_rate": 0.001122, + "loss": 5.8703, + "step": 374 + }, + { + "epoch": 0.00375, + "grad_norm": 0.934750801302772, + "learning_rate": 0.0011250000000000001, + "loss": 5.8719, + "step": 375 + }, + { + "epoch": 0.00376, + "grad_norm": 0.7986123477309184, + "learning_rate": 0.001128, + "loss": 5.863, + "step": 376 + }, + { + "epoch": 0.00377, + "grad_norm": 1.034328210427756, + "learning_rate": 0.001131, + "loss": 5.8644, + "step": 377 + }, + { + "epoch": 0.00378, + "grad_norm": 1.1394107219722458, + "learning_rate": 0.001134, + "loss": 5.8702, + "step": 378 + }, + { + "epoch": 0.00379, + "grad_norm": 0.6778105205682995, + "learning_rate": 0.001137, + "loss": 5.8386, + "step": 379 + }, + { + "epoch": 0.0038, + "grad_norm": 0.7735792074169382, + "learning_rate": 0.00114, + "loss": 5.8619, + "step": 380 + }, + { + "epoch": 0.00381, + "grad_norm": 0.882590402307869, + "learning_rate": 0.0011430000000000001, + "loss": 5.83, + "step": 381 + }, + { + "epoch": 0.00382, + "grad_norm": 0.8860425089624719, + "learning_rate": 0.001146, + "loss": 5.8197, + "step": 382 + }, + { + "epoch": 0.00383, + "grad_norm": 1.0396684860073488, + "learning_rate": 0.001149, + "loss": 5.836, + "step": 383 + }, + { + "epoch": 0.00384, + "grad_norm": 0.9683520339794217, + "learning_rate": 0.001152, + "loss": 5.8386, + "step": 384 + }, + { + "epoch": 0.00385, + "grad_norm": 0.8210443052214895, + "learning_rate": 0.001155, + "loss": 5.8215, + "step": 385 + }, + { + "epoch": 0.00386, + "grad_norm": 1.030920073327714, + "learning_rate": 0.001158, + "loss": 5.8336, + "step": 386 + }, + { + "epoch": 0.00387, + "grad_norm": 0.8032114385497527, + "learning_rate": 0.0011610000000000001, + "loss": 5.8025, + "step": 387 + }, + { + "epoch": 0.00388, + "grad_norm": 0.6803620347459473, + "learning_rate": 0.001164, + "loss": 5.7978, + "step": 388 + }, + { + "epoch": 0.00389, + "grad_norm": 0.6752304208768743, + "learning_rate": 0.001167, + "loss": 5.8027, + "step": 389 + }, + { + "epoch": 0.0039, + "grad_norm": 0.6054825081153106, + "learning_rate": 0.00117, + "loss": 5.7828, + "step": 390 + }, + { + "epoch": 0.00391, + "grad_norm": 0.5156470046541872, + "learning_rate": 0.001173, + "loss": 5.7863, + "step": 391 + }, + { + "epoch": 0.00392, + "grad_norm": 0.49609211852516366, + "learning_rate": 0.001176, + "loss": 5.7945, + "step": 392 + }, + { + "epoch": 0.00393, + "grad_norm": 0.42817727304572534, + "learning_rate": 0.0011790000000000001, + "loss": 5.7664, + "step": 393 + }, + { + "epoch": 0.00394, + "grad_norm": 0.43894767278563757, + "learning_rate": 0.001182, + "loss": 5.7539, + "step": 394 + }, + { + "epoch": 0.00395, + "grad_norm": 0.41850660912289844, + "learning_rate": 0.001185, + "loss": 5.7366, + "step": 395 + }, + { + "epoch": 0.00396, + "grad_norm": 0.41745423249833347, + "learning_rate": 0.001188, + "loss": 5.7516, + "step": 396 + }, + { + "epoch": 0.00397, + "grad_norm": 0.40474070631964676, + "learning_rate": 0.001191, + "loss": 5.7433, + "step": 397 + }, + { + "epoch": 0.00398, + "grad_norm": 0.3677722018443306, + "learning_rate": 0.0011940000000000002, + "loss": 5.7479, + "step": 398 + }, + { + "epoch": 0.00399, + "grad_norm": 0.3480206547108819, + "learning_rate": 0.0011970000000000001, + "loss": 5.7478, + "step": 399 + }, + { + "epoch": 0.004, + "grad_norm": 0.3434828622202681, + "learning_rate": 0.0012000000000000001, + "loss": 5.7345, + "step": 400 + }, + { + "epoch": 0.00401, + "grad_norm": 0.34918136204349326, + "learning_rate": 0.001203, + "loss": 5.7155, + "step": 401 + }, + { + "epoch": 0.00402, + "grad_norm": 0.30554980038341767, + "learning_rate": 0.001206, + "loss": 5.7215, + "step": 402 + }, + { + "epoch": 0.00403, + "grad_norm": 0.38840665384838735, + "learning_rate": 0.001209, + "loss": 5.7184, + "step": 403 + }, + { + "epoch": 0.00404, + "grad_norm": 0.5409513056663879, + "learning_rate": 0.0012120000000000002, + "loss": 5.714, + "step": 404 + }, + { + "epoch": 0.00405, + "grad_norm": 0.845020924848713, + "learning_rate": 0.0012150000000000002, + "loss": 5.7181, + "step": 405 + }, + { + "epoch": 0.00406, + "grad_norm": 1.1911410915070972, + "learning_rate": 0.0012180000000000001, + "loss": 5.7205, + "step": 406 + }, + { + "epoch": 0.00407, + "grad_norm": 0.6577283880630926, + "learning_rate": 0.0012209999999999999, + "loss": 5.6994, + "step": 407 + }, + { + "epoch": 0.00408, + "grad_norm": 0.7475745975098248, + "learning_rate": 0.001224, + "loss": 5.7213, + "step": 408 + }, + { + "epoch": 0.00409, + "grad_norm": 0.9872391126413178, + "learning_rate": 0.001227, + "loss": 5.7126, + "step": 409 + }, + { + "epoch": 0.0041, + "grad_norm": 1.1359034721668335, + "learning_rate": 0.00123, + "loss": 5.7088, + "step": 410 + }, + { + "epoch": 0.00411, + "grad_norm": 1.3596329145222696, + "learning_rate": 0.001233, + "loss": 5.7402, + "step": 411 + }, + { + "epoch": 0.00412, + "grad_norm": 0.7538358474928969, + "learning_rate": 0.001236, + "loss": 5.7066, + "step": 412 + }, + { + "epoch": 0.00413, + "grad_norm": 0.9465320539051596, + "learning_rate": 0.0012389999999999999, + "loss": 5.7197, + "step": 413 + }, + { + "epoch": 0.00414, + "grad_norm": 0.9262933655624658, + "learning_rate": 0.001242, + "loss": 5.6978, + "step": 414 + }, + { + "epoch": 0.00415, + "grad_norm": 1.1564175286146172, + "learning_rate": 0.001245, + "loss": 5.7105, + "step": 415 + }, + { + "epoch": 0.00416, + "grad_norm": 1.1001247072345506, + "learning_rate": 0.001248, + "loss": 5.6929, + "step": 416 + }, + { + "epoch": 0.00417, + "grad_norm": 1.0416153435685582, + "learning_rate": 0.001251, + "loss": 5.7199, + "step": 417 + }, + { + "epoch": 0.00418, + "grad_norm": 1.0281555694116995, + "learning_rate": 0.001254, + "loss": 5.6999, + "step": 418 + }, + { + "epoch": 0.00419, + "grad_norm": 1.1154617103247704, + "learning_rate": 0.0012569999999999999, + "loss": 5.6876, + "step": 419 + }, + { + "epoch": 0.0042, + "grad_norm": 0.9999912825556322, + "learning_rate": 0.00126, + "loss": 5.7045, + "step": 420 + }, + { + "epoch": 0.00421, + "grad_norm": 0.9729638313238949, + "learning_rate": 0.001263, + "loss": 5.6933, + "step": 421 + }, + { + "epoch": 0.00422, + "grad_norm": 1.0297208241186608, + "learning_rate": 0.001266, + "loss": 5.6894, + "step": 422 + }, + { + "epoch": 0.00423, + "grad_norm": 0.8381062456476874, + "learning_rate": 0.001269, + "loss": 5.6811, + "step": 423 + }, + { + "epoch": 0.00424, + "grad_norm": 0.73825928337582, + "learning_rate": 0.001272, + "loss": 5.6773, + "step": 424 + }, + { + "epoch": 0.00425, + "grad_norm": 0.8151693610653118, + "learning_rate": 0.001275, + "loss": 5.6815, + "step": 425 + }, + { + "epoch": 0.00426, + "grad_norm": 0.9877723869544494, + "learning_rate": 0.001278, + "loss": 5.6881, + "step": 426 + }, + { + "epoch": 0.00427, + "grad_norm": 0.8875063218226354, + "learning_rate": 0.001281, + "loss": 5.6676, + "step": 427 + }, + { + "epoch": 0.00428, + "grad_norm": 0.8761493100474917, + "learning_rate": 0.001284, + "loss": 5.6437, + "step": 428 + }, + { + "epoch": 0.00429, + "grad_norm": 1.109439755296158, + "learning_rate": 0.001287, + "loss": 5.668, + "step": 429 + }, + { + "epoch": 0.0043, + "grad_norm": 0.8726361797071612, + "learning_rate": 0.00129, + "loss": 5.6611, + "step": 430 + }, + { + "epoch": 0.00431, + "grad_norm": 0.5601358924900582, + "learning_rate": 0.001293, + "loss": 5.6342, + "step": 431 + }, + { + "epoch": 0.00432, + "grad_norm": 0.5940988397687699, + "learning_rate": 0.001296, + "loss": 5.6448, + "step": 432 + }, + { + "epoch": 0.00433, + "grad_norm": 0.480538337754196, + "learning_rate": 0.001299, + "loss": 5.6336, + "step": 433 + }, + { + "epoch": 0.00434, + "grad_norm": 0.5069699406038057, + "learning_rate": 0.001302, + "loss": 5.6331, + "step": 434 + }, + { + "epoch": 0.00435, + "grad_norm": 0.417220569666801, + "learning_rate": 0.001305, + "loss": 5.6227, + "step": 435 + }, + { + "epoch": 0.00436, + "grad_norm": 0.46597565539240443, + "learning_rate": 0.001308, + "loss": 5.6039, + "step": 436 + }, + { + "epoch": 0.00437, + "grad_norm": 0.37605303006482044, + "learning_rate": 0.001311, + "loss": 5.6161, + "step": 437 + }, + { + "epoch": 0.00438, + "grad_norm": 0.3526339213940271, + "learning_rate": 0.001314, + "loss": 5.5977, + "step": 438 + }, + { + "epoch": 0.00439, + "grad_norm": 0.3618369277094543, + "learning_rate": 0.001317, + "loss": 5.6246, + "step": 439 + }, + { + "epoch": 0.0044, + "grad_norm": 0.3223158135938896, + "learning_rate": 0.00132, + "loss": 5.5938, + "step": 440 + }, + { + "epoch": 0.00441, + "grad_norm": 0.3386640445759432, + "learning_rate": 0.001323, + "loss": 5.5905, + "step": 441 + }, + { + "epoch": 0.00442, + "grad_norm": 0.3397360216396013, + "learning_rate": 0.0013260000000000001, + "loss": 5.5838, + "step": 442 + }, + { + "epoch": 0.00443, + "grad_norm": 0.34532354892574607, + "learning_rate": 0.001329, + "loss": 5.5832, + "step": 443 + }, + { + "epoch": 0.00444, + "grad_norm": 0.37928556611065656, + "learning_rate": 0.001332, + "loss": 5.5853, + "step": 444 + }, + { + "epoch": 0.00445, + "grad_norm": 0.44947894711961484, + "learning_rate": 0.001335, + "loss": 5.5723, + "step": 445 + }, + { + "epoch": 0.00446, + "grad_norm": 0.5239308309237933, + "learning_rate": 0.001338, + "loss": 5.5751, + "step": 446 + }, + { + "epoch": 0.00447, + "grad_norm": 0.60889528761374, + "learning_rate": 0.001341, + "loss": 5.5777, + "step": 447 + }, + { + "epoch": 0.00448, + "grad_norm": 0.6150556040535831, + "learning_rate": 0.0013440000000000001, + "loss": 5.561, + "step": 448 + }, + { + "epoch": 0.00449, + "grad_norm": 0.5444850551876294, + "learning_rate": 0.001347, + "loss": 5.5623, + "step": 449 + }, + { + "epoch": 0.0045, + "grad_norm": 0.6163688303714219, + "learning_rate": 0.00135, + "loss": 5.5617, + "step": 450 + }, + { + "epoch": 0.00451, + "grad_norm": 0.7972728836658292, + "learning_rate": 0.001353, + "loss": 5.5614, + "step": 451 + }, + { + "epoch": 0.00452, + "grad_norm": 0.7711764764543457, + "learning_rate": 0.001356, + "loss": 5.5454, + "step": 452 + }, + { + "epoch": 0.00453, + "grad_norm": 0.7702356986189732, + "learning_rate": 0.001359, + "loss": 5.5379, + "step": 453 + }, + { + "epoch": 0.00454, + "grad_norm": 1.0838160417982272, + "learning_rate": 0.0013620000000000001, + "loss": 5.5686, + "step": 454 + }, + { + "epoch": 0.00455, + "grad_norm": 1.152655585803101, + "learning_rate": 0.0013650000000000001, + "loss": 5.5757, + "step": 455 + }, + { + "epoch": 0.00456, + "grad_norm": 1.2776355160689266, + "learning_rate": 0.001368, + "loss": 5.5831, + "step": 456 + }, + { + "epoch": 0.00457, + "grad_norm": 1.091292995937963, + "learning_rate": 0.001371, + "loss": 5.5727, + "step": 457 + }, + { + "epoch": 0.00458, + "grad_norm": 0.8963405103823251, + "learning_rate": 0.001374, + "loss": 5.5735, + "step": 458 + }, + { + "epoch": 0.00459, + "grad_norm": 1.0168648046101516, + "learning_rate": 0.0013770000000000002, + "loss": 5.5669, + "step": 459 + }, + { + "epoch": 0.0046, + "grad_norm": 1.296131601782866, + "learning_rate": 0.0013800000000000002, + "loss": 5.5665, + "step": 460 + }, + { + "epoch": 0.00461, + "grad_norm": 0.9634420565591739, + "learning_rate": 0.0013830000000000001, + "loss": 5.556, + "step": 461 + }, + { + "epoch": 0.00462, + "grad_norm": 0.9383281224355017, + "learning_rate": 0.001386, + "loss": 5.5634, + "step": 462 + }, + { + "epoch": 0.00463, + "grad_norm": 1.2569585597421309, + "learning_rate": 0.001389, + "loss": 5.5542, + "step": 463 + }, + { + "epoch": 0.00464, + "grad_norm": 0.9874595595654581, + "learning_rate": 0.001392, + "loss": 5.5689, + "step": 464 + }, + { + "epoch": 0.00465, + "grad_norm": 1.085595749506429, + "learning_rate": 0.0013950000000000002, + "loss": 5.5385, + "step": 465 + }, + { + "epoch": 0.00466, + "grad_norm": 1.0673943770446899, + "learning_rate": 0.0013980000000000002, + "loss": 5.5603, + "step": 466 + }, + { + "epoch": 0.00467, + "grad_norm": 0.8139501043376736, + "learning_rate": 0.0014010000000000001, + "loss": 5.5432, + "step": 467 + }, + { + "epoch": 0.00468, + "grad_norm": 0.7494382793960519, + "learning_rate": 0.001404, + "loss": 5.5245, + "step": 468 + }, + { + "epoch": 0.00469, + "grad_norm": 0.7634992086588068, + "learning_rate": 0.001407, + "loss": 5.5282, + "step": 469 + }, + { + "epoch": 0.0047, + "grad_norm": 0.8018093758476836, + "learning_rate": 0.00141, + "loss": 5.5404, + "step": 470 + }, + { + "epoch": 0.00471, + "grad_norm": 0.7418690809708749, + "learning_rate": 0.001413, + "loss": 5.5115, + "step": 471 + }, + { + "epoch": 0.00472, + "grad_norm": 0.7355325431039438, + "learning_rate": 0.001416, + "loss": 5.5216, + "step": 472 + }, + { + "epoch": 0.00473, + "grad_norm": 0.709026539269664, + "learning_rate": 0.001419, + "loss": 5.5305, + "step": 473 + }, + { + "epoch": 0.00474, + "grad_norm": 0.5742329758009745, + "learning_rate": 0.0014219999999999999, + "loss": 5.5064, + "step": 474 + }, + { + "epoch": 0.00475, + "grad_norm": 0.5859758403725885, + "learning_rate": 0.001425, + "loss": 5.4971, + "step": 475 + }, + { + "epoch": 0.00476, + "grad_norm": 0.6365922795308678, + "learning_rate": 0.001428, + "loss": 5.5308, + "step": 476 + }, + { + "epoch": 0.00477, + "grad_norm": 0.6539516343537074, + "learning_rate": 0.001431, + "loss": 5.4846, + "step": 477 + }, + { + "epoch": 0.00478, + "grad_norm": 0.6446859909585969, + "learning_rate": 0.001434, + "loss": 5.5074, + "step": 478 + }, + { + "epoch": 0.00479, + "grad_norm": 0.7791938725908187, + "learning_rate": 0.001437, + "loss": 5.4897, + "step": 479 + }, + { + "epoch": 0.0048, + "grad_norm": 0.7608695568360718, + "learning_rate": 0.0014399999999999999, + "loss": 5.4855, + "step": 480 + }, + { + "epoch": 0.00481, + "grad_norm": 0.5435552532069989, + "learning_rate": 0.001443, + "loss": 5.4813, + "step": 481 + }, + { + "epoch": 0.00482, + "grad_norm": 0.4961185149512517, + "learning_rate": 0.001446, + "loss": 5.4538, + "step": 482 + }, + { + "epoch": 0.00483, + "grad_norm": 0.5120902463904886, + "learning_rate": 0.001449, + "loss": 5.4636, + "step": 483 + }, + { + "epoch": 0.00484, + "grad_norm": 0.418197369903841, + "learning_rate": 0.001452, + "loss": 5.464, + "step": 484 + }, + { + "epoch": 0.00485, + "grad_norm": 0.36311406822078424, + "learning_rate": 0.001455, + "loss": 5.4671, + "step": 485 + }, + { + "epoch": 0.00486, + "grad_norm": 0.3913777576995821, + "learning_rate": 0.001458, + "loss": 5.4393, + "step": 486 + }, + { + "epoch": 0.00487, + "grad_norm": 0.36874474197662527, + "learning_rate": 0.001461, + "loss": 5.4517, + "step": 487 + }, + { + "epoch": 0.00488, + "grad_norm": 0.41593519822402414, + "learning_rate": 0.001464, + "loss": 5.43, + "step": 488 + }, + { + "epoch": 0.00489, + "grad_norm": 0.4051383375955623, + "learning_rate": 0.001467, + "loss": 5.4204, + "step": 489 + }, + { + "epoch": 0.0049, + "grad_norm": 0.4948319932753325, + "learning_rate": 0.00147, + "loss": 5.434, + "step": 490 + }, + { + "epoch": 0.00491, + "grad_norm": 0.6887805434617323, + "learning_rate": 0.001473, + "loss": 5.4357, + "step": 491 + }, + { + "epoch": 0.00492, + "grad_norm": 0.912047432270828, + "learning_rate": 0.001476, + "loss": 5.4432, + "step": 492 + }, + { + "epoch": 0.00493, + "grad_norm": 0.8950009277905591, + "learning_rate": 0.001479, + "loss": 5.4415, + "step": 493 + }, + { + "epoch": 0.00494, + "grad_norm": 0.807683799932231, + "learning_rate": 0.001482, + "loss": 5.4427, + "step": 494 + }, + { + "epoch": 0.00495, + "grad_norm": 0.976722169908224, + "learning_rate": 0.001485, + "loss": 5.4601, + "step": 495 + }, + { + "epoch": 0.00496, + "grad_norm": 0.7224780529872387, + "learning_rate": 0.001488, + "loss": 5.4314, + "step": 496 + }, + { + "epoch": 0.00497, + "grad_norm": 0.7139613919522917, + "learning_rate": 0.001491, + "loss": 5.4172, + "step": 497 + }, + { + "epoch": 0.00498, + "grad_norm": 0.7556637282468179, + "learning_rate": 0.001494, + "loss": 5.4443, + "step": 498 + }, + { + "epoch": 0.00499, + "grad_norm": 0.8519321120342865, + "learning_rate": 0.001497, + "loss": 5.4223, + "step": 499 + }, + { + "epoch": 0.005, + "grad_norm": 0.9868888032280079, + "learning_rate": 0.0015, + "loss": 5.4308, + "step": 500 + }, + { + "epoch": 0.00501, + "grad_norm": 1.2028724714017198, + "learning_rate": 0.001503, + "loss": 5.4458, + "step": 501 + }, + { + "epoch": 0.00502, + "grad_norm": 0.9548534640519003, + "learning_rate": 0.001506, + "loss": 5.4405, + "step": 502 + }, + { + "epoch": 0.00503, + "grad_norm": 0.9423651398338494, + "learning_rate": 0.0015090000000000001, + "loss": 5.4484, + "step": 503 + }, + { + "epoch": 0.00504, + "grad_norm": 0.9672711526274779, + "learning_rate": 0.001512, + "loss": 5.4328, + "step": 504 + }, + { + "epoch": 0.00505, + "grad_norm": 0.9474889600476256, + "learning_rate": 0.001515, + "loss": 5.4283, + "step": 505 + }, + { + "epoch": 0.00506, + "grad_norm": 1.3039451405080307, + "learning_rate": 0.001518, + "loss": 5.4432, + "step": 506 + }, + { + "epoch": 0.00507, + "grad_norm": 1.2269782118632737, + "learning_rate": 0.001521, + "loss": 5.4452, + "step": 507 + }, + { + "epoch": 0.00508, + "grad_norm": 0.8597426683067237, + "learning_rate": 0.001524, + "loss": 5.4268, + "step": 508 + }, + { + "epoch": 0.00509, + "grad_norm": 0.9388657090373522, + "learning_rate": 0.0015270000000000001, + "loss": 5.4178, + "step": 509 + }, + { + "epoch": 0.0051, + "grad_norm": 0.9594427813189665, + "learning_rate": 0.0015300000000000001, + "loss": 5.4356, + "step": 510 + }, + { + "epoch": 0.00511, + "grad_norm": 1.04563577032056, + "learning_rate": 0.001533, + "loss": 5.4212, + "step": 511 + }, + { + "epoch": 0.00512, + "grad_norm": 0.733703407645156, + "learning_rate": 0.001536, + "loss": 5.4003, + "step": 512 + }, + { + "epoch": 0.00513, + "grad_norm": 0.8415210942026606, + "learning_rate": 0.001539, + "loss": 5.423, + "step": 513 + }, + { + "epoch": 0.00514, + "grad_norm": 0.8791751992621939, + "learning_rate": 0.001542, + "loss": 5.4064, + "step": 514 + }, + { + "epoch": 0.00515, + "grad_norm": 0.8161499995578689, + "learning_rate": 0.0015450000000000001, + "loss": 5.4094, + "step": 515 + }, + { + "epoch": 0.00516, + "grad_norm": 0.878383695319614, + "learning_rate": 0.0015480000000000001, + "loss": 5.4087, + "step": 516 + }, + { + "epoch": 0.00517, + "grad_norm": 0.9768725869756134, + "learning_rate": 0.001551, + "loss": 5.4055, + "step": 517 + }, + { + "epoch": 0.00518, + "grad_norm": 0.8865017723772849, + "learning_rate": 0.001554, + "loss": 5.3907, + "step": 518 + }, + { + "epoch": 0.00519, + "grad_norm": 0.8308797688973832, + "learning_rate": 0.001557, + "loss": 5.3905, + "step": 519 + }, + { + "epoch": 0.0052, + "grad_norm": 0.6978413162257922, + "learning_rate": 0.0015600000000000002, + "loss": 5.3938, + "step": 520 + }, + { + "epoch": 0.00521, + "grad_norm": 0.6562689530690187, + "learning_rate": 0.0015630000000000002, + "loss": 5.3676, + "step": 521 + }, + { + "epoch": 0.00522, + "grad_norm": 0.5577523148431155, + "learning_rate": 0.0015660000000000001, + "loss": 5.3673, + "step": 522 + }, + { + "epoch": 0.00523, + "grad_norm": 0.5298728018270966, + "learning_rate": 0.001569, + "loss": 5.3784, + "step": 523 + }, + { + "epoch": 0.00524, + "grad_norm": 0.44216623389663734, + "learning_rate": 0.001572, + "loss": 5.3811, + "step": 524 + }, + { + "epoch": 0.00525, + "grad_norm": 0.3702182111689363, + "learning_rate": 0.001575, + "loss": 5.369, + "step": 525 + }, + { + "epoch": 0.00526, + "grad_norm": 0.40289552198632295, + "learning_rate": 0.0015780000000000002, + "loss": 5.3444, + "step": 526 + }, + { + "epoch": 0.00527, + "grad_norm": 0.38284579814689895, + "learning_rate": 0.0015810000000000002, + "loss": 5.3551, + "step": 527 + }, + { + "epoch": 0.00528, + "grad_norm": 0.33950473031510653, + "learning_rate": 0.0015840000000000001, + "loss": 5.3339, + "step": 528 + }, + { + "epoch": 0.00529, + "grad_norm": 0.3777758983585419, + "learning_rate": 0.001587, + "loss": 5.343, + "step": 529 + }, + { + "epoch": 0.0053, + "grad_norm": 0.4257451161382566, + "learning_rate": 0.00159, + "loss": 5.3483, + "step": 530 + }, + { + "epoch": 0.00531, + "grad_norm": 0.5472045550610978, + "learning_rate": 0.001593, + "loss": 5.3387, + "step": 531 + }, + { + "epoch": 0.00532, + "grad_norm": 0.696856419507981, + "learning_rate": 0.0015960000000000002, + "loss": 5.3455, + "step": 532 + }, + { + "epoch": 0.00533, + "grad_norm": 0.7975941430607876, + "learning_rate": 0.0015990000000000002, + "loss": 5.3447, + "step": 533 + }, + { + "epoch": 0.00534, + "grad_norm": 0.6576622452461942, + "learning_rate": 0.0016020000000000001, + "loss": 5.3339, + "step": 534 + }, + { + "epoch": 0.00535, + "grad_norm": 0.5879483820814444, + "learning_rate": 0.001605, + "loss": 5.3347, + "step": 535 + }, + { + "epoch": 0.00536, + "grad_norm": 0.8212636469840171, + "learning_rate": 0.001608, + "loss": 5.3454, + "step": 536 + }, + { + "epoch": 0.00537, + "grad_norm": 0.7740232031924225, + "learning_rate": 0.0016110000000000002, + "loss": 5.3418, + "step": 537 + }, + { + "epoch": 0.00538, + "grad_norm": 0.683098129060214, + "learning_rate": 0.0016140000000000002, + "loss": 5.3143, + "step": 538 + }, + { + "epoch": 0.00539, + "grad_norm": 0.9170551975741953, + "learning_rate": 0.0016170000000000002, + "loss": 5.3245, + "step": 539 + }, + { + "epoch": 0.0054, + "grad_norm": 0.765080131514484, + "learning_rate": 0.0016200000000000001, + "loss": 5.3262, + "step": 540 + }, + { + "epoch": 0.00541, + "grad_norm": 0.658457026305436, + "learning_rate": 0.001623, + "loss": 5.3137, + "step": 541 + }, + { + "epoch": 0.00542, + "grad_norm": 0.601942869875084, + "learning_rate": 0.001626, + "loss": 5.3315, + "step": 542 + }, + { + "epoch": 0.00543, + "grad_norm": 0.6751097730454854, + "learning_rate": 0.0016290000000000002, + "loss": 5.2998, + "step": 543 + }, + { + "epoch": 0.00544, + "grad_norm": 0.6943293389301006, + "learning_rate": 0.0016320000000000002, + "loss": 5.3191, + "step": 544 + }, + { + "epoch": 0.00545, + "grad_norm": 0.757591621302123, + "learning_rate": 0.0016350000000000002, + "loss": 5.3293, + "step": 545 + }, + { + "epoch": 0.00546, + "grad_norm": 0.825685624372282, + "learning_rate": 0.0016380000000000001, + "loss": 5.3061, + "step": 546 + }, + { + "epoch": 0.00547, + "grad_norm": 0.8411824339962438, + "learning_rate": 0.001641, + "loss": 5.3051, + "step": 547 + }, + { + "epoch": 0.00548, + "grad_norm": 0.8225125547688507, + "learning_rate": 0.001644, + "loss": 5.303, + "step": 548 + }, + { + "epoch": 0.00549, + "grad_norm": 0.7692066087665821, + "learning_rate": 0.0016470000000000002, + "loss": 5.3016, + "step": 549 + }, + { + "epoch": 0.0055, + "grad_norm": 0.7541680263658305, + "learning_rate": 0.0016500000000000002, + "loss": 5.3099, + "step": 550 + }, + { + "epoch": 0.00551, + "grad_norm": 0.8632990593818363, + "learning_rate": 0.0016530000000000002, + "loss": 5.3123, + "step": 551 + }, + { + "epoch": 0.00552, + "grad_norm": 0.9083583396116485, + "learning_rate": 0.0016560000000000001, + "loss": 5.3063, + "step": 552 + }, + { + "epoch": 0.00553, + "grad_norm": 0.9125681148017897, + "learning_rate": 0.001659, + "loss": 5.304, + "step": 553 + }, + { + "epoch": 0.00554, + "grad_norm": 0.9360850923631763, + "learning_rate": 0.0016620000000000003, + "loss": 5.3021, + "step": 554 + }, + { + "epoch": 0.00555, + "grad_norm": 0.9519012966124738, + "learning_rate": 0.0016650000000000002, + "loss": 5.3154, + "step": 555 + }, + { + "epoch": 0.00556, + "grad_norm": 0.9297335237671114, + "learning_rate": 0.0016680000000000002, + "loss": 5.2962, + "step": 556 + }, + { + "epoch": 0.00557, + "grad_norm": 1.1187109821510643, + "learning_rate": 0.0016710000000000002, + "loss": 5.2935, + "step": 557 + }, + { + "epoch": 0.00558, + "grad_norm": 0.9868599839032521, + "learning_rate": 0.0016740000000000001, + "loss": 5.3219, + "step": 558 + }, + { + "epoch": 0.00559, + "grad_norm": 0.8786610310513777, + "learning_rate": 0.001677, + "loss": 5.2938, + "step": 559 + }, + { + "epoch": 0.0056, + "grad_norm": 0.9118163270219521, + "learning_rate": 0.0016800000000000003, + "loss": 5.2767, + "step": 560 + }, + { + "epoch": 0.00561, + "grad_norm": 0.8649386991384733, + "learning_rate": 0.0016830000000000003, + "loss": 5.3004, + "step": 561 + }, + { + "epoch": 0.00562, + "grad_norm": 0.5982058914442256, + "learning_rate": 0.0016860000000000002, + "loss": 5.279, + "step": 562 + }, + { + "epoch": 0.00563, + "grad_norm": 0.5830709316445766, + "learning_rate": 0.001689, + "loss": 5.2821, + "step": 563 + }, + { + "epoch": 0.00564, + "grad_norm": 0.5564672250442253, + "learning_rate": 0.001692, + "loss": 5.2582, + "step": 564 + }, + { + "epoch": 0.00565, + "grad_norm": 0.5903966881939692, + "learning_rate": 0.001695, + "loss": 5.2619, + "step": 565 + }, + { + "epoch": 0.00566, + "grad_norm": 0.5777761648359326, + "learning_rate": 0.0016979999999999999, + "loss": 5.2552, + "step": 566 + }, + { + "epoch": 0.00567, + "grad_norm": 0.6069030603134064, + "learning_rate": 0.0017009999999999998, + "loss": 5.2491, + "step": 567 + }, + { + "epoch": 0.00568, + "grad_norm": 0.5693132087018719, + "learning_rate": 0.0017039999999999998, + "loss": 5.2604, + "step": 568 + }, + { + "epoch": 0.00569, + "grad_norm": 0.47662406778838745, + "learning_rate": 0.001707, + "loss": 5.2359, + "step": 569 + }, + { + "epoch": 0.0057, + "grad_norm": 0.49231021705037487, + "learning_rate": 0.00171, + "loss": 5.253, + "step": 570 + }, + { + "epoch": 0.00571, + "grad_norm": 0.4167352661720621, + "learning_rate": 0.001713, + "loss": 5.2491, + "step": 571 + }, + { + "epoch": 0.00572, + "grad_norm": 0.39011854138215074, + "learning_rate": 0.0017159999999999999, + "loss": 5.2258, + "step": 572 + }, + { + "epoch": 0.00573, + "grad_norm": 0.45971389455263184, + "learning_rate": 0.0017189999999999998, + "loss": 5.2313, + "step": 573 + }, + { + "epoch": 0.00574, + "grad_norm": 0.4643694238461635, + "learning_rate": 0.001722, + "loss": 5.2399, + "step": 574 + }, + { + "epoch": 0.00575, + "grad_norm": 0.4439840100405838, + "learning_rate": 0.001725, + "loss": 5.2168, + "step": 575 + }, + { + "epoch": 0.00576, + "grad_norm": 0.4794795740566699, + "learning_rate": 0.001728, + "loss": 5.2109, + "step": 576 + }, + { + "epoch": 0.00577, + "grad_norm": 0.5445912068882687, + "learning_rate": 0.001731, + "loss": 5.2354, + "step": 577 + }, + { + "epoch": 0.00578, + "grad_norm": 0.5881619532039025, + "learning_rate": 0.0017339999999999999, + "loss": 5.2156, + "step": 578 + }, + { + "epoch": 0.00579, + "grad_norm": 0.6688964155004951, + "learning_rate": 0.0017369999999999998, + "loss": 5.2064, + "step": 579 + }, + { + "epoch": 0.0058, + "grad_norm": 0.6446644513052245, + "learning_rate": 0.00174, + "loss": 5.2223, + "step": 580 + }, + { + "epoch": 0.00581, + "grad_norm": 0.7385299343409043, + "learning_rate": 0.001743, + "loss": 5.2222, + "step": 581 + }, + { + "epoch": 0.00582, + "grad_norm": 1.0486135149054512, + "learning_rate": 0.001746, + "loss": 5.2114, + "step": 582 + }, + { + "epoch": 0.00583, + "grad_norm": 0.9184260585056472, + "learning_rate": 0.001749, + "loss": 5.2374, + "step": 583 + }, + { + "epoch": 0.00584, + "grad_norm": 0.7724931135788974, + "learning_rate": 0.0017519999999999999, + "loss": 5.2235, + "step": 584 + }, + { + "epoch": 0.00585, + "grad_norm": 0.903942587317279, + "learning_rate": 0.0017549999999999998, + "loss": 5.2218, + "step": 585 + }, + { + "epoch": 0.00586, + "grad_norm": 0.8496888678331875, + "learning_rate": 0.001758, + "loss": 5.2272, + "step": 586 + }, + { + "epoch": 0.00587, + "grad_norm": 0.8580070219006531, + "learning_rate": 0.001761, + "loss": 5.2094, + "step": 587 + }, + { + "epoch": 0.00588, + "grad_norm": 1.0100984711915582, + "learning_rate": 0.001764, + "loss": 5.233, + "step": 588 + }, + { + "epoch": 0.00589, + "grad_norm": 0.9311738464832717, + "learning_rate": 0.001767, + "loss": 5.2289, + "step": 589 + }, + { + "epoch": 0.0059, + "grad_norm": 0.9363758859588256, + "learning_rate": 0.0017699999999999999, + "loss": 5.2155, + "step": 590 + }, + { + "epoch": 0.00591, + "grad_norm": 0.9695025163620248, + "learning_rate": 0.001773, + "loss": 5.2165, + "step": 591 + }, + { + "epoch": 0.00592, + "grad_norm": 0.9805216711841677, + "learning_rate": 0.001776, + "loss": 5.2417, + "step": 592 + }, + { + "epoch": 0.00593, + "grad_norm": 0.9827241162755458, + "learning_rate": 0.001779, + "loss": 5.2392, + "step": 593 + }, + { + "epoch": 0.00594, + "grad_norm": 1.1445433178010636, + "learning_rate": 0.001782, + "loss": 5.2522, + "step": 594 + }, + { + "epoch": 0.00595, + "grad_norm": 0.950699335886284, + "learning_rate": 0.001785, + "loss": 5.2351, + "step": 595 + }, + { + "epoch": 0.00596, + "grad_norm": 0.8373647963925929, + "learning_rate": 0.0017879999999999999, + "loss": 5.2269, + "step": 596 + }, + { + "epoch": 0.00597, + "grad_norm": 0.9799263768822638, + "learning_rate": 0.001791, + "loss": 5.2118, + "step": 597 + }, + { + "epoch": 0.00598, + "grad_norm": 0.9413379648187816, + "learning_rate": 0.001794, + "loss": 5.2215, + "step": 598 + }, + { + "epoch": 0.00599, + "grad_norm": 0.8392983563516706, + "learning_rate": 0.001797, + "loss": 5.2191, + "step": 599 + }, + { + "epoch": 0.006, + "grad_norm": 0.8414286288934597, + "learning_rate": 0.0018, + "loss": 5.2134, + "step": 600 + }, + { + "epoch": 0.00601, + "grad_norm": 0.8993201270126511, + "learning_rate": 0.001803, + "loss": 5.2089, + "step": 601 + }, + { + "epoch": 0.00602, + "grad_norm": 0.8678008919316418, + "learning_rate": 0.0018059999999999999, + "loss": 5.1957, + "step": 602 + }, + { + "epoch": 0.00603, + "grad_norm": 0.6967382011990112, + "learning_rate": 0.001809, + "loss": 5.2122, + "step": 603 + }, + { + "epoch": 0.00604, + "grad_norm": 0.5312401482691863, + "learning_rate": 0.001812, + "loss": 5.1788, + "step": 604 + }, + { + "epoch": 0.00605, + "grad_norm": 0.5795052729398412, + "learning_rate": 0.001815, + "loss": 5.1915, + "step": 605 + }, + { + "epoch": 0.00606, + "grad_norm": 0.6345039250131549, + "learning_rate": 0.001818, + "loss": 5.1894, + "step": 606 + }, + { + "epoch": 0.00607, + "grad_norm": 0.6716049737606092, + "learning_rate": 0.001821, + "loss": 5.1916, + "step": 607 + }, + { + "epoch": 0.00608, + "grad_norm": 0.631863512626499, + "learning_rate": 0.001824, + "loss": 5.1762, + "step": 608 + }, + { + "epoch": 0.00609, + "grad_norm": 0.5345713642167306, + "learning_rate": 0.001827, + "loss": 5.177, + "step": 609 + }, + { + "epoch": 0.0061, + "grad_norm": 0.455626202663485, + "learning_rate": 0.00183, + "loss": 5.1434, + "step": 610 + }, + { + "epoch": 0.00611, + "grad_norm": 0.46197362206927406, + "learning_rate": 0.001833, + "loss": 5.1489, + "step": 611 + }, + { + "epoch": 0.00612, + "grad_norm": 0.3943653846553885, + "learning_rate": 0.001836, + "loss": 5.1488, + "step": 612 + }, + { + "epoch": 0.00613, + "grad_norm": 0.4053887102586243, + "learning_rate": 0.001839, + "loss": 5.1409, + "step": 613 + }, + { + "epoch": 0.00614, + "grad_norm": 0.44363743610311057, + "learning_rate": 0.001842, + "loss": 5.1611, + "step": 614 + }, + { + "epoch": 0.00615, + "grad_norm": 0.40025757691173514, + "learning_rate": 0.001845, + "loss": 5.145, + "step": 615 + }, + { + "epoch": 0.00616, + "grad_norm": 0.4067321452673224, + "learning_rate": 0.001848, + "loss": 5.1299, + "step": 616 + }, + { + "epoch": 0.00617, + "grad_norm": 0.42403873536126996, + "learning_rate": 0.001851, + "loss": 5.1397, + "step": 617 + }, + { + "epoch": 0.00618, + "grad_norm": 0.44192084147381183, + "learning_rate": 0.001854, + "loss": 5.1265, + "step": 618 + }, + { + "epoch": 0.00619, + "grad_norm": 0.5104655865853382, + "learning_rate": 0.001857, + "loss": 5.1299, + "step": 619 + }, + { + "epoch": 0.0062, + "grad_norm": 0.6226145292933671, + "learning_rate": 0.00186, + "loss": 5.118, + "step": 620 + }, + { + "epoch": 0.00621, + "grad_norm": 0.745237667035521, + "learning_rate": 0.001863, + "loss": 5.1237, + "step": 621 + }, + { + "epoch": 0.00622, + "grad_norm": 0.7986865810762169, + "learning_rate": 0.001866, + "loss": 5.1267, + "step": 622 + }, + { + "epoch": 0.00623, + "grad_norm": 0.7723208155309241, + "learning_rate": 0.001869, + "loss": 5.1084, + "step": 623 + }, + { + "epoch": 0.00624, + "grad_norm": 0.8171878766946273, + "learning_rate": 0.001872, + "loss": 5.1246, + "step": 624 + }, + { + "epoch": 0.00625, + "grad_norm": 0.7607424856276187, + "learning_rate": 0.001875, + "loss": 5.1359, + "step": 625 + }, + { + "epoch": 0.00626, + "grad_norm": 0.7005729458198662, + "learning_rate": 0.0018780000000000001, + "loss": 5.1104, + "step": 626 + }, + { + "epoch": 0.00627, + "grad_norm": 0.6735260479679158, + "learning_rate": 0.001881, + "loss": 5.1015, + "step": 627 + }, + { + "epoch": 0.00628, + "grad_norm": 0.780486998163919, + "learning_rate": 0.001884, + "loss": 5.1349, + "step": 628 + }, + { + "epoch": 0.00629, + "grad_norm": 0.8746830311438225, + "learning_rate": 0.001887, + "loss": 5.1097, + "step": 629 + }, + { + "epoch": 0.0063, + "grad_norm": 0.9536711950620466, + "learning_rate": 0.00189, + "loss": 5.1137, + "step": 630 + }, + { + "epoch": 0.00631, + "grad_norm": 0.8628106812141149, + "learning_rate": 0.0018930000000000002, + "loss": 5.1035, + "step": 631 + }, + { + "epoch": 0.00632, + "grad_norm": 0.9320890333045916, + "learning_rate": 0.0018960000000000001, + "loss": 5.1281, + "step": 632 + }, + { + "epoch": 0.00633, + "grad_norm": 1.0593968691082751, + "learning_rate": 0.001899, + "loss": 5.1321, + "step": 633 + }, + { + "epoch": 0.00634, + "grad_norm": 0.943443350801409, + "learning_rate": 0.001902, + "loss": 5.1182, + "step": 634 + }, + { + "epoch": 0.00635, + "grad_norm": 0.9628484504873114, + "learning_rate": 0.001905, + "loss": 5.1089, + "step": 635 + }, + { + "epoch": 0.00636, + "grad_norm": 1.1043383962750646, + "learning_rate": 0.001908, + "loss": 5.1289, + "step": 636 + }, + { + "epoch": 0.00637, + "grad_norm": 0.8987493866500654, + "learning_rate": 0.0019110000000000002, + "loss": 5.1351, + "step": 637 + }, + { + "epoch": 0.00638, + "grad_norm": 0.9251804377428581, + "learning_rate": 0.0019140000000000001, + "loss": 5.1288, + "step": 638 + }, + { + "epoch": 0.00639, + "grad_norm": 0.854418425044198, + "learning_rate": 0.001917, + "loss": 5.0998, + "step": 639 + }, + { + "epoch": 0.0064, + "grad_norm": 0.9324816679284724, + "learning_rate": 0.00192, + "loss": 5.1038, + "step": 640 + }, + { + "epoch": 0.00641, + "grad_norm": 0.9892889234371413, + "learning_rate": 0.001923, + "loss": 5.1163, + "step": 641 + }, + { + "epoch": 0.00642, + "grad_norm": 1.0346602459121752, + "learning_rate": 0.001926, + "loss": 5.106, + "step": 642 + }, + { + "epoch": 0.00643, + "grad_norm": 0.8661994645957561, + "learning_rate": 0.0019290000000000002, + "loss": 5.117, + "step": 643 + }, + { + "epoch": 0.00644, + "grad_norm": 0.8724056100423225, + "learning_rate": 0.0019320000000000001, + "loss": 5.0889, + "step": 644 + }, + { + "epoch": 0.00645, + "grad_norm": 0.8584186184200229, + "learning_rate": 0.001935, + "loss": 5.1004, + "step": 645 + }, + { + "epoch": 0.00646, + "grad_norm": 0.7360558672224548, + "learning_rate": 0.001938, + "loss": 5.0955, + "step": 646 + }, + { + "epoch": 0.00647, + "grad_norm": 0.7977702647925389, + "learning_rate": 0.001941, + "loss": 5.1058, + "step": 647 + }, + { + "epoch": 0.00648, + "grad_norm": 0.7872116543506851, + "learning_rate": 0.0019440000000000002, + "loss": 5.0908, + "step": 648 + }, + { + "epoch": 0.00649, + "grad_norm": 0.7104658813349117, + "learning_rate": 0.0019470000000000002, + "loss": 5.0718, + "step": 649 + }, + { + "epoch": 0.0065, + "grad_norm": 0.7453763255239747, + "learning_rate": 0.0019500000000000001, + "loss": 5.0953, + "step": 650 + }, + { + "epoch": 0.00651, + "grad_norm": 0.7781624388594444, + "learning_rate": 0.001953, + "loss": 5.0758, + "step": 651 + }, + { + "epoch": 0.00652, + "grad_norm": 0.7616046275009601, + "learning_rate": 0.0019560000000000003, + "loss": 5.0661, + "step": 652 + }, + { + "epoch": 0.00653, + "grad_norm": 0.5945469625366651, + "learning_rate": 0.0019590000000000002, + "loss": 5.0539, + "step": 653 + }, + { + "epoch": 0.00654, + "grad_norm": 0.6024408595794577, + "learning_rate": 0.001962, + "loss": 5.0374, + "step": 654 + }, + { + "epoch": 0.00655, + "grad_norm": 0.5905307565923603, + "learning_rate": 0.001965, + "loss": 5.048, + "step": 655 + }, + { + "epoch": 0.00656, + "grad_norm": 0.5236322372626927, + "learning_rate": 0.001968, + "loss": 5.04, + "step": 656 + }, + { + "epoch": 0.00657, + "grad_norm": 0.5283416618835216, + "learning_rate": 0.001971, + "loss": 5.0223, + "step": 657 + }, + { + "epoch": 0.00658, + "grad_norm": 0.5563146586062104, + "learning_rate": 0.001974, + "loss": 5.0415, + "step": 658 + }, + { + "epoch": 0.00659, + "grad_norm": 0.6297873363395704, + "learning_rate": 0.001977, + "loss": 5.0241, + "step": 659 + }, + { + "epoch": 0.0066, + "grad_norm": 0.5780538180580159, + "learning_rate": 0.00198, + "loss": 5.0197, + "step": 660 + }, + { + "epoch": 0.00661, + "grad_norm": 0.5505266007864265, + "learning_rate": 0.001983, + "loss": 4.9938, + "step": 661 + }, + { + "epoch": 0.00662, + "grad_norm": 0.6077995116545319, + "learning_rate": 0.0019860000000000004, + "loss": 4.9946, + "step": 662 + }, + { + "epoch": 0.00663, + "grad_norm": 0.693544741809416, + "learning_rate": 0.0019890000000000003, + "loss": 5.01, + "step": 663 + }, + { + "epoch": 0.00664, + "grad_norm": 0.801492715102265, + "learning_rate": 0.0019920000000000003, + "loss": 5.0017, + "step": 664 + }, + { + "epoch": 0.00665, + "grad_norm": 0.8803866066519176, + "learning_rate": 0.0019950000000000002, + "loss": 4.9845, + "step": 665 + }, + { + "epoch": 0.00666, + "grad_norm": 0.9133314823416234, + "learning_rate": 0.001998, + "loss": 5.0048, + "step": 666 + }, + { + "epoch": 0.00667, + "grad_norm": 0.9006055773427947, + "learning_rate": 0.002001, + "loss": 4.9967, + "step": 667 + }, + { + "epoch": 0.00668, + "grad_norm": 0.8268536663020751, + "learning_rate": 0.002004, + "loss": 5.0164, + "step": 668 + }, + { + "epoch": 0.00669, + "grad_norm": 1.1034218430158187, + "learning_rate": 0.002007, + "loss": 5.0358, + "step": 669 + }, + { + "epoch": 0.0067, + "grad_norm": 1.0710112933622913, + "learning_rate": 0.00201, + "loss": 5.0286, + "step": 670 + }, + { + "epoch": 0.00671, + "grad_norm": 1.0384170566197124, + "learning_rate": 0.002013, + "loss": 5.0085, + "step": 671 + }, + { + "epoch": 0.00672, + "grad_norm": 0.9247071512184438, + "learning_rate": 0.002016, + "loss": 4.9973, + "step": 672 + }, + { + "epoch": 0.00673, + "grad_norm": 0.9905174678816935, + "learning_rate": 0.002019, + "loss": 5.0151, + "step": 673 + }, + { + "epoch": 0.00674, + "grad_norm": 0.8930692065183647, + "learning_rate": 0.0020220000000000004, + "loss": 5.0014, + "step": 674 + }, + { + "epoch": 0.00675, + "grad_norm": 1.0067908289229996, + "learning_rate": 0.0020250000000000003, + "loss": 5.0182, + "step": 675 + }, + { + "epoch": 0.00676, + "grad_norm": 0.972209366764115, + "learning_rate": 0.0020280000000000003, + "loss": 5.0082, + "step": 676 + }, + { + "epoch": 0.00677, + "grad_norm": 1.0440090424594235, + "learning_rate": 0.0020310000000000003, + "loss": 5.0109, + "step": 677 + }, + { + "epoch": 0.00678, + "grad_norm": 1.1292207401563255, + "learning_rate": 0.0020340000000000002, + "loss": 5.0278, + "step": 678 + }, + { + "epoch": 0.00679, + "grad_norm": 0.9872089241398577, + "learning_rate": 0.002037, + "loss": 5.0054, + "step": 679 + }, + { + "epoch": 0.0068, + "grad_norm": 0.8748535789650627, + "learning_rate": 0.00204, + "loss": 5.0004, + "step": 680 + }, + { + "epoch": 0.00681, + "grad_norm": 1.0039141852392888, + "learning_rate": 0.002043, + "loss": 5.0018, + "step": 681 + }, + { + "epoch": 0.00682, + "grad_norm": 1.0414085118011525, + "learning_rate": 0.002046, + "loss": 5.0026, + "step": 682 + }, + { + "epoch": 0.00683, + "grad_norm": 0.8083819733286619, + "learning_rate": 0.002049, + "loss": 4.9746, + "step": 683 + }, + { + "epoch": 0.00684, + "grad_norm": 0.618682103661302, + "learning_rate": 0.002052, + "loss": 4.9803, + "step": 684 + }, + { + "epoch": 0.00685, + "grad_norm": 0.5775508276061283, + "learning_rate": 0.0020550000000000004, + "loss": 4.9792, + "step": 685 + }, + { + "epoch": 0.00686, + "grad_norm": 0.5359292821422027, + "learning_rate": 0.0020580000000000004, + "loss": 4.9508, + "step": 686 + }, + { + "epoch": 0.00687, + "grad_norm": 0.5439874489161504, + "learning_rate": 0.0020610000000000003, + "loss": 4.9456, + "step": 687 + }, + { + "epoch": 0.00688, + "grad_norm": 0.6521058673701751, + "learning_rate": 0.002064, + "loss": 4.9472, + "step": 688 + }, + { + "epoch": 0.00689, + "grad_norm": 0.7201992210148584, + "learning_rate": 0.002067, + "loss": 4.9514, + "step": 689 + }, + { + "epoch": 0.0069, + "grad_norm": 0.6329359289093167, + "learning_rate": 0.00207, + "loss": 4.9393, + "step": 690 + }, + { + "epoch": 0.00691, + "grad_norm": 0.5452341441548267, + "learning_rate": 0.0020729999999999998, + "loss": 4.9065, + "step": 691 + }, + { + "epoch": 0.00692, + "grad_norm": 0.524421270821715, + "learning_rate": 0.0020759999999999997, + "loss": 4.9264, + "step": 692 + }, + { + "epoch": 0.00693, + "grad_norm": 0.5237198583423548, + "learning_rate": 0.0020789999999999997, + "loss": 4.9275, + "step": 693 + }, + { + "epoch": 0.00694, + "grad_norm": 0.5943284166970615, + "learning_rate": 0.002082, + "loss": 4.9065, + "step": 694 + }, + { + "epoch": 0.00695, + "grad_norm": 0.6331066584123409, + "learning_rate": 0.002085, + "loss": 4.9096, + "step": 695 + }, + { + "epoch": 0.00696, + "grad_norm": 0.6552141599381052, + "learning_rate": 0.002088, + "loss": 4.9103, + "step": 696 + }, + { + "epoch": 0.00697, + "grad_norm": 0.7459258980378775, + "learning_rate": 0.002091, + "loss": 4.9188, + "step": 697 + }, + { + "epoch": 0.00698, + "grad_norm": 0.902413176094811, + "learning_rate": 0.002094, + "loss": 4.9109, + "step": 698 + }, + { + "epoch": 0.00699, + "grad_norm": 0.9488326760888935, + "learning_rate": 0.002097, + "loss": 4.9116, + "step": 699 + }, + { + "epoch": 0.007, + "grad_norm": 0.9189763967394823, + "learning_rate": 0.0021, + "loss": 4.929, + "step": 700 + }, + { + "epoch": 0.00701, + "grad_norm": 0.8924436046521577, + "learning_rate": 0.002103, + "loss": 4.9083, + "step": 701 + }, + { + "epoch": 0.00702, + "grad_norm": 0.8863614629240012, + "learning_rate": 0.002106, + "loss": 4.9092, + "step": 702 + }, + { + "epoch": 0.00703, + "grad_norm": 0.7940709541538681, + "learning_rate": 0.0021089999999999998, + "loss": 4.891, + "step": 703 + }, + { + "epoch": 0.00704, + "grad_norm": 0.7939787736751149, + "learning_rate": 0.0021119999999999997, + "loss": 4.8964, + "step": 704 + }, + { + "epoch": 0.00705, + "grad_norm": 1.0444656030359551, + "learning_rate": 0.002115, + "loss": 4.9151, + "step": 705 + }, + { + "epoch": 0.00706, + "grad_norm": 1.1431311909042268, + "learning_rate": 0.002118, + "loss": 4.9234, + "step": 706 + }, + { + "epoch": 0.00707, + "grad_norm": 0.8384635334186645, + "learning_rate": 0.002121, + "loss": 4.8902, + "step": 707 + }, + { + "epoch": 0.00708, + "grad_norm": 0.9594405079672866, + "learning_rate": 0.002124, + "loss": 4.8745, + "step": 708 + }, + { + "epoch": 0.00709, + "grad_norm": 0.8900382869322284, + "learning_rate": 0.002127, + "loss": 4.9161, + "step": 709 + }, + { + "epoch": 0.0071, + "grad_norm": 0.8868570415283396, + "learning_rate": 0.00213, + "loss": 4.9065, + "step": 710 + }, + { + "epoch": 0.00711, + "grad_norm": 0.8610490944817158, + "learning_rate": 0.002133, + "loss": 4.8804, + "step": 711 + }, + { + "epoch": 0.00712, + "grad_norm": 0.8727164938852855, + "learning_rate": 0.002136, + "loss": 4.9046, + "step": 712 + }, + { + "epoch": 0.00713, + "grad_norm": 0.8857525308493206, + "learning_rate": 0.002139, + "loss": 4.9135, + "step": 713 + }, + { + "epoch": 0.00714, + "grad_norm": 0.9495661806955594, + "learning_rate": 0.002142, + "loss": 4.918, + "step": 714 + }, + { + "epoch": 0.00715, + "grad_norm": 1.0263024097609161, + "learning_rate": 0.0021449999999999998, + "loss": 4.8857, + "step": 715 + }, + { + "epoch": 0.00716, + "grad_norm": 0.8876358680026493, + "learning_rate": 0.002148, + "loss": 4.8749, + "step": 716 + }, + { + "epoch": 0.00717, + "grad_norm": 0.8225498605776377, + "learning_rate": 0.002151, + "loss": 4.8925, + "step": 717 + }, + { + "epoch": 0.00718, + "grad_norm": 0.628552485265691, + "learning_rate": 0.002154, + "loss": 4.8659, + "step": 718 + }, + { + "epoch": 0.00719, + "grad_norm": 0.6584104654465238, + "learning_rate": 0.002157, + "loss": 4.8747, + "step": 719 + }, + { + "epoch": 0.0072, + "grad_norm": 0.6698592474865601, + "learning_rate": 0.00216, + "loss": 4.8635, + "step": 720 + }, + { + "epoch": 0.00721, + "grad_norm": 0.6673590176314685, + "learning_rate": 0.002163, + "loss": 4.8639, + "step": 721 + }, + { + "epoch": 0.00722, + "grad_norm": 0.6674098225397388, + "learning_rate": 0.002166, + "loss": 4.8386, + "step": 722 + }, + { + "epoch": 0.00723, + "grad_norm": 0.6090726175552883, + "learning_rate": 0.002169, + "loss": 4.8464, + "step": 723 + }, + { + "epoch": 0.00724, + "grad_norm": 0.6325507361418539, + "learning_rate": 0.002172, + "loss": 4.8403, + "step": 724 + }, + { + "epoch": 0.00725, + "grad_norm": 0.6927587431932604, + "learning_rate": 0.002175, + "loss": 4.8341, + "step": 725 + }, + { + "epoch": 0.00726, + "grad_norm": 0.7422551683158218, + "learning_rate": 0.002178, + "loss": 4.8448, + "step": 726 + }, + { + "epoch": 0.00727, + "grad_norm": 0.7946686392459241, + "learning_rate": 0.0021809999999999998, + "loss": 4.8264, + "step": 727 + }, + { + "epoch": 0.00728, + "grad_norm": 0.651194780867581, + "learning_rate": 0.002184, + "loss": 4.8373, + "step": 728 + }, + { + "epoch": 0.00729, + "grad_norm": 0.5507866158426874, + "learning_rate": 0.002187, + "loss": 4.8279, + "step": 729 + }, + { + "epoch": 0.0073, + "grad_norm": 0.5770531279665235, + "learning_rate": 0.00219, + "loss": 4.8256, + "step": 730 + }, + { + "epoch": 0.00731, + "grad_norm": 0.6604772562967653, + "learning_rate": 0.002193, + "loss": 4.8198, + "step": 731 + }, + { + "epoch": 0.00732, + "grad_norm": 0.7902754963422924, + "learning_rate": 0.002196, + "loss": 4.832, + "step": 732 + }, + { + "epoch": 0.00733, + "grad_norm": 0.8530754775219535, + "learning_rate": 0.002199, + "loss": 4.814, + "step": 733 + }, + { + "epoch": 0.00734, + "grad_norm": 0.808835939559151, + "learning_rate": 0.002202, + "loss": 4.8365, + "step": 734 + }, + { + "epoch": 0.00735, + "grad_norm": 0.7793455630355729, + "learning_rate": 0.002205, + "loss": 4.8484, + "step": 735 + }, + { + "epoch": 0.00736, + "grad_norm": 0.8899384636665534, + "learning_rate": 0.002208, + "loss": 4.8322, + "step": 736 + }, + { + "epoch": 0.00737, + "grad_norm": 1.0166382867407526, + "learning_rate": 0.002211, + "loss": 4.8145, + "step": 737 + }, + { + "epoch": 0.00738, + "grad_norm": 0.9857126416807526, + "learning_rate": 0.002214, + "loss": 4.8213, + "step": 738 + }, + { + "epoch": 0.00739, + "grad_norm": 0.8982016923721464, + "learning_rate": 0.0022170000000000002, + "loss": 4.809, + "step": 739 + }, + { + "epoch": 0.0074, + "grad_norm": 0.8517808259905928, + "learning_rate": 0.00222, + "loss": 4.8138, + "step": 740 + }, + { + "epoch": 0.00741, + "grad_norm": 0.6571636698582773, + "learning_rate": 0.002223, + "loss": 4.778, + "step": 741 + }, + { + "epoch": 0.00742, + "grad_norm": 0.5983530041008951, + "learning_rate": 0.002226, + "loss": 4.8043, + "step": 742 + }, + { + "epoch": 0.00743, + "grad_norm": 0.613767022264535, + "learning_rate": 0.002229, + "loss": 4.772, + "step": 743 + }, + { + "epoch": 0.00744, + "grad_norm": 0.5746332772613801, + "learning_rate": 0.002232, + "loss": 4.7755, + "step": 744 + }, + { + "epoch": 0.00745, + "grad_norm": 0.5491916195482714, + "learning_rate": 0.002235, + "loss": 4.7792, + "step": 745 + }, + { + "epoch": 0.00746, + "grad_norm": 0.6224704559098753, + "learning_rate": 0.002238, + "loss": 4.7785, + "step": 746 + }, + { + "epoch": 0.00747, + "grad_norm": 0.721423929849636, + "learning_rate": 0.002241, + "loss": 4.766, + "step": 747 + }, + { + "epoch": 0.00748, + "grad_norm": 0.8211263973312402, + "learning_rate": 0.002244, + "loss": 4.7825, + "step": 748 + }, + { + "epoch": 0.00749, + "grad_norm": 0.9630311220772746, + "learning_rate": 0.002247, + "loss": 4.7687, + "step": 749 + }, + { + "epoch": 0.0075, + "grad_norm": 1.0671208539128567, + "learning_rate": 0.0022500000000000003, + "loss": 4.8201, + "step": 750 + }, + { + "epoch": 0.00751, + "grad_norm": 1.0285377005887373, + "learning_rate": 0.0022530000000000002, + "loss": 4.8053, + "step": 751 + }, + { + "epoch": 0.00752, + "grad_norm": 0.9937387135055332, + "learning_rate": 0.002256, + "loss": 4.7635, + "step": 752 + }, + { + "epoch": 0.00753, + "grad_norm": 0.8939099354397223, + "learning_rate": 0.002259, + "loss": 4.8048, + "step": 753 + }, + { + "epoch": 0.00754, + "grad_norm": 0.958657384547811, + "learning_rate": 0.002262, + "loss": 4.816, + "step": 754 + }, + { + "epoch": 0.00755, + "grad_norm": 0.8579165829387244, + "learning_rate": 0.002265, + "loss": 4.7809, + "step": 755 + }, + { + "epoch": 0.00756, + "grad_norm": 0.8147493051985796, + "learning_rate": 0.002268, + "loss": 4.7687, + "step": 756 + }, + { + "epoch": 0.00757, + "grad_norm": 0.9899674342411924, + "learning_rate": 0.002271, + "loss": 4.7845, + "step": 757 + }, + { + "epoch": 0.00758, + "grad_norm": 1.2327208130150207, + "learning_rate": 0.002274, + "loss": 4.7967, + "step": 758 + }, + { + "epoch": 0.00759, + "grad_norm": 0.8760473410677282, + "learning_rate": 0.002277, + "loss": 4.7951, + "step": 759 + }, + { + "epoch": 0.0076, + "grad_norm": 0.9553393765090987, + "learning_rate": 0.00228, + "loss": 4.7842, + "step": 760 + }, + { + "epoch": 0.00761, + "grad_norm": 1.088855928225056, + "learning_rate": 0.002283, + "loss": 4.769, + "step": 761 + }, + { + "epoch": 0.00762, + "grad_norm": 0.9818480683263884, + "learning_rate": 0.0022860000000000003, + "loss": 4.7512, + "step": 762 + }, + { + "epoch": 0.00763, + "grad_norm": 0.9731540924632093, + "learning_rate": 0.0022890000000000002, + "loss": 4.7931, + "step": 763 + }, + { + "epoch": 0.00764, + "grad_norm": 1.0508884929557651, + "learning_rate": 0.002292, + "loss": 4.8167, + "step": 764 + }, + { + "epoch": 0.00765, + "grad_norm": 1.0020863769727308, + "learning_rate": 0.002295, + "loss": 4.7984, + "step": 765 + }, + { + "epoch": 0.00766, + "grad_norm": 1.1527463652354557, + "learning_rate": 0.002298, + "loss": 4.8085, + "step": 766 + }, + { + "epoch": 0.00767, + "grad_norm": 0.9657952239159258, + "learning_rate": 0.002301, + "loss": 4.7959, + "step": 767 + }, + { + "epoch": 0.00768, + "grad_norm": 1.0234917976922082, + "learning_rate": 0.002304, + "loss": 4.8012, + "step": 768 + }, + { + "epoch": 0.00769, + "grad_norm": 0.9850893067060651, + "learning_rate": 0.002307, + "loss": 4.8144, + "step": 769 + }, + { + "epoch": 0.0077, + "grad_norm": 0.9062134932024389, + "learning_rate": 0.00231, + "loss": 4.7653, + "step": 770 + }, + { + "epoch": 0.00771, + "grad_norm": 0.8476285286232204, + "learning_rate": 0.002313, + "loss": 4.7979, + "step": 771 + }, + { + "epoch": 0.00772, + "grad_norm": 0.9122213123018311, + "learning_rate": 0.002316, + "loss": 4.7851, + "step": 772 + }, + { + "epoch": 0.00773, + "grad_norm": 1.0718910624781612, + "learning_rate": 0.0023190000000000003, + "loss": 4.8052, + "step": 773 + }, + { + "epoch": 0.00774, + "grad_norm": 0.7792131883523417, + "learning_rate": 0.0023220000000000003, + "loss": 4.7945, + "step": 774 + }, + { + "epoch": 0.00775, + "grad_norm": 0.7995411986928386, + "learning_rate": 0.0023250000000000002, + "loss": 4.7914, + "step": 775 + }, + { + "epoch": 0.00776, + "grad_norm": 0.7054590225014301, + "learning_rate": 0.002328, + "loss": 4.7883, + "step": 776 + }, + { + "epoch": 0.00777, + "grad_norm": 0.6505869359405926, + "learning_rate": 0.002331, + "loss": 4.7585, + "step": 777 + }, + { + "epoch": 0.00778, + "grad_norm": 0.6484695284206986, + "learning_rate": 0.002334, + "loss": 4.7652, + "step": 778 + }, + { + "epoch": 0.00779, + "grad_norm": 0.6047799586124271, + "learning_rate": 0.002337, + "loss": 4.7239, + "step": 779 + }, + { + "epoch": 0.0078, + "grad_norm": 0.5436502526586032, + "learning_rate": 0.00234, + "loss": 4.7364, + "step": 780 + }, + { + "epoch": 0.00781, + "grad_norm": 0.5682167623371829, + "learning_rate": 0.002343, + "loss": 4.7355, + "step": 781 + }, + { + "epoch": 0.00782, + "grad_norm": 0.628910610744215, + "learning_rate": 0.002346, + "loss": 4.7356, + "step": 782 + }, + { + "epoch": 0.00783, + "grad_norm": 0.5845457532196663, + "learning_rate": 0.002349, + "loss": 4.7086, + "step": 783 + }, + { + "epoch": 0.00784, + "grad_norm": 0.6345965495249546, + "learning_rate": 0.002352, + "loss": 4.723, + "step": 784 + }, + { + "epoch": 0.00785, + "grad_norm": 0.4959017064429773, + "learning_rate": 0.0023550000000000003, + "loss": 4.7138, + "step": 785 + }, + { + "epoch": 0.00786, + "grad_norm": 0.4358915945164792, + "learning_rate": 0.0023580000000000003, + "loss": 4.7131, + "step": 786 + }, + { + "epoch": 0.00787, + "grad_norm": 0.4259891850743534, + "learning_rate": 0.0023610000000000003, + "loss": 4.7113, + "step": 787 + }, + { + "epoch": 0.00788, + "grad_norm": 0.4529949912379573, + "learning_rate": 0.002364, + "loss": 4.7077, + "step": 788 + }, + { + "epoch": 0.00789, + "grad_norm": 0.4944753699230628, + "learning_rate": 0.002367, + "loss": 4.7334, + "step": 789 + }, + { + "epoch": 0.0079, + "grad_norm": 0.6295031827770177, + "learning_rate": 0.00237, + "loss": 4.7144, + "step": 790 + }, + { + "epoch": 0.00791, + "grad_norm": 0.8474014146600959, + "learning_rate": 0.002373, + "loss": 4.6777, + "step": 791 + }, + { + "epoch": 0.00792, + "grad_norm": 0.8040402155973354, + "learning_rate": 0.002376, + "loss": 4.7267, + "step": 792 + }, + { + "epoch": 0.00793, + "grad_norm": 0.5568568781614048, + "learning_rate": 0.002379, + "loss": 4.7006, + "step": 793 + }, + { + "epoch": 0.00794, + "grad_norm": 0.8220014797505664, + "learning_rate": 0.002382, + "loss": 4.7246, + "step": 794 + }, + { + "epoch": 0.00795, + "grad_norm": 0.6551332917875898, + "learning_rate": 0.002385, + "loss": 4.6938, + "step": 795 + }, + { + "epoch": 0.00796, + "grad_norm": 0.5666814801389223, + "learning_rate": 0.0023880000000000004, + "loss": 4.6826, + "step": 796 + }, + { + "epoch": 0.00797, + "grad_norm": 0.654430341218369, + "learning_rate": 0.0023910000000000003, + "loss": 4.7248, + "step": 797 + }, + { + "epoch": 0.00798, + "grad_norm": 0.5511512463730408, + "learning_rate": 0.0023940000000000003, + "loss": 4.6864, + "step": 798 + }, + { + "epoch": 0.00799, + "grad_norm": 0.5084174359945534, + "learning_rate": 0.0023970000000000003, + "loss": 4.7161, + "step": 799 + }, + { + "epoch": 0.008, + "grad_norm": 0.41968398472595075, + "learning_rate": 0.0024000000000000002, + "loss": 4.6652, + "step": 800 + }, + { + "epoch": 0.00801, + "grad_norm": 0.4546309911468048, + "learning_rate": 0.002403, + "loss": 4.6967, + "step": 801 + }, + { + "epoch": 0.00802, + "grad_norm": 0.430018994115786, + "learning_rate": 0.002406, + "loss": 4.6827, + "step": 802 + }, + { + "epoch": 0.00803, + "grad_norm": 0.44614830716967085, + "learning_rate": 0.002409, + "loss": 4.6617, + "step": 803 + }, + { + "epoch": 0.00804, + "grad_norm": 0.44722400844593674, + "learning_rate": 0.002412, + "loss": 4.6598, + "step": 804 + }, + { + "epoch": 0.00805, + "grad_norm": 0.5179127215582825, + "learning_rate": 0.002415, + "loss": 4.6599, + "step": 805 + }, + { + "epoch": 0.00806, + "grad_norm": 0.5610832008078775, + "learning_rate": 0.002418, + "loss": 4.677, + "step": 806 + }, + { + "epoch": 0.00807, + "grad_norm": 0.5167453223410896, + "learning_rate": 0.0024210000000000004, + "loss": 4.6671, + "step": 807 + }, + { + "epoch": 0.00808, + "grad_norm": 0.46468933196331563, + "learning_rate": 0.0024240000000000004, + "loss": 4.6511, + "step": 808 + }, + { + "epoch": 0.00809, + "grad_norm": 0.5221883532574668, + "learning_rate": 0.0024270000000000003, + "loss": 4.6468, + "step": 809 + }, + { + "epoch": 0.0081, + "grad_norm": 0.4992566900849729, + "learning_rate": 0.0024300000000000003, + "loss": 4.6744, + "step": 810 + }, + { + "epoch": 0.00811, + "grad_norm": 0.4854147467055134, + "learning_rate": 0.0024330000000000003, + "loss": 4.646, + "step": 811 + }, + { + "epoch": 0.00812, + "grad_norm": 0.650970729431075, + "learning_rate": 0.0024360000000000002, + "loss": 4.6307, + "step": 812 + }, + { + "epoch": 0.00813, + "grad_norm": 0.8160691589494683, + "learning_rate": 0.0024389999999999998, + "loss": 4.6711, + "step": 813 + }, + { + "epoch": 0.00814, + "grad_norm": 0.9918101747931352, + "learning_rate": 0.0024419999999999997, + "loss": 4.6946, + "step": 814 + }, + { + "epoch": 0.00815, + "grad_norm": 1.247963175893729, + "learning_rate": 0.0024449999999999997, + "loss": 4.7226, + "step": 815 + }, + { + "epoch": 0.00816, + "grad_norm": 0.8376200515557375, + "learning_rate": 0.002448, + "loss": 4.6777, + "step": 816 + }, + { + "epoch": 0.00817, + "grad_norm": 0.9161032619759178, + "learning_rate": 0.002451, + "loss": 4.6939, + "step": 817 + }, + { + "epoch": 0.00818, + "grad_norm": 1.0914649908014256, + "learning_rate": 0.002454, + "loss": 4.6886, + "step": 818 + }, + { + "epoch": 0.00819, + "grad_norm": 0.9806171410774952, + "learning_rate": 0.002457, + "loss": 4.712, + "step": 819 + }, + { + "epoch": 0.0082, + "grad_norm": 0.992236077471004, + "learning_rate": 0.00246, + "loss": 4.6918, + "step": 820 + }, + { + "epoch": 0.00821, + "grad_norm": 1.0594557870263281, + "learning_rate": 0.002463, + "loss": 4.6759, + "step": 821 + }, + { + "epoch": 0.00822, + "grad_norm": 1.0346800919438124, + "learning_rate": 0.002466, + "loss": 4.6853, + "step": 822 + }, + { + "epoch": 0.00823, + "grad_norm": 0.9573573191186882, + "learning_rate": 0.002469, + "loss": 4.6833, + "step": 823 + }, + { + "epoch": 0.00824, + "grad_norm": 1.1123514933123841, + "learning_rate": 0.002472, + "loss": 4.714, + "step": 824 + }, + { + "epoch": 0.00825, + "grad_norm": 0.8463845700248506, + "learning_rate": 0.0024749999999999998, + "loss": 4.7191, + "step": 825 + }, + { + "epoch": 0.00826, + "grad_norm": 0.8444785606085857, + "learning_rate": 0.0024779999999999997, + "loss": 4.672, + "step": 826 + }, + { + "epoch": 0.00827, + "grad_norm": 0.9726341870117121, + "learning_rate": 0.002481, + "loss": 4.7078, + "step": 827 + }, + { + "epoch": 0.00828, + "grad_norm": 0.9106448417621353, + "learning_rate": 0.002484, + "loss": 4.7003, + "step": 828 + }, + { + "epoch": 0.00829, + "grad_norm": 0.7565680418878746, + "learning_rate": 0.002487, + "loss": 4.6856, + "step": 829 + }, + { + "epoch": 0.0083, + "grad_norm": 0.8537774465977133, + "learning_rate": 0.00249, + "loss": 4.7017, + "step": 830 + }, + { + "epoch": 0.00831, + "grad_norm": 0.9023323948099834, + "learning_rate": 0.002493, + "loss": 4.6871, + "step": 831 + }, + { + "epoch": 0.00832, + "grad_norm": 0.8524529451127855, + "learning_rate": 0.002496, + "loss": 4.6815, + "step": 832 + }, + { + "epoch": 0.00833, + "grad_norm": 0.9428655185832147, + "learning_rate": 0.002499, + "loss": 4.6808, + "step": 833 + }, + { + "epoch": 0.00834, + "grad_norm": 0.9597220185428569, + "learning_rate": 0.002502, + "loss": 4.6879, + "step": 834 + }, + { + "epoch": 0.00835, + "grad_norm": 0.7735101632354252, + "learning_rate": 0.002505, + "loss": 4.6724, + "step": 835 + }, + { + "epoch": 0.00836, + "grad_norm": 0.9597202731139803, + "learning_rate": 0.002508, + "loss": 4.7061, + "step": 836 + }, + { + "epoch": 0.00837, + "grad_norm": 0.9520863539431935, + "learning_rate": 0.0025109999999999998, + "loss": 4.6636, + "step": 837 + }, + { + "epoch": 0.00838, + "grad_norm": 0.7800128524395746, + "learning_rate": 0.0025139999999999997, + "loss": 4.6721, + "step": 838 + }, + { + "epoch": 0.00839, + "grad_norm": 0.8122589832425033, + "learning_rate": 0.002517, + "loss": 4.675, + "step": 839 + }, + { + "epoch": 0.0084, + "grad_norm": 0.8183344402395425, + "learning_rate": 0.00252, + "loss": 4.6669, + "step": 840 + }, + { + "epoch": 0.00841, + "grad_norm": 0.6918735110390536, + "learning_rate": 0.002523, + "loss": 4.6489, + "step": 841 + }, + { + "epoch": 0.00842, + "grad_norm": 0.6201385747244391, + "learning_rate": 0.002526, + "loss": 4.6423, + "step": 842 + }, + { + "epoch": 0.00843, + "grad_norm": 0.606127970479136, + "learning_rate": 0.002529, + "loss": 4.6465, + "step": 843 + }, + { + "epoch": 0.00844, + "grad_norm": 0.5515773209874846, + "learning_rate": 0.002532, + "loss": 4.6607, + "step": 844 + }, + { + "epoch": 0.00845, + "grad_norm": 0.6203742299859808, + "learning_rate": 0.002535, + "loss": 4.6293, + "step": 845 + }, + { + "epoch": 0.00846, + "grad_norm": 0.5875832865020281, + "learning_rate": 0.002538, + "loss": 4.6474, + "step": 846 + }, + { + "epoch": 0.00847, + "grad_norm": 0.5703256353430879, + "learning_rate": 0.002541, + "loss": 4.6282, + "step": 847 + }, + { + "epoch": 0.00848, + "grad_norm": 0.602830367643936, + "learning_rate": 0.002544, + "loss": 4.6269, + "step": 848 + }, + { + "epoch": 0.00849, + "grad_norm": 0.6741507039909044, + "learning_rate": 0.002547, + "loss": 4.6233, + "step": 849 + }, + { + "epoch": 0.0085, + "grad_norm": 0.6288739006540759, + "learning_rate": 0.00255, + "loss": 4.6341, + "step": 850 + }, + { + "epoch": 0.00851, + "grad_norm": 0.5820099008678455, + "learning_rate": 0.002553, + "loss": 4.644, + "step": 851 + }, + { + "epoch": 0.00852, + "grad_norm": 0.586123912558797, + "learning_rate": 0.002556, + "loss": 4.6367, + "step": 852 + }, + { + "epoch": 0.00853, + "grad_norm": 0.5127813487098001, + "learning_rate": 0.002559, + "loss": 4.6085, + "step": 853 + }, + { + "epoch": 0.00854, + "grad_norm": 0.4730499644759358, + "learning_rate": 0.002562, + "loss": 4.6029, + "step": 854 + }, + { + "epoch": 0.00855, + "grad_norm": 0.44708869980986227, + "learning_rate": 0.002565, + "loss": 4.5799, + "step": 855 + }, + { + "epoch": 0.00856, + "grad_norm": 0.466044480858233, + "learning_rate": 0.002568, + "loss": 4.6142, + "step": 856 + }, + { + "epoch": 0.00857, + "grad_norm": 0.5382201915945353, + "learning_rate": 0.002571, + "loss": 4.6036, + "step": 857 + }, + { + "epoch": 0.00858, + "grad_norm": 0.6780662034295477, + "learning_rate": 0.002574, + "loss": 4.609, + "step": 858 + }, + { + "epoch": 0.00859, + "grad_norm": 0.9086610382483981, + "learning_rate": 0.002577, + "loss": 4.6039, + "step": 859 + }, + { + "epoch": 0.0086, + "grad_norm": 0.8563688949272525, + "learning_rate": 0.00258, + "loss": 4.6531, + "step": 860 + }, + { + "epoch": 0.00861, + "grad_norm": 0.5965670098126366, + "learning_rate": 0.0025830000000000002, + "loss": 4.6461, + "step": 861 + }, + { + "epoch": 0.00862, + "grad_norm": 0.7975052365958228, + "learning_rate": 0.002586, + "loss": 4.611, + "step": 862 + }, + { + "epoch": 0.00863, + "grad_norm": 0.650099032572018, + "learning_rate": 0.002589, + "loss": 4.6284, + "step": 863 + }, + { + "epoch": 0.00864, + "grad_norm": 0.6277114763068243, + "learning_rate": 0.002592, + "loss": 4.5809, + "step": 864 + }, + { + "epoch": 0.00865, + "grad_norm": 0.7499269309750987, + "learning_rate": 0.002595, + "loss": 4.6024, + "step": 865 + }, + { + "epoch": 0.00866, + "grad_norm": 0.7085973518403954, + "learning_rate": 0.002598, + "loss": 4.6233, + "step": 866 + }, + { + "epoch": 0.00867, + "grad_norm": 0.605538925445329, + "learning_rate": 0.002601, + "loss": 4.6101, + "step": 867 + }, + { + "epoch": 0.00868, + "grad_norm": 0.48200776305054654, + "learning_rate": 0.002604, + "loss": 4.586, + "step": 868 + }, + { + "epoch": 0.00869, + "grad_norm": 0.5266950965425763, + "learning_rate": 0.002607, + "loss": 4.5966, + "step": 869 + }, + { + "epoch": 0.0087, + "grad_norm": 0.48953699231607295, + "learning_rate": 0.00261, + "loss": 4.5832, + "step": 870 + }, + { + "epoch": 0.00871, + "grad_norm": 0.5478274928438833, + "learning_rate": 0.002613, + "loss": 4.5992, + "step": 871 + }, + { + "epoch": 0.00872, + "grad_norm": 0.6263670684952429, + "learning_rate": 0.002616, + "loss": 4.5864, + "step": 872 + }, + { + "epoch": 0.00873, + "grad_norm": 0.6672951132542, + "learning_rate": 0.0026190000000000002, + "loss": 4.5977, + "step": 873 + }, + { + "epoch": 0.00874, + "grad_norm": 0.677096167715366, + "learning_rate": 0.002622, + "loss": 4.617, + "step": 874 + }, + { + "epoch": 0.00875, + "grad_norm": 0.6959913524482387, + "learning_rate": 0.002625, + "loss": 4.5696, + "step": 875 + }, + { + "epoch": 0.00876, + "grad_norm": 0.6682762743495083, + "learning_rate": 0.002628, + "loss": 4.5958, + "step": 876 + }, + { + "epoch": 0.00877, + "grad_norm": 0.6375266502117092, + "learning_rate": 0.002631, + "loss": 4.612, + "step": 877 + }, + { + "epoch": 0.00878, + "grad_norm": 0.7079418482290942, + "learning_rate": 0.002634, + "loss": 4.5486, + "step": 878 + }, + { + "epoch": 0.00879, + "grad_norm": 0.6282689223941402, + "learning_rate": 0.002637, + "loss": 4.578, + "step": 879 + }, + { + "epoch": 0.0088, + "grad_norm": 0.5460943252882049, + "learning_rate": 0.00264, + "loss": 4.5852, + "step": 880 + }, + { + "epoch": 0.00881, + "grad_norm": 0.5723972494402886, + "learning_rate": 0.002643, + "loss": 4.5869, + "step": 881 + }, + { + "epoch": 0.00882, + "grad_norm": 0.647818443655113, + "learning_rate": 0.002646, + "loss": 4.588, + "step": 882 + }, + { + "epoch": 0.00883, + "grad_norm": 0.8827068805337381, + "learning_rate": 0.002649, + "loss": 4.5935, + "step": 883 + }, + { + "epoch": 0.00884, + "grad_norm": 1.3000201706023533, + "learning_rate": 0.0026520000000000003, + "loss": 4.6052, + "step": 884 + }, + { + "epoch": 0.00885, + "grad_norm": 0.7527768384442359, + "learning_rate": 0.0026550000000000002, + "loss": 4.5797, + "step": 885 + }, + { + "epoch": 0.00886, + "grad_norm": 0.7313595200920677, + "learning_rate": 0.002658, + "loss": 4.6019, + "step": 886 + }, + { + "epoch": 0.00887, + "grad_norm": 0.553327654847044, + "learning_rate": 0.002661, + "loss": 4.5828, + "step": 887 + }, + { + "epoch": 0.00888, + "grad_norm": 0.6064219625843388, + "learning_rate": 0.002664, + "loss": 4.5894, + "step": 888 + }, + { + "epoch": 0.00889, + "grad_norm": 0.6392357596846293, + "learning_rate": 0.002667, + "loss": 4.5422, + "step": 889 + }, + { + "epoch": 0.0089, + "grad_norm": 0.6860123914477424, + "learning_rate": 0.00267, + "loss": 4.5989, + "step": 890 + }, + { + "epoch": 0.00891, + "grad_norm": 0.7088960904364014, + "learning_rate": 0.002673, + "loss": 4.5822, + "step": 891 + }, + { + "epoch": 0.00892, + "grad_norm": 0.7157207147763361, + "learning_rate": 0.002676, + "loss": 4.5934, + "step": 892 + }, + { + "epoch": 0.00893, + "grad_norm": 0.7412527752908875, + "learning_rate": 0.002679, + "loss": 4.5709, + "step": 893 + }, + { + "epoch": 0.00894, + "grad_norm": 0.8084836835989728, + "learning_rate": 0.002682, + "loss": 4.5639, + "step": 894 + }, + { + "epoch": 0.00895, + "grad_norm": 0.9923307111818513, + "learning_rate": 0.0026850000000000003, + "loss": 4.5864, + "step": 895 + }, + { + "epoch": 0.00896, + "grad_norm": 1.2171682577354312, + "learning_rate": 0.0026880000000000003, + "loss": 4.6057, + "step": 896 + }, + { + "epoch": 0.00897, + "grad_norm": 0.797478427208377, + "learning_rate": 0.0026910000000000002, + "loss": 4.5989, + "step": 897 + }, + { + "epoch": 0.00898, + "grad_norm": 0.7928728804117916, + "learning_rate": 0.002694, + "loss": 4.594, + "step": 898 + }, + { + "epoch": 0.00899, + "grad_norm": 0.8357403035452178, + "learning_rate": 0.002697, + "loss": 4.5983, + "step": 899 + }, + { + "epoch": 0.009, + "grad_norm": 0.8448290091163538, + "learning_rate": 0.0027, + "loss": 4.6292, + "step": 900 + }, + { + "epoch": 0.00901, + "grad_norm": 0.9488092229670547, + "learning_rate": 0.002703, + "loss": 4.5868, + "step": 901 + }, + { + "epoch": 0.00902, + "grad_norm": 0.9434404658743749, + "learning_rate": 0.002706, + "loss": 4.5999, + "step": 902 + }, + { + "epoch": 0.00903, + "grad_norm": 1.0122099567822476, + "learning_rate": 0.002709, + "loss": 4.6102, + "step": 903 + }, + { + "epoch": 0.00904, + "grad_norm": 0.9358691681287052, + "learning_rate": 0.002712, + "loss": 4.5848, + "step": 904 + }, + { + "epoch": 0.00905, + "grad_norm": 0.8321510442485943, + "learning_rate": 0.002715, + "loss": 4.5984, + "step": 905 + }, + { + "epoch": 0.00906, + "grad_norm": 0.8914473393947665, + "learning_rate": 0.002718, + "loss": 4.6112, + "step": 906 + }, + { + "epoch": 0.00907, + "grad_norm": 0.9883982303638487, + "learning_rate": 0.0027210000000000003, + "loss": 4.6386, + "step": 907 + }, + { + "epoch": 0.00908, + "grad_norm": 0.86073203349026, + "learning_rate": 0.0027240000000000003, + "loss": 4.6116, + "step": 908 + }, + { + "epoch": 0.00909, + "grad_norm": 0.7773747412069614, + "learning_rate": 0.0027270000000000003, + "loss": 4.6163, + "step": 909 + }, + { + "epoch": 0.0091, + "grad_norm": 0.7370585718531062, + "learning_rate": 0.0027300000000000002, + "loss": 4.6234, + "step": 910 + }, + { + "epoch": 0.00911, + "grad_norm": 0.6906269071273593, + "learning_rate": 0.002733, + "loss": 4.5785, + "step": 911 + }, + { + "epoch": 0.00912, + "grad_norm": 0.6578032292778252, + "learning_rate": 0.002736, + "loss": 4.5778, + "step": 912 + }, + { + "epoch": 0.00913, + "grad_norm": 0.6528626059582382, + "learning_rate": 0.002739, + "loss": 4.5704, + "step": 913 + }, + { + "epoch": 0.00914, + "grad_norm": 0.599731896856576, + "learning_rate": 0.002742, + "loss": 4.595, + "step": 914 + }, + { + "epoch": 0.00915, + "grad_norm": 0.5922054086035364, + "learning_rate": 0.002745, + "loss": 4.5555, + "step": 915 + }, + { + "epoch": 0.00916, + "grad_norm": 0.528646140228931, + "learning_rate": 0.002748, + "loss": 4.5304, + "step": 916 + }, + { + "epoch": 0.00917, + "grad_norm": 0.5305158198561161, + "learning_rate": 0.002751, + "loss": 4.5419, + "step": 917 + }, + { + "epoch": 0.00918, + "grad_norm": 0.4736382884122071, + "learning_rate": 0.0027540000000000004, + "loss": 4.5569, + "step": 918 + }, + { + "epoch": 0.00919, + "grad_norm": 0.45838817911808083, + "learning_rate": 0.0027570000000000003, + "loss": 4.5357, + "step": 919 + }, + { + "epoch": 0.0092, + "grad_norm": 0.4361472184478695, + "learning_rate": 0.0027600000000000003, + "loss": 4.5493, + "step": 920 + }, + { + "epoch": 0.00921, + "grad_norm": 0.43931126757575867, + "learning_rate": 0.0027630000000000003, + "loss": 4.5336, + "step": 921 + }, + { + "epoch": 0.00922, + "grad_norm": 0.46920285200840567, + "learning_rate": 0.0027660000000000002, + "loss": 4.5412, + "step": 922 + }, + { + "epoch": 0.00923, + "grad_norm": 0.5996209745093354, + "learning_rate": 0.002769, + "loss": 4.5218, + "step": 923 + }, + { + "epoch": 0.00924, + "grad_norm": 0.702528530852281, + "learning_rate": 0.002772, + "loss": 4.5564, + "step": 924 + }, + { + "epoch": 0.00925, + "grad_norm": 0.6651779892024297, + "learning_rate": 0.002775, + "loss": 4.5288, + "step": 925 + }, + { + "epoch": 0.00926, + "grad_norm": 0.5723855300218565, + "learning_rate": 0.002778, + "loss": 4.5426, + "step": 926 + }, + { + "epoch": 0.00927, + "grad_norm": 0.6080148945846373, + "learning_rate": 0.002781, + "loss": 4.5043, + "step": 927 + }, + { + "epoch": 0.00928, + "grad_norm": 0.6957035289314476, + "learning_rate": 0.002784, + "loss": 4.5502, + "step": 928 + }, + { + "epoch": 0.00929, + "grad_norm": 0.7366634335117053, + "learning_rate": 0.0027870000000000004, + "loss": 4.5403, + "step": 929 + }, + { + "epoch": 0.0093, + "grad_norm": 0.7796102692352119, + "learning_rate": 0.0027900000000000004, + "loss": 4.5265, + "step": 930 + }, + { + "epoch": 0.00931, + "grad_norm": 0.7116909602884058, + "learning_rate": 0.0027930000000000003, + "loss": 4.5333, + "step": 931 + }, + { + "epoch": 0.00932, + "grad_norm": 0.8319854024158545, + "learning_rate": 0.0027960000000000003, + "loss": 4.5481, + "step": 932 + }, + { + "epoch": 0.00933, + "grad_norm": 0.8001418279766108, + "learning_rate": 0.0027990000000000003, + "loss": 4.5286, + "step": 933 + }, + { + "epoch": 0.00934, + "grad_norm": 0.6519619232143173, + "learning_rate": 0.0028020000000000002, + "loss": 4.5422, + "step": 934 + }, + { + "epoch": 0.00935, + "grad_norm": 0.7580737482550882, + "learning_rate": 0.002805, + "loss": 4.5559, + "step": 935 + }, + { + "epoch": 0.00936, + "grad_norm": 0.8499038627491867, + "learning_rate": 0.002808, + "loss": 4.5679, + "step": 936 + }, + { + "epoch": 0.00937, + "grad_norm": 0.7597302495348821, + "learning_rate": 0.002811, + "loss": 4.5708, + "step": 937 + }, + { + "epoch": 0.00938, + "grad_norm": 0.9878821641788273, + "learning_rate": 0.002814, + "loss": 4.5609, + "step": 938 + }, + { + "epoch": 0.00939, + "grad_norm": 0.9691729918808772, + "learning_rate": 0.002817, + "loss": 4.563, + "step": 939 + }, + { + "epoch": 0.0094, + "grad_norm": 0.8937843559478598, + "learning_rate": 0.00282, + "loss": 4.56, + "step": 940 + }, + { + "epoch": 0.00941, + "grad_norm": 0.9477839045288606, + "learning_rate": 0.002823, + "loss": 4.5409, + "step": 941 + }, + { + "epoch": 0.00942, + "grad_norm": 1.088560613057821, + "learning_rate": 0.002826, + "loss": 4.5819, + "step": 942 + }, + { + "epoch": 0.00943, + "grad_norm": 0.8020128186220904, + "learning_rate": 0.002829, + "loss": 4.556, + "step": 943 + }, + { + "epoch": 0.00944, + "grad_norm": 0.7970499406732843, + "learning_rate": 0.002832, + "loss": 4.5652, + "step": 944 + }, + { + "epoch": 0.00945, + "grad_norm": 0.760430287307007, + "learning_rate": 0.002835, + "loss": 4.567, + "step": 945 + }, + { + "epoch": 0.00946, + "grad_norm": 0.8410168172764453, + "learning_rate": 0.002838, + "loss": 4.5808, + "step": 946 + }, + { + "epoch": 0.00947, + "grad_norm": 0.8502364092306604, + "learning_rate": 0.0028409999999999998, + "loss": 4.5581, + "step": 947 + }, + { + "epoch": 0.00948, + "grad_norm": 0.7534324730199542, + "learning_rate": 0.0028439999999999997, + "loss": 4.533, + "step": 948 + }, + { + "epoch": 0.00949, + "grad_norm": 0.8075715283027973, + "learning_rate": 0.002847, + "loss": 4.5789, + "step": 949 + }, + { + "epoch": 0.0095, + "grad_norm": 0.8790685187514339, + "learning_rate": 0.00285, + "loss": 4.5764, + "step": 950 + }, + { + "epoch": 0.00951, + "grad_norm": 0.8527621336415785, + "learning_rate": 0.002853, + "loss": 4.552, + "step": 951 + }, + { + "epoch": 0.00952, + "grad_norm": 0.793648162843131, + "learning_rate": 0.002856, + "loss": 4.5771, + "step": 952 + }, + { + "epoch": 0.00953, + "grad_norm": 0.7100823051409002, + "learning_rate": 0.002859, + "loss": 4.5151, + "step": 953 + }, + { + "epoch": 0.00954, + "grad_norm": 0.776086010454581, + "learning_rate": 0.002862, + "loss": 4.5748, + "step": 954 + }, + { + "epoch": 0.00955, + "grad_norm": 0.7357834745016256, + "learning_rate": 0.002865, + "loss": 4.5651, + "step": 955 + }, + { + "epoch": 0.00956, + "grad_norm": 0.6871788604084053, + "learning_rate": 0.002868, + "loss": 4.5378, + "step": 956 + }, + { + "epoch": 0.00957, + "grad_norm": 0.6293704920093642, + "learning_rate": 0.002871, + "loss": 4.5585, + "step": 957 + }, + { + "epoch": 0.00958, + "grad_norm": 0.6933721545151298, + "learning_rate": 0.002874, + "loss": 4.5402, + "step": 958 + }, + { + "epoch": 0.00959, + "grad_norm": 0.6216290945191316, + "learning_rate": 0.002877, + "loss": 4.5294, + "step": 959 + }, + { + "epoch": 0.0096, + "grad_norm": 0.44090482568449035, + "learning_rate": 0.0028799999999999997, + "loss": 4.5205, + "step": 960 + }, + { + "epoch": 0.00961, + "grad_norm": 0.5026549244936088, + "learning_rate": 0.002883, + "loss": 4.4973, + "step": 961 + }, + { + "epoch": 0.00962, + "grad_norm": 0.46550744372429714, + "learning_rate": 0.002886, + "loss": 4.5199, + "step": 962 + }, + { + "epoch": 0.00963, + "grad_norm": 0.4817462883709995, + "learning_rate": 0.002889, + "loss": 4.5204, + "step": 963 + }, + { + "epoch": 0.00964, + "grad_norm": 0.5021989893794454, + "learning_rate": 0.002892, + "loss": 4.5105, + "step": 964 + }, + { + "epoch": 0.00965, + "grad_norm": 0.6331237702649058, + "learning_rate": 0.002895, + "loss": 4.4888, + "step": 965 + }, + { + "epoch": 0.00966, + "grad_norm": 0.7186463225121739, + "learning_rate": 0.002898, + "loss": 4.5122, + "step": 966 + }, + { + "epoch": 0.00967, + "grad_norm": 0.835541291398658, + "learning_rate": 0.002901, + "loss": 4.5497, + "step": 967 + }, + { + "epoch": 0.00968, + "grad_norm": 0.7770950591910699, + "learning_rate": 0.002904, + "loss": 4.5246, + "step": 968 + }, + { + "epoch": 0.00969, + "grad_norm": 0.6421972738290654, + "learning_rate": 0.002907, + "loss": 4.5465, + "step": 969 + }, + { + "epoch": 0.0097, + "grad_norm": 0.6170493579190435, + "learning_rate": 0.00291, + "loss": 4.5014, + "step": 970 + }, + { + "epoch": 0.00971, + "grad_norm": 0.671893763395282, + "learning_rate": 0.002913, + "loss": 4.5134, + "step": 971 + }, + { + "epoch": 0.00972, + "grad_norm": 0.5384349268117217, + "learning_rate": 0.002916, + "loss": 4.51, + "step": 972 + }, + { + "epoch": 0.00973, + "grad_norm": 0.6570052261370841, + "learning_rate": 0.002919, + "loss": 4.5075, + "step": 973 + }, + { + "epoch": 0.00974, + "grad_norm": 0.6469437996214488, + "learning_rate": 0.002922, + "loss": 4.5042, + "step": 974 + }, + { + "epoch": 0.00975, + "grad_norm": 0.5139434995269291, + "learning_rate": 0.002925, + "loss": 4.5141, + "step": 975 + }, + { + "epoch": 0.00976, + "grad_norm": 0.540350404123188, + "learning_rate": 0.002928, + "loss": 4.4984, + "step": 976 + }, + { + "epoch": 0.00977, + "grad_norm": 0.5640158884340003, + "learning_rate": 0.002931, + "loss": 4.5359, + "step": 977 + }, + { + "epoch": 0.00978, + "grad_norm": 0.5471232322596488, + "learning_rate": 0.002934, + "loss": 4.5069, + "step": 978 + }, + { + "epoch": 0.00979, + "grad_norm": 0.6391692840311302, + "learning_rate": 0.002937, + "loss": 4.4972, + "step": 979 + }, + { + "epoch": 0.0098, + "grad_norm": 0.7065424241899814, + "learning_rate": 0.00294, + "loss": 4.4963, + "step": 980 + }, + { + "epoch": 0.00981, + "grad_norm": 0.6903724510426201, + "learning_rate": 0.002943, + "loss": 4.5078, + "step": 981 + }, + { + "epoch": 0.00982, + "grad_norm": 0.653302049548968, + "learning_rate": 0.002946, + "loss": 4.4991, + "step": 982 + }, + { + "epoch": 0.00983, + "grad_norm": 0.7786836590229197, + "learning_rate": 0.0029490000000000002, + "loss": 4.5061, + "step": 983 + }, + { + "epoch": 0.00984, + "grad_norm": 0.8944946184941699, + "learning_rate": 0.002952, + "loss": 4.5043, + "step": 984 + }, + { + "epoch": 0.00985, + "grad_norm": 0.9434362337974719, + "learning_rate": 0.002955, + "loss": 4.5335, + "step": 985 + }, + { + "epoch": 0.00986, + "grad_norm": 0.9312010498282314, + "learning_rate": 0.002958, + "loss": 4.4908, + "step": 986 + }, + { + "epoch": 0.00987, + "grad_norm": 0.7600059379317153, + "learning_rate": 0.002961, + "loss": 4.5038, + "step": 987 + }, + { + "epoch": 0.00988, + "grad_norm": 0.6949983731085813, + "learning_rate": 0.002964, + "loss": 4.533, + "step": 988 + }, + { + "epoch": 0.00989, + "grad_norm": 0.6958207764457225, + "learning_rate": 0.002967, + "loss": 4.4744, + "step": 989 + }, + { + "epoch": 0.0099, + "grad_norm": 0.8314159874496235, + "learning_rate": 0.00297, + "loss": 4.5218, + "step": 990 + }, + { + "epoch": 0.00991, + "grad_norm": 0.90109982550625, + "learning_rate": 0.002973, + "loss": 4.5224, + "step": 991 + }, + { + "epoch": 0.00992, + "grad_norm": 0.951208720196979, + "learning_rate": 0.002976, + "loss": 4.5281, + "step": 992 + }, + { + "epoch": 0.00993, + "grad_norm": 1.034654706401041, + "learning_rate": 0.002979, + "loss": 4.539, + "step": 993 + }, + { + "epoch": 0.00994, + "grad_norm": 1.0078120294806783, + "learning_rate": 0.002982, + "loss": 4.5459, + "step": 994 + }, + { + "epoch": 0.00995, + "grad_norm": 0.9905056664043064, + "learning_rate": 0.0029850000000000002, + "loss": 4.5461, + "step": 995 + }, + { + "epoch": 0.00996, + "grad_norm": 1.1603429750232241, + "learning_rate": 0.002988, + "loss": 4.5605, + "step": 996 + }, + { + "epoch": 0.00997, + "grad_norm": 0.8595677567729485, + "learning_rate": 0.002991, + "loss": 4.5366, + "step": 997 + }, + { + "epoch": 0.00998, + "grad_norm": 0.9672785170741734, + "learning_rate": 0.002994, + "loss": 4.5467, + "step": 998 + }, + { + "epoch": 0.00999, + "grad_norm": 0.8855476122040005, + "learning_rate": 0.002997, + "loss": 4.5306, + "step": 999 + }, + { + "epoch": 0.01, + "grad_norm": 0.7479993027173912, + "learning_rate": 0.003, + "loss": 4.5436, + "step": 1000 + }, + { + "epoch": 0.01001, + "grad_norm": 0.7121555496393158, + "learning_rate": 0.003, + "loss": 4.5401, + "step": 1001 + }, + { + "epoch": 0.01002, + "grad_norm": 0.72505396659877, + "learning_rate": 0.003, + "loss": 4.5099, + "step": 1002 + }, + { + "epoch": 0.01003, + "grad_norm": 0.7482409624375888, + "learning_rate": 0.003, + "loss": 4.5346, + "step": 1003 + }, + { + "epoch": 0.01004, + "grad_norm": 0.5922239695560899, + "learning_rate": 0.003, + "loss": 4.513, + "step": 1004 + }, + { + "epoch": 0.01005, + "grad_norm": 0.7163386651042714, + "learning_rate": 0.003, + "loss": 4.4897, + "step": 1005 + }, + { + "epoch": 0.01006, + "grad_norm": 0.7664341729228217, + "learning_rate": 0.003, + "loss": 4.4943, + "step": 1006 + }, + { + "epoch": 0.01007, + "grad_norm": 0.6325498197242884, + "learning_rate": 0.003, + "loss": 4.5316, + "step": 1007 + }, + { + "epoch": 0.01008, + "grad_norm": 0.5334181570982156, + "learning_rate": 0.003, + "loss": 4.5325, + "step": 1008 + }, + { + "epoch": 0.01009, + "grad_norm": 0.4756687788935663, + "learning_rate": 0.003, + "loss": 4.4998, + "step": 1009 + }, + { + "epoch": 0.0101, + "grad_norm": 0.44081077367096977, + "learning_rate": 0.003, + "loss": 4.5025, + "step": 1010 + }, + { + "epoch": 0.01011, + "grad_norm": 0.4706704982402646, + "learning_rate": 0.003, + "loss": 4.5082, + "step": 1011 + }, + { + "epoch": 0.01012, + "grad_norm": 0.47223639219621605, + "learning_rate": 0.003, + "loss": 4.492, + "step": 1012 + }, + { + "epoch": 0.01013, + "grad_norm": 0.4611333961201937, + "learning_rate": 0.003, + "loss": 4.5135, + "step": 1013 + }, + { + "epoch": 0.01014, + "grad_norm": 0.36636268932008986, + "learning_rate": 0.003, + "loss": 4.4528, + "step": 1014 + }, + { + "epoch": 0.01015, + "grad_norm": 0.4484711002206511, + "learning_rate": 0.003, + "loss": 4.4948, + "step": 1015 + }, + { + "epoch": 0.01016, + "grad_norm": 0.553209882568818, + "learning_rate": 0.003, + "loss": 4.5082, + "step": 1016 + }, + { + "epoch": 0.01017, + "grad_norm": 0.8898002239967567, + "learning_rate": 0.003, + "loss": 4.4579, + "step": 1017 + }, + { + "epoch": 0.01018, + "grad_norm": 0.9938726896514534, + "learning_rate": 0.003, + "loss": 4.5557, + "step": 1018 + }, + { + "epoch": 0.01019, + "grad_norm": 0.7008138530645361, + "learning_rate": 0.003, + "loss": 4.5003, + "step": 1019 + }, + { + "epoch": 0.0102, + "grad_norm": 1.006685411589756, + "learning_rate": 0.003, + "loss": 4.5244, + "step": 1020 + }, + { + "epoch": 0.01021, + "grad_norm": 0.6295405897908188, + "learning_rate": 0.003, + "loss": 4.4915, + "step": 1021 + }, + { + "epoch": 0.01022, + "grad_norm": 0.6613412501940705, + "learning_rate": 0.003, + "loss": 4.5063, + "step": 1022 + }, + { + "epoch": 0.01023, + "grad_norm": 0.6284055579476905, + "learning_rate": 0.003, + "loss": 4.4914, + "step": 1023 + }, + { + "epoch": 0.01024, + "grad_norm": 0.6735486877543648, + "learning_rate": 0.003, + "loss": 4.4573, + "step": 1024 + }, + { + "epoch": 0.01025, + "grad_norm": 0.6253815453908939, + "learning_rate": 0.003, + "loss": 4.4832, + "step": 1025 + }, + { + "epoch": 0.01026, + "grad_norm": 0.5932239048767913, + "learning_rate": 0.003, + "loss": 4.4735, + "step": 1026 + }, + { + "epoch": 0.01027, + "grad_norm": 0.6318859057234201, + "learning_rate": 0.003, + "loss": 4.4952, + "step": 1027 + }, + { + "epoch": 0.01028, + "grad_norm": 0.6588163663127304, + "learning_rate": 0.003, + "loss": 4.4763, + "step": 1028 + }, + { + "epoch": 0.01029, + "grad_norm": 0.55721560335989, + "learning_rate": 0.003, + "loss": 4.4524, + "step": 1029 + }, + { + "epoch": 0.0103, + "grad_norm": 0.5468199872259095, + "learning_rate": 0.003, + "loss": 4.4809, + "step": 1030 + }, + { + "epoch": 0.01031, + "grad_norm": 0.5601179932704955, + "learning_rate": 0.003, + "loss": 4.4934, + "step": 1031 + }, + { + "epoch": 0.01032, + "grad_norm": 0.5514443283118416, + "learning_rate": 0.003, + "loss": 4.464, + "step": 1032 + }, + { + "epoch": 0.01033, + "grad_norm": 0.49777794102419176, + "learning_rate": 0.003, + "loss": 4.496, + "step": 1033 + }, + { + "epoch": 0.01034, + "grad_norm": 0.5163173300909295, + "learning_rate": 0.003, + "loss": 4.4625, + "step": 1034 + }, + { + "epoch": 0.01035, + "grad_norm": 0.4798196088286999, + "learning_rate": 0.003, + "loss": 4.4573, + "step": 1035 + }, + { + "epoch": 0.01036, + "grad_norm": 0.3958499097440651, + "learning_rate": 0.003, + "loss": 4.4459, + "step": 1036 + }, + { + "epoch": 0.01037, + "grad_norm": 0.3643003447603396, + "learning_rate": 0.003, + "loss": 4.4698, + "step": 1037 + }, + { + "epoch": 0.01038, + "grad_norm": 0.40989678679265595, + "learning_rate": 0.003, + "loss": 4.4667, + "step": 1038 + }, + { + "epoch": 0.01039, + "grad_norm": 0.3544910886173235, + "learning_rate": 0.003, + "loss": 4.4556, + "step": 1039 + }, + { + "epoch": 0.0104, + "grad_norm": 0.3719879786284374, + "learning_rate": 0.003, + "loss": 4.4667, + "step": 1040 + }, + { + "epoch": 0.01041, + "grad_norm": 0.3770068025965155, + "learning_rate": 0.003, + "loss": 4.4469, + "step": 1041 + }, + { + "epoch": 0.01042, + "grad_norm": 0.4469030721524434, + "learning_rate": 0.003, + "loss": 4.4752, + "step": 1042 + }, + { + "epoch": 0.01043, + "grad_norm": 0.7272268008417163, + "learning_rate": 0.003, + "loss": 4.4274, + "step": 1043 + }, + { + "epoch": 0.01044, + "grad_norm": 0.8992907359513324, + "learning_rate": 0.003, + "loss": 4.4949, + "step": 1044 + }, + { + "epoch": 0.01045, + "grad_norm": 0.8302310255488462, + "learning_rate": 0.003, + "loss": 4.4953, + "step": 1045 + }, + { + "epoch": 0.01046, + "grad_norm": 0.9975061123002056, + "learning_rate": 0.003, + "loss": 4.4816, + "step": 1046 + }, + { + "epoch": 0.01047, + "grad_norm": 0.8067658442263902, + "learning_rate": 0.003, + "loss": 4.4954, + "step": 1047 + }, + { + "epoch": 0.01048, + "grad_norm": 0.7572637033913173, + "learning_rate": 0.003, + "loss": 4.4741, + "step": 1048 + }, + { + "epoch": 0.01049, + "grad_norm": 0.8994748129621607, + "learning_rate": 0.003, + "loss": 4.4647, + "step": 1049 + }, + { + "epoch": 0.0105, + "grad_norm": 0.7962039985766673, + "learning_rate": 0.003, + "loss": 4.5006, + "step": 1050 + }, + { + "epoch": 0.01051, + "grad_norm": 0.8207643682537104, + "learning_rate": 0.003, + "loss": 4.4811, + "step": 1051 + }, + { + "epoch": 0.01052, + "grad_norm": 0.7458558375660994, + "learning_rate": 0.003, + "loss": 4.4881, + "step": 1052 + }, + { + "epoch": 0.01053, + "grad_norm": 0.6759075645027314, + "learning_rate": 0.003, + "loss": 4.4946, + "step": 1053 + }, + { + "epoch": 0.01054, + "grad_norm": 0.6790074754239308, + "learning_rate": 0.003, + "loss": 4.4852, + "step": 1054 + }, + { + "epoch": 0.01055, + "grad_norm": 0.6920845821894167, + "learning_rate": 0.003, + "loss": 4.4644, + "step": 1055 + }, + { + "epoch": 0.01056, + "grad_norm": 0.5219544124128492, + "learning_rate": 0.003, + "loss": 4.4569, + "step": 1056 + }, + { + "epoch": 0.01057, + "grad_norm": 0.6021767240952963, + "learning_rate": 0.003, + "loss": 4.4651, + "step": 1057 + }, + { + "epoch": 0.01058, + "grad_norm": 0.5219741420101096, + "learning_rate": 0.003, + "loss": 4.4552, + "step": 1058 + }, + { + "epoch": 0.01059, + "grad_norm": 0.509217286199775, + "learning_rate": 0.003, + "loss": 4.4557, + "step": 1059 + }, + { + "epoch": 0.0106, + "grad_norm": 0.6119681984085901, + "learning_rate": 0.003, + "loss": 4.4506, + "step": 1060 + }, + { + "epoch": 0.01061, + "grad_norm": 0.6018802462486371, + "learning_rate": 0.003, + "loss": 4.4451, + "step": 1061 + }, + { + "epoch": 0.01062, + "grad_norm": 0.6072220138777784, + "learning_rate": 0.003, + "loss": 4.4398, + "step": 1062 + }, + { + "epoch": 0.01063, + "grad_norm": 0.5793152524763621, + "learning_rate": 0.003, + "loss": 4.4372, + "step": 1063 + }, + { + "epoch": 0.01064, + "grad_norm": 0.740973950389461, + "learning_rate": 0.003, + "loss": 4.4576, + "step": 1064 + }, + { + "epoch": 0.01065, + "grad_norm": 0.8998946238241571, + "learning_rate": 0.003, + "loss": 4.4756, + "step": 1065 + }, + { + "epoch": 0.01066, + "grad_norm": 0.889057289027363, + "learning_rate": 0.003, + "loss": 4.4727, + "step": 1066 + }, + { + "epoch": 0.01067, + "grad_norm": 0.9442468447502006, + "learning_rate": 0.003, + "loss": 4.4902, + "step": 1067 + }, + { + "epoch": 0.01068, + "grad_norm": 1.018364097076677, + "learning_rate": 0.003, + "loss": 4.4857, + "step": 1068 + }, + { + "epoch": 0.01069, + "grad_norm": 0.9799434684826753, + "learning_rate": 0.003, + "loss": 4.4924, + "step": 1069 + }, + { + "epoch": 0.0107, + "grad_norm": 0.8992499083099482, + "learning_rate": 0.003, + "loss": 4.5446, + "step": 1070 + }, + { + "epoch": 0.01071, + "grad_norm": 0.8735440073637682, + "learning_rate": 0.003, + "loss": 4.4651, + "step": 1071 + }, + { + "epoch": 0.01072, + "grad_norm": 0.8443267804596191, + "learning_rate": 0.003, + "loss": 4.5031, + "step": 1072 + }, + { + "epoch": 0.01073, + "grad_norm": 0.7866639405001601, + "learning_rate": 0.003, + "loss": 4.4993, + "step": 1073 + }, + { + "epoch": 0.01074, + "grad_norm": 0.7412592488158714, + "learning_rate": 0.003, + "loss": 4.4612, + "step": 1074 + }, + { + "epoch": 0.01075, + "grad_norm": 0.7055777296919327, + "learning_rate": 0.003, + "loss": 4.4949, + "step": 1075 + }, + { + "epoch": 0.01076, + "grad_norm": 0.6602103525167045, + "learning_rate": 0.003, + "loss": 4.4903, + "step": 1076 + }, + { + "epoch": 0.01077, + "grad_norm": 0.6221585201435259, + "learning_rate": 0.003, + "loss": 4.4791, + "step": 1077 + }, + { + "epoch": 0.01078, + "grad_norm": 0.6312740380808073, + "learning_rate": 0.003, + "loss": 4.4373, + "step": 1078 + }, + { + "epoch": 0.01079, + "grad_norm": 0.5465751942660528, + "learning_rate": 0.003, + "loss": 4.4641, + "step": 1079 + }, + { + "epoch": 0.0108, + "grad_norm": 0.5040258102852178, + "learning_rate": 0.003, + "loss": 4.4392, + "step": 1080 + }, + { + "epoch": 0.01081, + "grad_norm": 0.4557353176092044, + "learning_rate": 0.003, + "loss": 4.4439, + "step": 1081 + }, + { + "epoch": 0.01082, + "grad_norm": 0.43657211030445375, + "learning_rate": 0.003, + "loss": 4.4598, + "step": 1082 + }, + { + "epoch": 0.01083, + "grad_norm": 0.37468227521199704, + "learning_rate": 0.003, + "loss": 4.4419, + "step": 1083 + }, + { + "epoch": 0.01084, + "grad_norm": 0.3702365616744669, + "learning_rate": 0.003, + "loss": 4.4465, + "step": 1084 + }, + { + "epoch": 0.01085, + "grad_norm": 0.3248887668649726, + "learning_rate": 0.003, + "loss": 4.4323, + "step": 1085 + }, + { + "epoch": 0.01086, + "grad_norm": 0.28646275942866467, + "learning_rate": 0.003, + "loss": 4.4059, + "step": 1086 + }, + { + "epoch": 0.01087, + "grad_norm": 0.3109295903469056, + "learning_rate": 0.003, + "loss": 4.4215, + "step": 1087 + }, + { + "epoch": 0.01088, + "grad_norm": 0.31305777007935864, + "learning_rate": 0.003, + "loss": 4.454, + "step": 1088 + }, + { + "epoch": 0.01089, + "grad_norm": 0.3202194647994524, + "learning_rate": 0.003, + "loss": 4.423, + "step": 1089 + }, + { + "epoch": 0.0109, + "grad_norm": 0.3546167101166988, + "learning_rate": 0.003, + "loss": 4.4466, + "step": 1090 + }, + { + "epoch": 0.01091, + "grad_norm": 0.40979643182380626, + "learning_rate": 0.003, + "loss": 4.4162, + "step": 1091 + }, + { + "epoch": 0.01092, + "grad_norm": 0.44852044367674554, + "learning_rate": 0.003, + "loss": 4.4131, + "step": 1092 + }, + { + "epoch": 0.01093, + "grad_norm": 0.5395428853759895, + "learning_rate": 0.003, + "loss": 4.4351, + "step": 1093 + }, + { + "epoch": 0.01094, + "grad_norm": 0.7419472173607777, + "learning_rate": 0.003, + "loss": 4.4344, + "step": 1094 + }, + { + "epoch": 0.01095, + "grad_norm": 1.3268112301670232, + "learning_rate": 0.003, + "loss": 4.4875, + "step": 1095 + }, + { + "epoch": 0.01096, + "grad_norm": 0.8107867356113418, + "learning_rate": 0.003, + "loss": 4.4576, + "step": 1096 + }, + { + "epoch": 0.01097, + "grad_norm": 0.6635063888056452, + "learning_rate": 0.003, + "loss": 4.4388, + "step": 1097 + }, + { + "epoch": 0.01098, + "grad_norm": 0.9662363707419442, + "learning_rate": 0.003, + "loss": 4.474, + "step": 1098 + }, + { + "epoch": 0.01099, + "grad_norm": 0.7696101380141932, + "learning_rate": 0.003, + "loss": 4.4405, + "step": 1099 + }, + { + "epoch": 0.011, + "grad_norm": 0.7450236895824817, + "learning_rate": 0.003, + "loss": 4.4451, + "step": 1100 + }, + { + "epoch": 0.01101, + "grad_norm": 0.7254968126029079, + "learning_rate": 0.003, + "loss": 4.4513, + "step": 1101 + }, + { + "epoch": 0.01102, + "grad_norm": 0.7537628081039499, + "learning_rate": 0.003, + "loss": 4.4588, + "step": 1102 + }, + { + "epoch": 0.01103, + "grad_norm": 0.7123173358553468, + "learning_rate": 0.003, + "loss": 4.4551, + "step": 1103 + }, + { + "epoch": 0.01104, + "grad_norm": 0.6033148725235706, + "learning_rate": 0.003, + "loss": 4.4384, + "step": 1104 + }, + { + "epoch": 0.01105, + "grad_norm": 0.7223359575565769, + "learning_rate": 0.003, + "loss": 4.4643, + "step": 1105 + }, + { + "epoch": 0.01106, + "grad_norm": 0.6749769688136343, + "learning_rate": 0.003, + "loss": 4.4355, + "step": 1106 + }, + { + "epoch": 0.01107, + "grad_norm": 0.6519921729558845, + "learning_rate": 0.003, + "loss": 4.443, + "step": 1107 + }, + { + "epoch": 0.01108, + "grad_norm": 0.6442390747026363, + "learning_rate": 0.003, + "loss": 4.4155, + "step": 1108 + }, + { + "epoch": 0.01109, + "grad_norm": 0.5210761074063659, + "learning_rate": 0.003, + "loss": 4.4258, + "step": 1109 + }, + { + "epoch": 0.0111, + "grad_norm": 0.49112961811162964, + "learning_rate": 0.003, + "loss": 4.4439, + "step": 1110 + }, + { + "epoch": 0.01111, + "grad_norm": 0.5154689325771941, + "learning_rate": 0.003, + "loss": 4.4228, + "step": 1111 + }, + { + "epoch": 0.01112, + "grad_norm": 0.5147734027357755, + "learning_rate": 0.003, + "loss": 4.4383, + "step": 1112 + }, + { + "epoch": 0.01113, + "grad_norm": 0.4905886669705696, + "learning_rate": 0.003, + "loss": 4.4046, + "step": 1113 + }, + { + "epoch": 0.01114, + "grad_norm": 0.47555859430135783, + "learning_rate": 0.003, + "loss": 4.4166, + "step": 1114 + }, + { + "epoch": 0.01115, + "grad_norm": 0.42768329796719207, + "learning_rate": 0.003, + "loss": 4.4369, + "step": 1115 + }, + { + "epoch": 0.01116, + "grad_norm": 0.4281709726428523, + "learning_rate": 0.003, + "loss": 4.4286, + "step": 1116 + }, + { + "epoch": 0.01117, + "grad_norm": 0.4124788023594352, + "learning_rate": 0.003, + "loss": 4.414, + "step": 1117 + }, + { + "epoch": 0.01118, + "grad_norm": 0.4531459371875401, + "learning_rate": 0.003, + "loss": 4.415, + "step": 1118 + }, + { + "epoch": 0.01119, + "grad_norm": 0.5115410769007107, + "learning_rate": 0.003, + "loss": 4.4247, + "step": 1119 + }, + { + "epoch": 0.0112, + "grad_norm": 0.6598099600325713, + "learning_rate": 0.003, + "loss": 4.4115, + "step": 1120 + }, + { + "epoch": 0.01121, + "grad_norm": 0.7298362819040065, + "learning_rate": 0.003, + "loss": 4.4356, + "step": 1121 + }, + { + "epoch": 0.01122, + "grad_norm": 0.6383918217981295, + "learning_rate": 0.003, + "loss": 4.4021, + "step": 1122 + }, + { + "epoch": 0.01123, + "grad_norm": 0.7157274101955465, + "learning_rate": 0.003, + "loss": 4.3942, + "step": 1123 + }, + { + "epoch": 0.01124, + "grad_norm": 0.6763302773515354, + "learning_rate": 0.003, + "loss": 4.4444, + "step": 1124 + }, + { + "epoch": 0.01125, + "grad_norm": 0.5890253028490829, + "learning_rate": 0.003, + "loss": 4.4062, + "step": 1125 + }, + { + "epoch": 0.01126, + "grad_norm": 0.6745452077137704, + "learning_rate": 0.003, + "loss": 4.4204, + "step": 1126 + }, + { + "epoch": 0.01127, + "grad_norm": 0.6212265837956056, + "learning_rate": 0.003, + "loss": 4.446, + "step": 1127 + }, + { + "epoch": 0.01128, + "grad_norm": 0.6579375902442356, + "learning_rate": 0.003, + "loss": 4.4285, + "step": 1128 + }, + { + "epoch": 0.01129, + "grad_norm": 0.7038241088934594, + "learning_rate": 0.003, + "loss": 4.4197, + "step": 1129 + }, + { + "epoch": 0.0113, + "grad_norm": 0.8537912734504166, + "learning_rate": 0.003, + "loss": 4.4099, + "step": 1130 + }, + { + "epoch": 0.01131, + "grad_norm": 0.6676066708341881, + "learning_rate": 0.003, + "loss": 4.4184, + "step": 1131 + }, + { + "epoch": 0.01132, + "grad_norm": 0.6262944206433371, + "learning_rate": 0.003, + "loss": 4.4418, + "step": 1132 + }, + { + "epoch": 0.01133, + "grad_norm": 0.7096964978520276, + "learning_rate": 0.003, + "loss": 4.4077, + "step": 1133 + }, + { + "epoch": 0.01134, + "grad_norm": 0.7063046600590702, + "learning_rate": 0.003, + "loss": 4.4219, + "step": 1134 + }, + { + "epoch": 0.01135, + "grad_norm": 0.7389408975678705, + "learning_rate": 0.003, + "loss": 4.4511, + "step": 1135 + }, + { + "epoch": 0.01136, + "grad_norm": 0.902932943298548, + "learning_rate": 0.003, + "loss": 4.4334, + "step": 1136 + }, + { + "epoch": 0.01137, + "grad_norm": 0.8104098050855381, + "learning_rate": 0.003, + "loss": 4.4615, + "step": 1137 + }, + { + "epoch": 0.01138, + "grad_norm": 0.6304382373186352, + "learning_rate": 0.003, + "loss": 4.4061, + "step": 1138 + }, + { + "epoch": 0.01139, + "grad_norm": 0.6573985975094263, + "learning_rate": 0.003, + "loss": 4.4174, + "step": 1139 + }, + { + "epoch": 0.0114, + "grad_norm": 0.6243548176788066, + "learning_rate": 0.003, + "loss": 4.4015, + "step": 1140 + }, + { + "epoch": 0.01141, + "grad_norm": 0.6941761654842915, + "learning_rate": 0.003, + "loss": 4.4278, + "step": 1141 + }, + { + "epoch": 0.01142, + "grad_norm": 0.6422841400280502, + "learning_rate": 0.003, + "loss": 4.4042, + "step": 1142 + }, + { + "epoch": 0.01143, + "grad_norm": 0.6274623913648985, + "learning_rate": 0.003, + "loss": 4.4193, + "step": 1143 + }, + { + "epoch": 0.01144, + "grad_norm": 0.6449672978265372, + "learning_rate": 0.003, + "loss": 4.3861, + "step": 1144 + }, + { + "epoch": 0.01145, + "grad_norm": 0.8094555842673681, + "learning_rate": 0.003, + "loss": 4.4312, + "step": 1145 + }, + { + "epoch": 0.01146, + "grad_norm": 0.8440842425424007, + "learning_rate": 0.003, + "loss": 4.4247, + "step": 1146 + }, + { + "epoch": 0.01147, + "grad_norm": 0.8250289231813209, + "learning_rate": 0.003, + "loss": 4.4121, + "step": 1147 + }, + { + "epoch": 0.01148, + "grad_norm": 0.9842608879774175, + "learning_rate": 0.003, + "loss": 4.4248, + "step": 1148 + }, + { + "epoch": 0.01149, + "grad_norm": 1.140890319957312, + "learning_rate": 0.003, + "loss": 4.4515, + "step": 1149 + }, + { + "epoch": 0.0115, + "grad_norm": 1.0049560913231448, + "learning_rate": 0.003, + "loss": 4.454, + "step": 1150 + }, + { + "epoch": 0.01151, + "grad_norm": 0.9241096807179571, + "learning_rate": 0.003, + "loss": 4.4438, + "step": 1151 + }, + { + "epoch": 0.01152, + "grad_norm": 0.8197935874101315, + "learning_rate": 0.003, + "loss": 4.4496, + "step": 1152 + }, + { + "epoch": 0.01153, + "grad_norm": 0.8964418195355868, + "learning_rate": 0.003, + "loss": 4.4768, + "step": 1153 + }, + { + "epoch": 0.01154, + "grad_norm": 0.7052475574958695, + "learning_rate": 0.003, + "loss": 4.4381, + "step": 1154 + }, + { + "epoch": 0.01155, + "grad_norm": 0.6994161040159336, + "learning_rate": 0.003, + "loss": 4.425, + "step": 1155 + }, + { + "epoch": 0.01156, + "grad_norm": 0.5866991021029688, + "learning_rate": 0.003, + "loss": 4.4267, + "step": 1156 + }, + { + "epoch": 0.01157, + "grad_norm": 0.5890541796764595, + "learning_rate": 0.003, + "loss": 4.4503, + "step": 1157 + }, + { + "epoch": 0.01158, + "grad_norm": 0.5558417103751193, + "learning_rate": 0.003, + "loss": 4.4146, + "step": 1158 + }, + { + "epoch": 0.01159, + "grad_norm": 0.5494688252527891, + "learning_rate": 0.003, + "loss": 4.41, + "step": 1159 + }, + { + "epoch": 0.0116, + "grad_norm": 0.5721755744732883, + "learning_rate": 0.003, + "loss": 4.4228, + "step": 1160 + }, + { + "epoch": 0.01161, + "grad_norm": 0.6717218566295324, + "learning_rate": 0.003, + "loss": 4.4019, + "step": 1161 + }, + { + "epoch": 0.01162, + "grad_norm": 0.7268973886687413, + "learning_rate": 0.003, + "loss": 4.4138, + "step": 1162 + }, + { + "epoch": 0.01163, + "grad_norm": 0.5842283691456034, + "learning_rate": 0.003, + "loss": 4.4284, + "step": 1163 + }, + { + "epoch": 0.01164, + "grad_norm": 0.4855856513804374, + "learning_rate": 0.003, + "loss": 4.4134, + "step": 1164 + }, + { + "epoch": 0.01165, + "grad_norm": 0.4188604676147223, + "learning_rate": 0.003, + "loss": 4.4035, + "step": 1165 + }, + { + "epoch": 0.01166, + "grad_norm": 0.3540156009620338, + "learning_rate": 0.003, + "loss": 4.4019, + "step": 1166 + }, + { + "epoch": 0.01167, + "grad_norm": 0.34211903868248933, + "learning_rate": 0.003, + "loss": 4.3663, + "step": 1167 + }, + { + "epoch": 0.01168, + "grad_norm": 0.3891962127459521, + "learning_rate": 0.003, + "loss": 4.374, + "step": 1168 + }, + { + "epoch": 0.01169, + "grad_norm": 0.43062143986497, + "learning_rate": 0.003, + "loss": 4.4147, + "step": 1169 + }, + { + "epoch": 0.0117, + "grad_norm": 0.5082945055708723, + "learning_rate": 0.003, + "loss": 4.404, + "step": 1170 + }, + { + "epoch": 0.01171, + "grad_norm": 0.6470102757597544, + "learning_rate": 0.003, + "loss": 4.3885, + "step": 1171 + }, + { + "epoch": 0.01172, + "grad_norm": 0.7362820078056241, + "learning_rate": 0.003, + "loss": 4.3869, + "step": 1172 + }, + { + "epoch": 0.01173, + "grad_norm": 0.693269388291317, + "learning_rate": 0.003, + "loss": 4.4171, + "step": 1173 + }, + { + "epoch": 0.01174, + "grad_norm": 0.5646769911878592, + "learning_rate": 0.003, + "loss": 4.3891, + "step": 1174 + }, + { + "epoch": 0.01175, + "grad_norm": 0.5675605646992437, + "learning_rate": 0.003, + "loss": 4.3751, + "step": 1175 + }, + { + "epoch": 0.01176, + "grad_norm": 0.5874888204753396, + "learning_rate": 0.003, + "loss": 4.4094, + "step": 1176 + }, + { + "epoch": 0.01177, + "grad_norm": 0.4402531057426568, + "learning_rate": 0.003, + "loss": 4.3711, + "step": 1177 + }, + { + "epoch": 0.01178, + "grad_norm": 0.542910388266956, + "learning_rate": 0.003, + "loss": 4.3959, + "step": 1178 + }, + { + "epoch": 0.01179, + "grad_norm": 0.6077990883816204, + "learning_rate": 0.003, + "loss": 4.3751, + "step": 1179 + }, + { + "epoch": 0.0118, + "grad_norm": 0.5935696360755632, + "learning_rate": 0.003, + "loss": 4.4393, + "step": 1180 + }, + { + "epoch": 0.01181, + "grad_norm": 0.5077497618150401, + "learning_rate": 0.003, + "loss": 4.4227, + "step": 1181 + }, + { + "epoch": 0.01182, + "grad_norm": 0.4525800009003507, + "learning_rate": 0.003, + "loss": 4.3791, + "step": 1182 + }, + { + "epoch": 0.01183, + "grad_norm": 0.465730915302176, + "learning_rate": 0.003, + "loss": 4.3741, + "step": 1183 + }, + { + "epoch": 0.01184, + "grad_norm": 0.5691375544130988, + "learning_rate": 0.003, + "loss": 4.4172, + "step": 1184 + }, + { + "epoch": 0.01185, + "grad_norm": 0.6220094109188331, + "learning_rate": 0.003, + "loss": 4.3896, + "step": 1185 + }, + { + "epoch": 0.01186, + "grad_norm": 0.7180162404965751, + "learning_rate": 0.003, + "loss": 4.417, + "step": 1186 + }, + { + "epoch": 0.01187, + "grad_norm": 0.7918251317294935, + "learning_rate": 0.003, + "loss": 4.389, + "step": 1187 + }, + { + "epoch": 0.01188, + "grad_norm": 0.8521642061482538, + "learning_rate": 0.003, + "loss": 4.4081, + "step": 1188 + }, + { + "epoch": 0.01189, + "grad_norm": 0.884743262111852, + "learning_rate": 0.003, + "loss": 4.4173, + "step": 1189 + }, + { + "epoch": 0.0119, + "grad_norm": 0.809315268231435, + "learning_rate": 0.003, + "loss": 4.3961, + "step": 1190 + }, + { + "epoch": 0.01191, + "grad_norm": 0.8426026711506704, + "learning_rate": 0.003, + "loss": 4.3805, + "step": 1191 + }, + { + "epoch": 0.01192, + "grad_norm": 0.8039017391990436, + "learning_rate": 0.003, + "loss": 4.4134, + "step": 1192 + }, + { + "epoch": 0.01193, + "grad_norm": 1.0091767993853729, + "learning_rate": 0.003, + "loss": 4.4278, + "step": 1193 + }, + { + "epoch": 0.01194, + "grad_norm": 1.021446612913599, + "learning_rate": 0.003, + "loss": 4.4458, + "step": 1194 + }, + { + "epoch": 0.01195, + "grad_norm": 0.8197186317539255, + "learning_rate": 0.003, + "loss": 4.4136, + "step": 1195 + }, + { + "epoch": 0.01196, + "grad_norm": 1.0663075325190898, + "learning_rate": 0.003, + "loss": 4.4346, + "step": 1196 + }, + { + "epoch": 0.01197, + "grad_norm": 1.1163749964317249, + "learning_rate": 0.003, + "loss": 4.42, + "step": 1197 + }, + { + "epoch": 0.01198, + "grad_norm": 0.7911818503418244, + "learning_rate": 0.003, + "loss": 4.4756, + "step": 1198 + }, + { + "epoch": 0.01199, + "grad_norm": 0.7094954502714929, + "learning_rate": 0.003, + "loss": 4.4429, + "step": 1199 + }, + { + "epoch": 0.012, + "grad_norm": 0.6371349856097592, + "learning_rate": 0.003, + "loss": 4.4311, + "step": 1200 + }, + { + "epoch": 0.01201, + "grad_norm": 0.6670059038710348, + "learning_rate": 0.003, + "loss": 4.4586, + "step": 1201 + }, + { + "epoch": 0.01202, + "grad_norm": 0.7057344621036731, + "learning_rate": 0.003, + "loss": 4.4253, + "step": 1202 + }, + { + "epoch": 0.01203, + "grad_norm": 0.6813638781778574, + "learning_rate": 0.003, + "loss": 4.4314, + "step": 1203 + }, + { + "epoch": 0.01204, + "grad_norm": 0.6487927748495244, + "learning_rate": 0.003, + "loss": 4.4252, + "step": 1204 + }, + { + "epoch": 0.01205, + "grad_norm": 0.5702088035511471, + "learning_rate": 0.003, + "loss": 4.4134, + "step": 1205 + }, + { + "epoch": 0.01206, + "grad_norm": 0.5185412999807271, + "learning_rate": 0.003, + "loss": 4.4527, + "step": 1206 + }, + { + "epoch": 0.01207, + "grad_norm": 0.5529206635637681, + "learning_rate": 0.003, + "loss": 4.3748, + "step": 1207 + }, + { + "epoch": 0.01208, + "grad_norm": 0.6027594396835911, + "learning_rate": 0.003, + "loss": 4.4355, + "step": 1208 + }, + { + "epoch": 0.01209, + "grad_norm": 0.6294683615242583, + "learning_rate": 0.003, + "loss": 4.4139, + "step": 1209 + }, + { + "epoch": 0.0121, + "grad_norm": 0.5634425369332556, + "learning_rate": 0.003, + "loss": 4.4142, + "step": 1210 + }, + { + "epoch": 0.01211, + "grad_norm": 0.489983773731107, + "learning_rate": 0.003, + "loss": 4.3953, + "step": 1211 + }, + { + "epoch": 0.01212, + "grad_norm": 0.48590408635878257, + "learning_rate": 0.003, + "loss": 4.4163, + "step": 1212 + }, + { + "epoch": 0.01213, + "grad_norm": 0.4654031475329597, + "learning_rate": 0.003, + "loss": 4.3921, + "step": 1213 + }, + { + "epoch": 0.01214, + "grad_norm": 0.4519325610344299, + "learning_rate": 0.003, + "loss": 4.3755, + "step": 1214 + }, + { + "epoch": 0.01215, + "grad_norm": 0.4986453835908023, + "learning_rate": 0.003, + "loss": 4.3833, + "step": 1215 + }, + { + "epoch": 0.01216, + "grad_norm": 0.6836671205807361, + "learning_rate": 0.003, + "loss": 4.3816, + "step": 1216 + }, + { + "epoch": 0.01217, + "grad_norm": 0.8634560994181636, + "learning_rate": 0.003, + "loss": 4.4035, + "step": 1217 + }, + { + "epoch": 0.01218, + "grad_norm": 0.7430166358012542, + "learning_rate": 0.003, + "loss": 4.3918, + "step": 1218 + }, + { + "epoch": 0.01219, + "grad_norm": 0.5693399114743081, + "learning_rate": 0.003, + "loss": 4.3834, + "step": 1219 + }, + { + "epoch": 0.0122, + "grad_norm": 0.6203041570130704, + "learning_rate": 0.003, + "loss": 4.3803, + "step": 1220 + }, + { + "epoch": 0.01221, + "grad_norm": 0.6221216866599081, + "learning_rate": 0.003, + "loss": 4.3932, + "step": 1221 + }, + { + "epoch": 0.01222, + "grad_norm": 0.5678017035835031, + "learning_rate": 0.003, + "loss": 4.3744, + "step": 1222 + }, + { + "epoch": 0.01223, + "grad_norm": 0.4544260730538049, + "learning_rate": 0.003, + "loss": 4.3669, + "step": 1223 + }, + { + "epoch": 0.01224, + "grad_norm": 0.4498657047747612, + "learning_rate": 0.003, + "loss": 4.3528, + "step": 1224 + }, + { + "epoch": 0.01225, + "grad_norm": 0.44954702929076595, + "learning_rate": 0.003, + "loss": 4.3751, + "step": 1225 + }, + { + "epoch": 0.01226, + "grad_norm": 0.45922987677597427, + "learning_rate": 0.003, + "loss": 4.3678, + "step": 1226 + }, + { + "epoch": 0.01227, + "grad_norm": 0.444736072897216, + "learning_rate": 0.003, + "loss": 4.3955, + "step": 1227 + }, + { + "epoch": 0.01228, + "grad_norm": 0.39964234776344315, + "learning_rate": 0.003, + "loss": 4.3749, + "step": 1228 + }, + { + "epoch": 0.01229, + "grad_norm": 0.4304709469798317, + "learning_rate": 0.003, + "loss": 4.3636, + "step": 1229 + }, + { + "epoch": 0.0123, + "grad_norm": 0.45096371519587236, + "learning_rate": 0.003, + "loss": 4.3946, + "step": 1230 + }, + { + "epoch": 0.01231, + "grad_norm": 0.45260016324469293, + "learning_rate": 0.003, + "loss": 4.3662, + "step": 1231 + }, + { + "epoch": 0.01232, + "grad_norm": 0.5056071407476121, + "learning_rate": 0.003, + "loss": 4.3416, + "step": 1232 + }, + { + "epoch": 0.01233, + "grad_norm": 0.6137961234604853, + "learning_rate": 0.003, + "loss": 4.3562, + "step": 1233 + }, + { + "epoch": 0.01234, + "grad_norm": 0.7580924108092045, + "learning_rate": 0.003, + "loss": 4.3771, + "step": 1234 + }, + { + "epoch": 0.01235, + "grad_norm": 0.7682739117958468, + "learning_rate": 0.003, + "loss": 4.377, + "step": 1235 + }, + { + "epoch": 0.01236, + "grad_norm": 0.7726095308103765, + "learning_rate": 0.003, + "loss": 4.379, + "step": 1236 + }, + { + "epoch": 0.01237, + "grad_norm": 0.6515520396776402, + "learning_rate": 0.003, + "loss": 4.3868, + "step": 1237 + }, + { + "epoch": 0.01238, + "grad_norm": 0.5911401797128396, + "learning_rate": 0.003, + "loss": 4.3811, + "step": 1238 + }, + { + "epoch": 0.01239, + "grad_norm": 0.6868743136698521, + "learning_rate": 0.003, + "loss": 4.3689, + "step": 1239 + }, + { + "epoch": 0.0124, + "grad_norm": 0.7235342138686123, + "learning_rate": 0.003, + "loss": 4.3578, + "step": 1240 + }, + { + "epoch": 0.01241, + "grad_norm": 0.7086323715527123, + "learning_rate": 0.003, + "loss": 4.3714, + "step": 1241 + }, + { + "epoch": 0.01242, + "grad_norm": 0.7273279516261576, + "learning_rate": 0.003, + "loss": 4.3772, + "step": 1242 + }, + { + "epoch": 0.01243, + "grad_norm": 0.7241329186524991, + "learning_rate": 0.003, + "loss": 4.358, + "step": 1243 + }, + { + "epoch": 0.01244, + "grad_norm": 0.659589685877625, + "learning_rate": 0.003, + "loss": 4.3697, + "step": 1244 + }, + { + "epoch": 0.01245, + "grad_norm": 0.5679488489092839, + "learning_rate": 0.003, + "loss": 4.3588, + "step": 1245 + }, + { + "epoch": 0.01246, + "grad_norm": 0.5843246373122641, + "learning_rate": 0.003, + "loss": 4.3356, + "step": 1246 + }, + { + "epoch": 0.01247, + "grad_norm": 0.5386207185407424, + "learning_rate": 0.003, + "loss": 4.3884, + "step": 1247 + }, + { + "epoch": 0.01248, + "grad_norm": 0.4947062329390487, + "learning_rate": 0.003, + "loss": 4.3718, + "step": 1248 + }, + { + "epoch": 0.01249, + "grad_norm": 0.47539230004779465, + "learning_rate": 0.003, + "loss": 4.3597, + "step": 1249 + }, + { + "epoch": 0.0125, + "grad_norm": 0.4876155686681362, + "learning_rate": 0.003, + "loss": 4.3353, + "step": 1250 + }, + { + "epoch": 0.01251, + "grad_norm": 0.587984128950104, + "learning_rate": 0.003, + "loss": 4.3725, + "step": 1251 + }, + { + "epoch": 0.01252, + "grad_norm": 0.74609447185642, + "learning_rate": 0.003, + "loss": 4.3585, + "step": 1252 + }, + { + "epoch": 0.01253, + "grad_norm": 0.8297104750282515, + "learning_rate": 0.003, + "loss": 4.3426, + "step": 1253 + }, + { + "epoch": 0.01254, + "grad_norm": 0.6714602053035584, + "learning_rate": 0.003, + "loss": 4.3879, + "step": 1254 + }, + { + "epoch": 0.01255, + "grad_norm": 0.5575284697102026, + "learning_rate": 0.003, + "loss": 4.3328, + "step": 1255 + }, + { + "epoch": 0.01256, + "grad_norm": 0.6531801875316168, + "learning_rate": 0.003, + "loss": 4.3617, + "step": 1256 + }, + { + "epoch": 0.01257, + "grad_norm": 0.6725833064810157, + "learning_rate": 0.003, + "loss": 4.3523, + "step": 1257 + }, + { + "epoch": 0.01258, + "grad_norm": 0.5415379584452896, + "learning_rate": 0.003, + "loss": 4.3533, + "step": 1258 + }, + { + "epoch": 0.01259, + "grad_norm": 0.6189096611706483, + "learning_rate": 0.003, + "loss": 4.3565, + "step": 1259 + }, + { + "epoch": 0.0126, + "grad_norm": 0.7719709473721972, + "learning_rate": 0.003, + "loss": 4.3943, + "step": 1260 + }, + { + "epoch": 0.01261, + "grad_norm": 0.7141309260632757, + "learning_rate": 0.003, + "loss": 4.3828, + "step": 1261 + }, + { + "epoch": 0.01262, + "grad_norm": 0.82201580215985, + "learning_rate": 0.003, + "loss": 4.3803, + "step": 1262 + }, + { + "epoch": 0.01263, + "grad_norm": 0.7648394097891381, + "learning_rate": 0.003, + "loss": 4.3764, + "step": 1263 + }, + { + "epoch": 0.01264, + "grad_norm": 0.6534936022081808, + "learning_rate": 0.003, + "loss": 4.3615, + "step": 1264 + }, + { + "epoch": 0.01265, + "grad_norm": 0.8083064566897021, + "learning_rate": 0.003, + "loss": 4.3795, + "step": 1265 + }, + { + "epoch": 0.01266, + "grad_norm": 0.7625627444580204, + "learning_rate": 0.003, + "loss": 4.3817, + "step": 1266 + }, + { + "epoch": 0.01267, + "grad_norm": 0.7144669763786441, + "learning_rate": 0.003, + "loss": 4.3961, + "step": 1267 + }, + { + "epoch": 0.01268, + "grad_norm": 0.6176763363932604, + "learning_rate": 0.003, + "loss": 4.3815, + "step": 1268 + }, + { + "epoch": 0.01269, + "grad_norm": 0.5844219741679575, + "learning_rate": 0.003, + "loss": 4.3442, + "step": 1269 + }, + { + "epoch": 0.0127, + "grad_norm": 0.5570920646541462, + "learning_rate": 0.003, + "loss": 4.3764, + "step": 1270 + }, + { + "epoch": 0.01271, + "grad_norm": 0.665679672510672, + "learning_rate": 0.003, + "loss": 4.3628, + "step": 1271 + }, + { + "epoch": 0.01272, + "grad_norm": 0.7700987517359916, + "learning_rate": 0.003, + "loss": 4.3638, + "step": 1272 + }, + { + "epoch": 0.01273, + "grad_norm": 0.7473118276250652, + "learning_rate": 0.003, + "loss": 4.3932, + "step": 1273 + }, + { + "epoch": 0.01274, + "grad_norm": 0.6491851354934152, + "learning_rate": 0.003, + "loss": 4.3789, + "step": 1274 + }, + { + "epoch": 0.01275, + "grad_norm": 0.6744635149292406, + "learning_rate": 0.003, + "loss": 4.3494, + "step": 1275 + }, + { + "epoch": 0.01276, + "grad_norm": 0.652401206080157, + "learning_rate": 0.003, + "loss": 4.3572, + "step": 1276 + }, + { + "epoch": 0.01277, + "grad_norm": 0.8226392169345714, + "learning_rate": 0.003, + "loss": 4.3706, + "step": 1277 + }, + { + "epoch": 0.01278, + "grad_norm": 0.8325255271008436, + "learning_rate": 0.003, + "loss": 4.3906, + "step": 1278 + }, + { + "epoch": 0.01279, + "grad_norm": 0.7029576010205191, + "learning_rate": 0.003, + "loss": 4.3612, + "step": 1279 + }, + { + "epoch": 0.0128, + "grad_norm": 0.6056600803694466, + "learning_rate": 0.003, + "loss": 4.3548, + "step": 1280 + }, + { + "epoch": 0.01281, + "grad_norm": 0.6068824045406035, + "learning_rate": 0.003, + "loss": 4.3433, + "step": 1281 + }, + { + "epoch": 0.01282, + "grad_norm": 0.5449477323585487, + "learning_rate": 0.003, + "loss": 4.3801, + "step": 1282 + }, + { + "epoch": 0.01283, + "grad_norm": 0.5086376593278678, + "learning_rate": 0.003, + "loss": 4.3465, + "step": 1283 + }, + { + "epoch": 0.01284, + "grad_norm": 0.49507526795006285, + "learning_rate": 0.003, + "loss": 4.3484, + "step": 1284 + }, + { + "epoch": 0.01285, + "grad_norm": 0.4794063886412805, + "learning_rate": 0.003, + "loss": 4.3499, + "step": 1285 + }, + { + "epoch": 0.01286, + "grad_norm": 0.4314216967223394, + "learning_rate": 0.003, + "loss": 4.37, + "step": 1286 + }, + { + "epoch": 0.01287, + "grad_norm": 0.43751633400555473, + "learning_rate": 0.003, + "loss": 4.3355, + "step": 1287 + }, + { + "epoch": 0.01288, + "grad_norm": 0.4402964980769675, + "learning_rate": 0.003, + "loss": 4.3477, + "step": 1288 + }, + { + "epoch": 0.01289, + "grad_norm": 0.4330370123851316, + "learning_rate": 0.003, + "loss": 4.34, + "step": 1289 + }, + { + "epoch": 0.0129, + "grad_norm": 0.4577360505134826, + "learning_rate": 0.003, + "loss": 4.3613, + "step": 1290 + }, + { + "epoch": 0.01291, + "grad_norm": 0.4908761222350223, + "learning_rate": 0.003, + "loss": 4.3434, + "step": 1291 + }, + { + "epoch": 0.01292, + "grad_norm": 0.5197962656056466, + "learning_rate": 0.003, + "loss": 4.3769, + "step": 1292 + }, + { + "epoch": 0.01293, + "grad_norm": 0.5249407732060195, + "learning_rate": 0.003, + "loss": 4.3636, + "step": 1293 + }, + { + "epoch": 0.01294, + "grad_norm": 0.611698986416238, + "learning_rate": 0.003, + "loss": 4.3337, + "step": 1294 + }, + { + "epoch": 0.01295, + "grad_norm": 0.7156402524759599, + "learning_rate": 0.003, + "loss": 4.369, + "step": 1295 + }, + { + "epoch": 0.01296, + "grad_norm": 0.8400871327514794, + "learning_rate": 0.003, + "loss": 4.3429, + "step": 1296 + }, + { + "epoch": 0.01297, + "grad_norm": 0.9437097734689512, + "learning_rate": 0.003, + "loss": 4.3707, + "step": 1297 + }, + { + "epoch": 0.01298, + "grad_norm": 0.9584374674542437, + "learning_rate": 0.003, + "loss": 4.3505, + "step": 1298 + }, + { + "epoch": 0.01299, + "grad_norm": 0.8671660761915297, + "learning_rate": 0.003, + "loss": 4.3625, + "step": 1299 + }, + { + "epoch": 0.013, + "grad_norm": 0.7502528073695865, + "learning_rate": 0.003, + "loss": 4.3495, + "step": 1300 + }, + { + "epoch": 0.01301, + "grad_norm": 0.7063515582852367, + "learning_rate": 0.003, + "loss": 4.3619, + "step": 1301 + }, + { + "epoch": 0.01302, + "grad_norm": 0.6149436229664508, + "learning_rate": 0.003, + "loss": 4.4034, + "step": 1302 + }, + { + "epoch": 0.01303, + "grad_norm": 0.6398205689943912, + "learning_rate": 0.003, + "loss": 4.3511, + "step": 1303 + }, + { + "epoch": 0.01304, + "grad_norm": 0.5125654671319883, + "learning_rate": 0.003, + "loss": 4.3743, + "step": 1304 + }, + { + "epoch": 0.01305, + "grad_norm": 0.49151327704885306, + "learning_rate": 0.003, + "loss": 4.357, + "step": 1305 + }, + { + "epoch": 0.01306, + "grad_norm": 0.449035259984806, + "learning_rate": 0.003, + "loss": 4.3496, + "step": 1306 + }, + { + "epoch": 0.01307, + "grad_norm": 0.3786564732477656, + "learning_rate": 0.003, + "loss": 4.365, + "step": 1307 + }, + { + "epoch": 0.01308, + "grad_norm": 0.40747339154306583, + "learning_rate": 0.003, + "loss": 4.3407, + "step": 1308 + }, + { + "epoch": 0.01309, + "grad_norm": 0.3477777170477395, + "learning_rate": 0.003, + "loss": 4.3771, + "step": 1309 + }, + { + "epoch": 0.0131, + "grad_norm": 0.33985557007437, + "learning_rate": 0.003, + "loss": 4.3221, + "step": 1310 + }, + { + "epoch": 0.01311, + "grad_norm": 0.3781744690142208, + "learning_rate": 0.003, + "loss": 4.3621, + "step": 1311 + }, + { + "epoch": 0.01312, + "grad_norm": 0.390430095045798, + "learning_rate": 0.003, + "loss": 4.3295, + "step": 1312 + }, + { + "epoch": 0.01313, + "grad_norm": 0.4411339748954226, + "learning_rate": 0.003, + "loss": 4.3219, + "step": 1313 + }, + { + "epoch": 0.01314, + "grad_norm": 0.4398621851217481, + "learning_rate": 0.003, + "loss": 4.3182, + "step": 1314 + }, + { + "epoch": 0.01315, + "grad_norm": 0.4543520117536255, + "learning_rate": 0.003, + "loss": 4.3454, + "step": 1315 + }, + { + "epoch": 0.01316, + "grad_norm": 0.4647119001484631, + "learning_rate": 0.003, + "loss": 4.3315, + "step": 1316 + }, + { + "epoch": 0.01317, + "grad_norm": 0.6634726948283797, + "learning_rate": 0.003, + "loss": 4.3607, + "step": 1317 + }, + { + "epoch": 0.01318, + "grad_norm": 1.0496539085022096, + "learning_rate": 0.003, + "loss": 4.3666, + "step": 1318 + }, + { + "epoch": 0.01319, + "grad_norm": 1.3165114729988252, + "learning_rate": 0.003, + "loss": 4.392, + "step": 1319 + }, + { + "epoch": 0.0132, + "grad_norm": 0.7650097283623507, + "learning_rate": 0.003, + "loss": 4.3723, + "step": 1320 + }, + { + "epoch": 0.01321, + "grad_norm": 0.7692901736546863, + "learning_rate": 0.003, + "loss": 4.3563, + "step": 1321 + }, + { + "epoch": 0.01322, + "grad_norm": 0.8430579356733313, + "learning_rate": 0.003, + "loss": 4.3811, + "step": 1322 + }, + { + "epoch": 0.01323, + "grad_norm": 0.7787983772827455, + "learning_rate": 0.003, + "loss": 4.3749, + "step": 1323 + }, + { + "epoch": 0.01324, + "grad_norm": 0.7585640630062677, + "learning_rate": 0.003, + "loss": 4.3595, + "step": 1324 + }, + { + "epoch": 0.01325, + "grad_norm": 0.662871102926864, + "learning_rate": 0.003, + "loss": 4.3604, + "step": 1325 + }, + { + "epoch": 0.01326, + "grad_norm": 0.6552542644296041, + "learning_rate": 0.003, + "loss": 4.3471, + "step": 1326 + }, + { + "epoch": 0.01327, + "grad_norm": 0.6597442221568894, + "learning_rate": 0.003, + "loss": 4.3636, + "step": 1327 + }, + { + "epoch": 0.01328, + "grad_norm": 0.6991774516177431, + "learning_rate": 0.003, + "loss": 4.3572, + "step": 1328 + }, + { + "epoch": 0.01329, + "grad_norm": 0.7004568547673504, + "learning_rate": 0.003, + "loss": 4.323, + "step": 1329 + }, + { + "epoch": 0.0133, + "grad_norm": 0.6451796491929548, + "learning_rate": 0.003, + "loss": 4.3619, + "step": 1330 + }, + { + "epoch": 0.01331, + "grad_norm": 0.5603586741834496, + "learning_rate": 0.003, + "loss": 4.3461, + "step": 1331 + }, + { + "epoch": 0.01332, + "grad_norm": 0.6118296137650349, + "learning_rate": 0.003, + "loss": 4.3674, + "step": 1332 + }, + { + "epoch": 0.01333, + "grad_norm": 0.6556746462798643, + "learning_rate": 0.003, + "loss": 4.3385, + "step": 1333 + }, + { + "epoch": 0.01334, + "grad_norm": 0.6569171248898434, + "learning_rate": 0.003, + "loss": 4.3292, + "step": 1334 + }, + { + "epoch": 0.01335, + "grad_norm": 0.7308490257197903, + "learning_rate": 0.003, + "loss": 4.355, + "step": 1335 + }, + { + "epoch": 0.01336, + "grad_norm": 0.7694263760249205, + "learning_rate": 0.003, + "loss": 4.3562, + "step": 1336 + }, + { + "epoch": 0.01337, + "grad_norm": 0.829484027358136, + "learning_rate": 0.003, + "loss": 4.3739, + "step": 1337 + }, + { + "epoch": 0.01338, + "grad_norm": 0.7394720058707402, + "learning_rate": 0.003, + "loss": 4.3375, + "step": 1338 + }, + { + "epoch": 0.01339, + "grad_norm": 0.7663877331658486, + "learning_rate": 0.003, + "loss": 4.3614, + "step": 1339 + }, + { + "epoch": 0.0134, + "grad_norm": 0.6814802672525545, + "learning_rate": 0.003, + "loss": 4.3431, + "step": 1340 + }, + { + "epoch": 0.01341, + "grad_norm": 0.5804799698607723, + "learning_rate": 0.003, + "loss": 4.3566, + "step": 1341 + }, + { + "epoch": 0.01342, + "grad_norm": 0.5489998904660267, + "learning_rate": 0.003, + "loss": 4.3242, + "step": 1342 + }, + { + "epoch": 0.01343, + "grad_norm": 0.5528070757921824, + "learning_rate": 0.003, + "loss": 4.3308, + "step": 1343 + }, + { + "epoch": 0.01344, + "grad_norm": 0.5249427916401641, + "learning_rate": 0.003, + "loss": 4.3286, + "step": 1344 + }, + { + "epoch": 0.01345, + "grad_norm": 0.5796492875823627, + "learning_rate": 0.003, + "loss": 4.3698, + "step": 1345 + }, + { + "epoch": 0.01346, + "grad_norm": 0.7791779206443923, + "learning_rate": 0.003, + "loss": 4.349, + "step": 1346 + }, + { + "epoch": 0.01347, + "grad_norm": 0.9787862373774305, + "learning_rate": 0.003, + "loss": 4.3601, + "step": 1347 + }, + { + "epoch": 0.01348, + "grad_norm": 1.124691751963475, + "learning_rate": 0.003, + "loss": 4.3695, + "step": 1348 + }, + { + "epoch": 0.01349, + "grad_norm": 0.7239534112442599, + "learning_rate": 0.003, + "loss": 4.3703, + "step": 1349 + }, + { + "epoch": 0.0135, + "grad_norm": 0.7058824966707771, + "learning_rate": 0.003, + "loss": 4.3665, + "step": 1350 + }, + { + "epoch": 0.01351, + "grad_norm": 0.7269767228946674, + "learning_rate": 0.003, + "loss": 4.3913, + "step": 1351 + }, + { + "epoch": 0.01352, + "grad_norm": 0.629762174967035, + "learning_rate": 0.003, + "loss": 4.3652, + "step": 1352 + }, + { + "epoch": 0.01353, + "grad_norm": 0.46709847900974316, + "learning_rate": 0.003, + "loss": 4.3426, + "step": 1353 + }, + { + "epoch": 0.01354, + "grad_norm": 0.4348570974243878, + "learning_rate": 0.003, + "loss": 4.33, + "step": 1354 + }, + { + "epoch": 0.01355, + "grad_norm": 0.4058626528377844, + "learning_rate": 0.003, + "loss": 4.3397, + "step": 1355 + }, + { + "epoch": 0.01356, + "grad_norm": 0.4077904728688183, + "learning_rate": 0.003, + "loss": 4.3329, + "step": 1356 + }, + { + "epoch": 0.01357, + "grad_norm": 0.3881492908104237, + "learning_rate": 0.003, + "loss": 4.3196, + "step": 1357 + }, + { + "epoch": 0.01358, + "grad_norm": 0.37883341535998916, + "learning_rate": 0.003, + "loss": 4.3135, + "step": 1358 + }, + { + "epoch": 0.01359, + "grad_norm": 0.3673079033329598, + "learning_rate": 0.003, + "loss": 4.3515, + "step": 1359 + }, + { + "epoch": 0.0136, + "grad_norm": 0.37976733712994504, + "learning_rate": 0.003, + "loss": 4.3177, + "step": 1360 + }, + { + "epoch": 0.01361, + "grad_norm": 0.4118348378412375, + "learning_rate": 0.003, + "loss": 4.3313, + "step": 1361 + }, + { + "epoch": 0.01362, + "grad_norm": 0.4407593683397167, + "learning_rate": 0.003, + "loss": 4.345, + "step": 1362 + }, + { + "epoch": 0.01363, + "grad_norm": 0.5058671106167254, + "learning_rate": 0.003, + "loss": 4.3464, + "step": 1363 + }, + { + "epoch": 0.01364, + "grad_norm": 0.6807238620694973, + "learning_rate": 0.003, + "loss": 4.3446, + "step": 1364 + }, + { + "epoch": 0.01365, + "grad_norm": 0.8032111226209311, + "learning_rate": 0.003, + "loss": 4.3702, + "step": 1365 + }, + { + "epoch": 0.01366, + "grad_norm": 0.7976336994683142, + "learning_rate": 0.003, + "loss": 4.3233, + "step": 1366 + }, + { + "epoch": 0.01367, + "grad_norm": 0.7881501972797743, + "learning_rate": 0.003, + "loss": 4.3766, + "step": 1367 + }, + { + "epoch": 0.01368, + "grad_norm": 0.6437176795201012, + "learning_rate": 0.003, + "loss": 4.346, + "step": 1368 + }, + { + "epoch": 0.01369, + "grad_norm": 0.7544973412872852, + "learning_rate": 0.003, + "loss": 4.3393, + "step": 1369 + }, + { + "epoch": 0.0137, + "grad_norm": 0.9164166166999201, + "learning_rate": 0.003, + "loss": 4.3794, + "step": 1370 + }, + { + "epoch": 0.01371, + "grad_norm": 0.9303263432444765, + "learning_rate": 0.003, + "loss": 4.3692, + "step": 1371 + }, + { + "epoch": 0.01372, + "grad_norm": 0.7939151912920627, + "learning_rate": 0.003, + "loss": 4.3176, + "step": 1372 + }, + { + "epoch": 0.01373, + "grad_norm": 0.8088750323088586, + "learning_rate": 0.003, + "loss": 4.3626, + "step": 1373 + }, + { + "epoch": 0.01374, + "grad_norm": 0.6721224672262793, + "learning_rate": 0.003, + "loss": 4.3585, + "step": 1374 + }, + { + "epoch": 0.01375, + "grad_norm": 0.6771097427365759, + "learning_rate": 0.003, + "loss": 4.3249, + "step": 1375 + }, + { + "epoch": 0.01376, + "grad_norm": 0.6762752470113543, + "learning_rate": 0.003, + "loss": 4.347, + "step": 1376 + }, + { + "epoch": 0.01377, + "grad_norm": 0.6604690846076395, + "learning_rate": 0.003, + "loss": 4.3466, + "step": 1377 + }, + { + "epoch": 0.01378, + "grad_norm": 0.8027631418948692, + "learning_rate": 0.003, + "loss": 4.3812, + "step": 1378 + }, + { + "epoch": 0.01379, + "grad_norm": 0.8058991494927565, + "learning_rate": 0.003, + "loss": 4.3543, + "step": 1379 + }, + { + "epoch": 0.0138, + "grad_norm": 0.6522516144796072, + "learning_rate": 0.003, + "loss": 4.3459, + "step": 1380 + }, + { + "epoch": 0.01381, + "grad_norm": 0.6569912385058241, + "learning_rate": 0.003, + "loss": 4.3524, + "step": 1381 + }, + { + "epoch": 0.01382, + "grad_norm": 0.5996275495095534, + "learning_rate": 0.003, + "loss": 4.3471, + "step": 1382 + }, + { + "epoch": 0.01383, + "grad_norm": 0.7099037071072233, + "learning_rate": 0.003, + "loss": 4.3442, + "step": 1383 + }, + { + "epoch": 0.01384, + "grad_norm": 0.9143332820449893, + "learning_rate": 0.003, + "loss": 4.3854, + "step": 1384 + }, + { + "epoch": 0.01385, + "grad_norm": 1.0346171765908472, + "learning_rate": 0.003, + "loss": 4.3815, + "step": 1385 + }, + { + "epoch": 0.01386, + "grad_norm": 0.8985740978326894, + "learning_rate": 0.003, + "loss": 4.3558, + "step": 1386 + }, + { + "epoch": 0.01387, + "grad_norm": 0.6958974730086266, + "learning_rate": 0.003, + "loss": 4.335, + "step": 1387 + }, + { + "epoch": 0.01388, + "grad_norm": 0.7497022428211844, + "learning_rate": 0.003, + "loss": 4.3735, + "step": 1388 + }, + { + "epoch": 0.01389, + "grad_norm": 0.7334888360507559, + "learning_rate": 0.003, + "loss": 4.3541, + "step": 1389 + }, + { + "epoch": 0.0139, + "grad_norm": 0.6634716599193375, + "learning_rate": 0.003, + "loss": 4.3533, + "step": 1390 + }, + { + "epoch": 0.01391, + "grad_norm": 0.5884420459817105, + "learning_rate": 0.003, + "loss": 4.3483, + "step": 1391 + }, + { + "epoch": 0.01392, + "grad_norm": 0.536202889592441, + "learning_rate": 0.003, + "loss": 4.3599, + "step": 1392 + }, + { + "epoch": 0.01393, + "grad_norm": 0.5041805750347179, + "learning_rate": 0.003, + "loss": 4.327, + "step": 1393 + }, + { + "epoch": 0.01394, + "grad_norm": 0.46187226586984687, + "learning_rate": 0.003, + "loss": 4.3118, + "step": 1394 + }, + { + "epoch": 0.01395, + "grad_norm": 0.4642311356997612, + "learning_rate": 0.003, + "loss": 4.343, + "step": 1395 + }, + { + "epoch": 0.01396, + "grad_norm": 0.49219234547480073, + "learning_rate": 0.003, + "loss": 4.3333, + "step": 1396 + }, + { + "epoch": 0.01397, + "grad_norm": 0.535253115362672, + "learning_rate": 0.003, + "loss": 4.3471, + "step": 1397 + }, + { + "epoch": 0.01398, + "grad_norm": 0.6052714387106423, + "learning_rate": 0.003, + "loss": 4.3267, + "step": 1398 + }, + { + "epoch": 0.01399, + "grad_norm": 0.6277477924110956, + "learning_rate": 0.003, + "loss": 4.3504, + "step": 1399 + }, + { + "epoch": 0.014, + "grad_norm": 0.6846592906181149, + "learning_rate": 0.003, + "loss": 4.3103, + "step": 1400 + }, + { + "epoch": 0.01401, + "grad_norm": 0.8200984983857136, + "learning_rate": 0.003, + "loss": 4.3387, + "step": 1401 + }, + { + "epoch": 0.01402, + "grad_norm": 0.8742108124137586, + "learning_rate": 0.003, + "loss": 4.3557, + "step": 1402 + }, + { + "epoch": 0.01403, + "grad_norm": 0.7096844776193044, + "learning_rate": 0.003, + "loss": 4.3233, + "step": 1403 + }, + { + "epoch": 0.01404, + "grad_norm": 0.5388679123577896, + "learning_rate": 0.003, + "loss": 4.3401, + "step": 1404 + }, + { + "epoch": 0.01405, + "grad_norm": 0.4910540359683292, + "learning_rate": 0.003, + "loss": 4.3559, + "step": 1405 + }, + { + "epoch": 0.01406, + "grad_norm": 0.5213447941957257, + "learning_rate": 0.003, + "loss": 4.3366, + "step": 1406 + }, + { + "epoch": 0.01407, + "grad_norm": 0.555926299222398, + "learning_rate": 0.003, + "loss": 4.3521, + "step": 1407 + }, + { + "epoch": 0.01408, + "grad_norm": 0.692327807092523, + "learning_rate": 0.003, + "loss": 4.3137, + "step": 1408 + }, + { + "epoch": 0.01409, + "grad_norm": 0.8697281888439876, + "learning_rate": 0.003, + "loss": 4.3588, + "step": 1409 + }, + { + "epoch": 0.0141, + "grad_norm": 0.9566955718451174, + "learning_rate": 0.003, + "loss": 4.3446, + "step": 1410 + }, + { + "epoch": 0.01411, + "grad_norm": 0.7215244847418736, + "learning_rate": 0.003, + "loss": 4.3131, + "step": 1411 + }, + { + "epoch": 0.01412, + "grad_norm": 0.6315141391735041, + "learning_rate": 0.003, + "loss": 4.3535, + "step": 1412 + }, + { + "epoch": 0.01413, + "grad_norm": 0.6841381857718712, + "learning_rate": 0.003, + "loss": 4.342, + "step": 1413 + }, + { + "epoch": 0.01414, + "grad_norm": 0.5798611042820683, + "learning_rate": 0.003, + "loss": 4.3235, + "step": 1414 + }, + { + "epoch": 0.01415, + "grad_norm": 0.547398429036414, + "learning_rate": 0.003, + "loss": 4.3157, + "step": 1415 + }, + { + "epoch": 0.01416, + "grad_norm": 0.532688568072748, + "learning_rate": 0.003, + "loss": 4.3388, + "step": 1416 + }, + { + "epoch": 0.01417, + "grad_norm": 0.5474366342351658, + "learning_rate": 0.003, + "loss": 4.3317, + "step": 1417 + }, + { + "epoch": 0.01418, + "grad_norm": 0.5499547169322332, + "learning_rate": 0.003, + "loss": 4.3477, + "step": 1418 + }, + { + "epoch": 0.01419, + "grad_norm": 0.58527195779541, + "learning_rate": 0.003, + "loss": 4.3403, + "step": 1419 + }, + { + "epoch": 0.0142, + "grad_norm": 0.4513568933907034, + "learning_rate": 0.003, + "loss": 4.3401, + "step": 1420 + }, + { + "epoch": 0.01421, + "grad_norm": 0.433334289258551, + "learning_rate": 0.003, + "loss": 4.3072, + "step": 1421 + }, + { + "epoch": 0.01422, + "grad_norm": 0.39636752625840327, + "learning_rate": 0.003, + "loss": 4.3095, + "step": 1422 + }, + { + "epoch": 0.01423, + "grad_norm": 0.38308187149456013, + "learning_rate": 0.003, + "loss": 4.3141, + "step": 1423 + }, + { + "epoch": 0.01424, + "grad_norm": 0.3793175762281477, + "learning_rate": 0.003, + "loss": 4.3274, + "step": 1424 + }, + { + "epoch": 0.01425, + "grad_norm": 0.4426704955835019, + "learning_rate": 0.003, + "loss": 4.334, + "step": 1425 + }, + { + "epoch": 0.01426, + "grad_norm": 0.5006373442027768, + "learning_rate": 0.003, + "loss": 4.3106, + "step": 1426 + }, + { + "epoch": 0.01427, + "grad_norm": 0.654459732986379, + "learning_rate": 0.003, + "loss": 4.3121, + "step": 1427 + }, + { + "epoch": 0.01428, + "grad_norm": 0.8067469794918197, + "learning_rate": 0.003, + "loss": 4.3172, + "step": 1428 + }, + { + "epoch": 0.01429, + "grad_norm": 0.8729241550485854, + "learning_rate": 0.003, + "loss": 4.3306, + "step": 1429 + }, + { + "epoch": 0.0143, + "grad_norm": 0.7278485959649278, + "learning_rate": 0.003, + "loss": 4.3432, + "step": 1430 + }, + { + "epoch": 0.01431, + "grad_norm": 0.7004237859708793, + "learning_rate": 0.003, + "loss": 4.3301, + "step": 1431 + }, + { + "epoch": 0.01432, + "grad_norm": 0.8171598559675952, + "learning_rate": 0.003, + "loss": 4.3267, + "step": 1432 + }, + { + "epoch": 0.01433, + "grad_norm": 0.7196947939958487, + "learning_rate": 0.003, + "loss": 4.323, + "step": 1433 + }, + { + "epoch": 0.01434, + "grad_norm": 0.6846858190137799, + "learning_rate": 0.003, + "loss": 4.3048, + "step": 1434 + }, + { + "epoch": 0.01435, + "grad_norm": 0.8455490886566032, + "learning_rate": 0.003, + "loss": 4.3242, + "step": 1435 + }, + { + "epoch": 0.01436, + "grad_norm": 0.83940902394985, + "learning_rate": 0.003, + "loss": 4.3453, + "step": 1436 + }, + { + "epoch": 0.01437, + "grad_norm": 0.7328785180307785, + "learning_rate": 0.003, + "loss": 4.3547, + "step": 1437 + }, + { + "epoch": 0.01438, + "grad_norm": 0.7369569320140708, + "learning_rate": 0.003, + "loss": 4.349, + "step": 1438 + }, + { + "epoch": 0.01439, + "grad_norm": 0.7139149757788127, + "learning_rate": 0.003, + "loss": 4.3129, + "step": 1439 + }, + { + "epoch": 0.0144, + "grad_norm": 0.7299011882512658, + "learning_rate": 0.003, + "loss": 4.3275, + "step": 1440 + }, + { + "epoch": 0.01441, + "grad_norm": 0.670750314419178, + "learning_rate": 0.003, + "loss": 4.3309, + "step": 1441 + }, + { + "epoch": 0.01442, + "grad_norm": 0.7691195459017623, + "learning_rate": 0.003, + "loss": 4.3219, + "step": 1442 + }, + { + "epoch": 0.01443, + "grad_norm": 0.7471914299871487, + "learning_rate": 0.003, + "loss": 4.3829, + "step": 1443 + }, + { + "epoch": 0.01444, + "grad_norm": 0.7410199276775529, + "learning_rate": 0.003, + "loss": 4.3445, + "step": 1444 + }, + { + "epoch": 0.01445, + "grad_norm": 0.6515002318773166, + "learning_rate": 0.003, + "loss": 4.3041, + "step": 1445 + }, + { + "epoch": 0.01446, + "grad_norm": 0.6781090959776701, + "learning_rate": 0.003, + "loss": 4.3207, + "step": 1446 + }, + { + "epoch": 0.01447, + "grad_norm": 0.6643065072159584, + "learning_rate": 0.003, + "loss": 4.3293, + "step": 1447 + }, + { + "epoch": 0.01448, + "grad_norm": 0.6532713033801402, + "learning_rate": 0.003, + "loss": 4.3375, + "step": 1448 + }, + { + "epoch": 0.01449, + "grad_norm": 0.6344501403492658, + "learning_rate": 0.003, + "loss": 4.3302, + "step": 1449 + }, + { + "epoch": 0.0145, + "grad_norm": 0.5404283744793761, + "learning_rate": 0.003, + "loss": 4.3401, + "step": 1450 + }, + { + "epoch": 0.01451, + "grad_norm": 0.5381895515345636, + "learning_rate": 0.003, + "loss": 4.3181, + "step": 1451 + }, + { + "epoch": 0.01452, + "grad_norm": 0.5085114651371503, + "learning_rate": 0.003, + "loss": 4.3176, + "step": 1452 + }, + { + "epoch": 0.01453, + "grad_norm": 0.5297212710147206, + "learning_rate": 0.003, + "loss": 4.3302, + "step": 1453 + }, + { + "epoch": 0.01454, + "grad_norm": 0.6082304279549751, + "learning_rate": 0.003, + "loss": 4.299, + "step": 1454 + }, + { + "epoch": 0.01455, + "grad_norm": 0.799761378996593, + "learning_rate": 0.003, + "loss": 4.3379, + "step": 1455 + }, + { + "epoch": 0.01456, + "grad_norm": 0.8200199061255905, + "learning_rate": 0.003, + "loss": 4.3248, + "step": 1456 + }, + { + "epoch": 0.01457, + "grad_norm": 0.6729328173983412, + "learning_rate": 0.003, + "loss": 4.3054, + "step": 1457 + }, + { + "epoch": 0.01458, + "grad_norm": 0.5761787071333488, + "learning_rate": 0.003, + "loss": 4.2955, + "step": 1458 + }, + { + "epoch": 0.01459, + "grad_norm": 0.6468019060138231, + "learning_rate": 0.003, + "loss": 4.3289, + "step": 1459 + }, + { + "epoch": 0.0146, + "grad_norm": 0.6012651007343053, + "learning_rate": 0.003, + "loss": 4.3283, + "step": 1460 + }, + { + "epoch": 0.01461, + "grad_norm": 0.4832305210154182, + "learning_rate": 0.003, + "loss": 4.3037, + "step": 1461 + }, + { + "epoch": 0.01462, + "grad_norm": 0.4933402571331549, + "learning_rate": 0.003, + "loss": 4.3088, + "step": 1462 + }, + { + "epoch": 0.01463, + "grad_norm": 0.4774020866387011, + "learning_rate": 0.003, + "loss": 4.3072, + "step": 1463 + }, + { + "epoch": 0.01464, + "grad_norm": 0.4604171826341042, + "learning_rate": 0.003, + "loss": 4.2821, + "step": 1464 + }, + { + "epoch": 0.01465, + "grad_norm": 0.4294089797917102, + "learning_rate": 0.003, + "loss": 4.3245, + "step": 1465 + }, + { + "epoch": 0.01466, + "grad_norm": 0.41929661999999984, + "learning_rate": 0.003, + "loss": 4.2919, + "step": 1466 + }, + { + "epoch": 0.01467, + "grad_norm": 0.4391785760355703, + "learning_rate": 0.003, + "loss": 4.3283, + "step": 1467 + }, + { + "epoch": 0.01468, + "grad_norm": 0.4538846759969807, + "learning_rate": 0.003, + "loss": 4.3037, + "step": 1468 + }, + { + "epoch": 0.01469, + "grad_norm": 0.5119886030394242, + "learning_rate": 0.003, + "loss": 4.3053, + "step": 1469 + }, + { + "epoch": 0.0147, + "grad_norm": 0.716327286811815, + "learning_rate": 0.003, + "loss": 4.3041, + "step": 1470 + }, + { + "epoch": 0.01471, + "grad_norm": 0.9150052503540761, + "learning_rate": 0.003, + "loss": 4.3274, + "step": 1471 + }, + { + "epoch": 0.01472, + "grad_norm": 0.7809983181295278, + "learning_rate": 0.003, + "loss": 4.3143, + "step": 1472 + }, + { + "epoch": 0.01473, + "grad_norm": 0.8086105929105535, + "learning_rate": 0.003, + "loss": 4.3289, + "step": 1473 + }, + { + "epoch": 0.01474, + "grad_norm": 0.8656063557310713, + "learning_rate": 0.003, + "loss": 4.3258, + "step": 1474 + }, + { + "epoch": 0.01475, + "grad_norm": 0.8637306499434225, + "learning_rate": 0.003, + "loss": 4.3398, + "step": 1475 + }, + { + "epoch": 0.01476, + "grad_norm": 0.6743755524690542, + "learning_rate": 0.003, + "loss": 4.3227, + "step": 1476 + }, + { + "epoch": 0.01477, + "grad_norm": 0.6414161546100536, + "learning_rate": 0.003, + "loss": 4.3256, + "step": 1477 + }, + { + "epoch": 0.01478, + "grad_norm": 0.7067557912668699, + "learning_rate": 0.003, + "loss": 4.3113, + "step": 1478 + }, + { + "epoch": 0.01479, + "grad_norm": 0.7427717638657159, + "learning_rate": 0.003, + "loss": 4.3499, + "step": 1479 + }, + { + "epoch": 0.0148, + "grad_norm": 0.7972486147312002, + "learning_rate": 0.003, + "loss": 4.3212, + "step": 1480 + }, + { + "epoch": 0.01481, + "grad_norm": 0.8238319142257903, + "learning_rate": 0.003, + "loss": 4.3261, + "step": 1481 + }, + { + "epoch": 0.01482, + "grad_norm": 0.7010334962030825, + "learning_rate": 0.003, + "loss": 4.3089, + "step": 1482 + }, + { + "epoch": 0.01483, + "grad_norm": 0.6287754589698554, + "learning_rate": 0.003, + "loss": 4.303, + "step": 1483 + }, + { + "epoch": 0.01484, + "grad_norm": 0.5953574922658559, + "learning_rate": 0.003, + "loss": 4.3534, + "step": 1484 + }, + { + "epoch": 0.01485, + "grad_norm": 0.6859671225358986, + "learning_rate": 0.003, + "loss": 4.3172, + "step": 1485 + }, + { + "epoch": 0.01486, + "grad_norm": 0.7393843625917039, + "learning_rate": 0.003, + "loss": 4.3181, + "step": 1486 + }, + { + "epoch": 0.01487, + "grad_norm": 0.6948333501030851, + "learning_rate": 0.003, + "loss": 4.3225, + "step": 1487 + }, + { + "epoch": 0.01488, + "grad_norm": 0.5806619385051136, + "learning_rate": 0.003, + "loss": 4.3259, + "step": 1488 + }, + { + "epoch": 0.01489, + "grad_norm": 0.6342989775695094, + "learning_rate": 0.003, + "loss": 4.3221, + "step": 1489 + }, + { + "epoch": 0.0149, + "grad_norm": 0.6704129451979153, + "learning_rate": 0.003, + "loss": 4.3147, + "step": 1490 + }, + { + "epoch": 0.01491, + "grad_norm": 0.7017206728292057, + "learning_rate": 0.003, + "loss": 4.3305, + "step": 1491 + }, + { + "epoch": 0.01492, + "grad_norm": 0.6813188063148864, + "learning_rate": 0.003, + "loss": 4.2961, + "step": 1492 + }, + { + "epoch": 0.01493, + "grad_norm": 0.8525559699799811, + "learning_rate": 0.003, + "loss": 4.3324, + "step": 1493 + }, + { + "epoch": 0.01494, + "grad_norm": 0.8687834663259932, + "learning_rate": 0.003, + "loss": 4.3213, + "step": 1494 + }, + { + "epoch": 0.01495, + "grad_norm": 0.7274136348295218, + "learning_rate": 0.003, + "loss": 4.3243, + "step": 1495 + }, + { + "epoch": 0.01496, + "grad_norm": 0.5434712910501038, + "learning_rate": 0.003, + "loss": 4.2923, + "step": 1496 + }, + { + "epoch": 0.01497, + "grad_norm": 0.6148760530691155, + "learning_rate": 0.003, + "loss": 4.2701, + "step": 1497 + }, + { + "epoch": 0.01498, + "grad_norm": 0.5665585769325068, + "learning_rate": 0.003, + "loss": 4.3224, + "step": 1498 + }, + { + "epoch": 0.01499, + "grad_norm": 0.543044486095261, + "learning_rate": 0.003, + "loss": 4.3408, + "step": 1499 + }, + { + "epoch": 0.015, + "grad_norm": 0.5868245294183819, + "learning_rate": 0.003, + "loss": 4.2952, + "step": 1500 + }, + { + "epoch": 0.01501, + "grad_norm": 0.6488345831013783, + "learning_rate": 0.003, + "loss": 4.2943, + "step": 1501 + }, + { + "epoch": 0.01502, + "grad_norm": 0.5560720951051904, + "learning_rate": 0.003, + "loss": 4.3322, + "step": 1502 + }, + { + "epoch": 0.01503, + "grad_norm": 0.6170651368212409, + "learning_rate": 0.003, + "loss": 4.3142, + "step": 1503 + }, + { + "epoch": 0.01504, + "grad_norm": 0.7253173696408437, + "learning_rate": 0.003, + "loss": 4.3199, + "step": 1504 + }, + { + "epoch": 0.01505, + "grad_norm": 0.7573152828389973, + "learning_rate": 0.003, + "loss": 4.3067, + "step": 1505 + }, + { + "epoch": 0.01506, + "grad_norm": 0.7119425101662294, + "learning_rate": 0.003, + "loss": 4.3233, + "step": 1506 + }, + { + "epoch": 0.01507, + "grad_norm": 0.7408719494100054, + "learning_rate": 0.003, + "loss": 4.3394, + "step": 1507 + }, + { + "epoch": 0.01508, + "grad_norm": 0.6023053184100382, + "learning_rate": 0.003, + "loss": 4.3108, + "step": 1508 + }, + { + "epoch": 0.01509, + "grad_norm": 0.5054062616818538, + "learning_rate": 0.003, + "loss": 4.3257, + "step": 1509 + }, + { + "epoch": 0.0151, + "grad_norm": 0.4440427822667444, + "learning_rate": 0.003, + "loss": 4.2952, + "step": 1510 + }, + { + "epoch": 0.01511, + "grad_norm": 0.503043960784533, + "learning_rate": 0.003, + "loss": 4.2838, + "step": 1511 + }, + { + "epoch": 0.01512, + "grad_norm": 0.4478910644887885, + "learning_rate": 0.003, + "loss": 4.2937, + "step": 1512 + }, + { + "epoch": 0.01513, + "grad_norm": 0.46313433916934627, + "learning_rate": 0.003, + "loss": 4.2989, + "step": 1513 + }, + { + "epoch": 0.01514, + "grad_norm": 0.5237062692888148, + "learning_rate": 0.003, + "loss": 4.3066, + "step": 1514 + }, + { + "epoch": 0.01515, + "grad_norm": 0.6686824710181497, + "learning_rate": 0.003, + "loss": 4.3056, + "step": 1515 + }, + { + "epoch": 0.01516, + "grad_norm": 0.8162150436928566, + "learning_rate": 0.003, + "loss": 4.3198, + "step": 1516 + }, + { + "epoch": 0.01517, + "grad_norm": 0.8250386268904906, + "learning_rate": 0.003, + "loss": 4.274, + "step": 1517 + }, + { + "epoch": 0.01518, + "grad_norm": 0.8551846044396628, + "learning_rate": 0.003, + "loss": 4.3102, + "step": 1518 + }, + { + "epoch": 0.01519, + "grad_norm": 0.7778994522735176, + "learning_rate": 0.003, + "loss": 4.3471, + "step": 1519 + }, + { + "epoch": 0.0152, + "grad_norm": 0.6407333772962789, + "learning_rate": 0.003, + "loss": 4.2901, + "step": 1520 + }, + { + "epoch": 0.01521, + "grad_norm": 0.590099483112598, + "learning_rate": 0.003, + "loss": 4.3053, + "step": 1521 + }, + { + "epoch": 0.01522, + "grad_norm": 0.6220605799864671, + "learning_rate": 0.003, + "loss": 4.3014, + "step": 1522 + }, + { + "epoch": 0.01523, + "grad_norm": 0.6378640000910417, + "learning_rate": 0.003, + "loss": 4.2957, + "step": 1523 + }, + { + "epoch": 0.01524, + "grad_norm": 0.6800153761279158, + "learning_rate": 0.003, + "loss": 4.3332, + "step": 1524 + }, + { + "epoch": 0.01525, + "grad_norm": 0.6552309843606878, + "learning_rate": 0.003, + "loss": 4.3306, + "step": 1525 + }, + { + "epoch": 0.01526, + "grad_norm": 0.5832236683536917, + "learning_rate": 0.003, + "loss": 4.3154, + "step": 1526 + }, + { + "epoch": 0.01527, + "grad_norm": 0.5923964968536319, + "learning_rate": 0.003, + "loss": 4.2868, + "step": 1527 + }, + { + "epoch": 0.01528, + "grad_norm": 0.6438689697714796, + "learning_rate": 0.003, + "loss": 4.3067, + "step": 1528 + }, + { + "epoch": 0.01529, + "grad_norm": 0.7863523458278316, + "learning_rate": 0.003, + "loss": 4.3071, + "step": 1529 + }, + { + "epoch": 0.0153, + "grad_norm": 0.8716684688082496, + "learning_rate": 0.003, + "loss": 4.3236, + "step": 1530 + }, + { + "epoch": 0.01531, + "grad_norm": 0.7632243352103186, + "learning_rate": 0.003, + "loss": 4.314, + "step": 1531 + }, + { + "epoch": 0.01532, + "grad_norm": 0.5830764209700208, + "learning_rate": 0.003, + "loss": 4.2863, + "step": 1532 + }, + { + "epoch": 0.01533, + "grad_norm": 0.6925578088506626, + "learning_rate": 0.003, + "loss": 4.3193, + "step": 1533 + }, + { + "epoch": 0.01534, + "grad_norm": 0.6884106279584329, + "learning_rate": 0.003, + "loss": 4.3131, + "step": 1534 + }, + { + "epoch": 0.01535, + "grad_norm": 0.612086877528724, + "learning_rate": 0.003, + "loss": 4.3216, + "step": 1535 + }, + { + "epoch": 0.01536, + "grad_norm": 0.6277711721176384, + "learning_rate": 0.003, + "loss": 4.3039, + "step": 1536 + }, + { + "epoch": 0.01537, + "grad_norm": 0.7930101311629847, + "learning_rate": 0.003, + "loss": 4.3078, + "step": 1537 + }, + { + "epoch": 0.01538, + "grad_norm": 0.8987557100700295, + "learning_rate": 0.003, + "loss": 4.2971, + "step": 1538 + }, + { + "epoch": 0.01539, + "grad_norm": 0.9391352557198052, + "learning_rate": 0.003, + "loss": 4.334, + "step": 1539 + }, + { + "epoch": 0.0154, + "grad_norm": 0.9440902063787158, + "learning_rate": 0.003, + "loss": 4.3487, + "step": 1540 + }, + { + "epoch": 0.01541, + "grad_norm": 0.9127476278073733, + "learning_rate": 0.003, + "loss": 4.328, + "step": 1541 + }, + { + "epoch": 0.01542, + "grad_norm": 0.7753546344208377, + "learning_rate": 0.003, + "loss": 4.3528, + "step": 1542 + }, + { + "epoch": 0.01543, + "grad_norm": 0.7317390533493494, + "learning_rate": 0.003, + "loss": 4.3318, + "step": 1543 + }, + { + "epoch": 0.01544, + "grad_norm": 0.5946634053593371, + "learning_rate": 0.003, + "loss": 4.3162, + "step": 1544 + }, + { + "epoch": 0.01545, + "grad_norm": 0.49808121797063876, + "learning_rate": 0.003, + "loss": 4.375, + "step": 1545 + }, + { + "epoch": 0.01546, + "grad_norm": 0.5189277397467036, + "learning_rate": 0.003, + "loss": 4.3435, + "step": 1546 + }, + { + "epoch": 0.01547, + "grad_norm": 0.5075409907094708, + "learning_rate": 0.003, + "loss": 4.3431, + "step": 1547 + }, + { + "epoch": 0.01548, + "grad_norm": 0.4269782583856954, + "learning_rate": 0.003, + "loss": 4.3016, + "step": 1548 + }, + { + "epoch": 0.01549, + "grad_norm": 0.44259739876123444, + "learning_rate": 0.003, + "loss": 4.2898, + "step": 1549 + }, + { + "epoch": 0.0155, + "grad_norm": 0.39476104723697186, + "learning_rate": 0.003, + "loss": 4.3072, + "step": 1550 + }, + { + "epoch": 0.01551, + "grad_norm": 0.39719568498427116, + "learning_rate": 0.003, + "loss": 4.2836, + "step": 1551 + }, + { + "epoch": 0.01552, + "grad_norm": 0.5112856286097273, + "learning_rate": 0.003, + "loss": 4.3059, + "step": 1552 + }, + { + "epoch": 0.01553, + "grad_norm": 0.6994822828905559, + "learning_rate": 0.003, + "loss": 4.3366, + "step": 1553 + }, + { + "epoch": 0.01554, + "grad_norm": 0.9918562900778292, + "learning_rate": 0.003, + "loss": 4.3337, + "step": 1554 + }, + { + "epoch": 0.01555, + "grad_norm": 0.8597749628867734, + "learning_rate": 0.003, + "loss": 4.3154, + "step": 1555 + }, + { + "epoch": 0.01556, + "grad_norm": 0.4859845337872918, + "learning_rate": 0.003, + "loss": 4.3299, + "step": 1556 + }, + { + "epoch": 0.01557, + "grad_norm": 0.8105266239108289, + "learning_rate": 0.003, + "loss": 4.317, + "step": 1557 + }, + { + "epoch": 0.01558, + "grad_norm": 0.6797261746777549, + "learning_rate": 0.003, + "loss": 4.3032, + "step": 1558 + }, + { + "epoch": 0.01559, + "grad_norm": 0.5540991472031954, + "learning_rate": 0.003, + "loss": 4.3349, + "step": 1559 + }, + { + "epoch": 0.0156, + "grad_norm": 0.5222478848499021, + "learning_rate": 0.003, + "loss": 4.3207, + "step": 1560 + }, + { + "epoch": 0.01561, + "grad_norm": 0.4547690370956997, + "learning_rate": 0.003, + "loss": 4.3006, + "step": 1561 + }, + { + "epoch": 0.01562, + "grad_norm": 0.3960981969234329, + "learning_rate": 0.003, + "loss": 4.2916, + "step": 1562 + }, + { + "epoch": 0.01563, + "grad_norm": 0.4365008440994641, + "learning_rate": 0.003, + "loss": 4.2998, + "step": 1563 + }, + { + "epoch": 0.01564, + "grad_norm": 0.4153235041333595, + "learning_rate": 0.003, + "loss": 4.3237, + "step": 1564 + }, + { + "epoch": 0.01565, + "grad_norm": 0.4172430370227481, + "learning_rate": 0.003, + "loss": 4.2984, + "step": 1565 + }, + { + "epoch": 0.01566, + "grad_norm": 0.4030631404816366, + "learning_rate": 0.003, + "loss": 4.3153, + "step": 1566 + }, + { + "epoch": 0.01567, + "grad_norm": 0.4006086575381243, + "learning_rate": 0.003, + "loss": 4.3082, + "step": 1567 + }, + { + "epoch": 0.01568, + "grad_norm": 0.4692595605101829, + "learning_rate": 0.003, + "loss": 4.2921, + "step": 1568 + }, + { + "epoch": 0.01569, + "grad_norm": 0.5156092105425303, + "learning_rate": 0.003, + "loss": 4.2975, + "step": 1569 + }, + { + "epoch": 0.0157, + "grad_norm": 0.5574730520335043, + "learning_rate": 0.003, + "loss": 4.3166, + "step": 1570 + }, + { + "epoch": 0.01571, + "grad_norm": 0.7147942584489586, + "learning_rate": 0.003, + "loss": 4.306, + "step": 1571 + }, + { + "epoch": 0.01572, + "grad_norm": 0.8176596681794123, + "learning_rate": 0.003, + "loss": 4.3202, + "step": 1572 + }, + { + "epoch": 0.01573, + "grad_norm": 0.8859299700656846, + "learning_rate": 0.003, + "loss": 4.311, + "step": 1573 + }, + { + "epoch": 0.01574, + "grad_norm": 1.0167563301248608, + "learning_rate": 0.003, + "loss": 4.3365, + "step": 1574 + }, + { + "epoch": 0.01575, + "grad_norm": 0.9276915846212054, + "learning_rate": 0.003, + "loss": 4.3012, + "step": 1575 + }, + { + "epoch": 0.01576, + "grad_norm": 0.7519027259067711, + "learning_rate": 0.003, + "loss": 4.317, + "step": 1576 + }, + { + "epoch": 0.01577, + "grad_norm": 0.5990656751852772, + "learning_rate": 0.003, + "loss": 4.2743, + "step": 1577 + }, + { + "epoch": 0.01578, + "grad_norm": 0.652141379980981, + "learning_rate": 0.003, + "loss": 4.3047, + "step": 1578 + }, + { + "epoch": 0.01579, + "grad_norm": 0.6130941880481341, + "learning_rate": 0.003, + "loss": 4.3001, + "step": 1579 + }, + { + "epoch": 0.0158, + "grad_norm": 0.6566738181349618, + "learning_rate": 0.003, + "loss": 4.3101, + "step": 1580 + }, + { + "epoch": 0.01581, + "grad_norm": 0.5602300919728743, + "learning_rate": 0.003, + "loss": 4.3154, + "step": 1581 + }, + { + "epoch": 0.01582, + "grad_norm": 0.6614515024819504, + "learning_rate": 0.003, + "loss": 4.3018, + "step": 1582 + }, + { + "epoch": 0.01583, + "grad_norm": 0.6685116238668469, + "learning_rate": 0.003, + "loss": 4.2884, + "step": 1583 + }, + { + "epoch": 0.01584, + "grad_norm": 0.7663983884039552, + "learning_rate": 0.003, + "loss": 4.3227, + "step": 1584 + }, + { + "epoch": 0.01585, + "grad_norm": 0.8471607724061854, + "learning_rate": 0.003, + "loss": 4.3183, + "step": 1585 + }, + { + "epoch": 0.01586, + "grad_norm": 0.8029205674223586, + "learning_rate": 0.003, + "loss": 4.2999, + "step": 1586 + }, + { + "epoch": 0.01587, + "grad_norm": 0.7335558361580696, + "learning_rate": 0.003, + "loss": 4.3092, + "step": 1587 + }, + { + "epoch": 0.01588, + "grad_norm": 0.6576426322376321, + "learning_rate": 0.003, + "loss": 4.3341, + "step": 1588 + }, + { + "epoch": 0.01589, + "grad_norm": 0.7314604361721292, + "learning_rate": 0.003, + "loss": 4.328, + "step": 1589 + }, + { + "epoch": 0.0159, + "grad_norm": 0.7484813941023133, + "learning_rate": 0.003, + "loss": 4.323, + "step": 1590 + }, + { + "epoch": 0.01591, + "grad_norm": 0.6836450786052791, + "learning_rate": 0.003, + "loss": 4.3264, + "step": 1591 + }, + { + "epoch": 0.01592, + "grad_norm": 0.6522043667699231, + "learning_rate": 0.003, + "loss": 4.3127, + "step": 1592 + }, + { + "epoch": 0.01593, + "grad_norm": 0.6311012555499182, + "learning_rate": 0.003, + "loss": 4.3174, + "step": 1593 + }, + { + "epoch": 0.01594, + "grad_norm": 0.5510623120073839, + "learning_rate": 0.003, + "loss": 4.3009, + "step": 1594 + }, + { + "epoch": 0.01595, + "grad_norm": 0.48793948155833994, + "learning_rate": 0.003, + "loss": 4.3248, + "step": 1595 + }, + { + "epoch": 0.01596, + "grad_norm": 0.42717068818596066, + "learning_rate": 0.003, + "loss": 4.3142, + "step": 1596 + }, + { + "epoch": 0.01597, + "grad_norm": 0.43677376148637953, + "learning_rate": 0.003, + "loss": 4.2773, + "step": 1597 + }, + { + "epoch": 0.01598, + "grad_norm": 0.47981748858846524, + "learning_rate": 0.003, + "loss": 4.2706, + "step": 1598 + }, + { + "epoch": 0.01599, + "grad_norm": 0.5068183957178387, + "learning_rate": 0.003, + "loss": 4.2944, + "step": 1599 + }, + { + "epoch": 0.016, + "grad_norm": 0.570411039839509, + "learning_rate": 0.003, + "loss": 4.261, + "step": 1600 + }, + { + "epoch": 0.01601, + "grad_norm": 0.7428825863420937, + "learning_rate": 0.003, + "loss": 4.3164, + "step": 1601 + }, + { + "epoch": 0.01602, + "grad_norm": 1.0316614845943484, + "learning_rate": 0.003, + "loss": 4.3056, + "step": 1602 + }, + { + "epoch": 0.01603, + "grad_norm": 0.9265092099146023, + "learning_rate": 0.003, + "loss": 4.3273, + "step": 1603 + }, + { + "epoch": 0.01604, + "grad_norm": 0.6714138671323193, + "learning_rate": 0.003, + "loss": 4.3099, + "step": 1604 + }, + { + "epoch": 0.01605, + "grad_norm": 0.7449186454711473, + "learning_rate": 0.003, + "loss": 4.2937, + "step": 1605 + }, + { + "epoch": 0.01606, + "grad_norm": 0.6551508425933098, + "learning_rate": 0.003, + "loss": 4.29, + "step": 1606 + }, + { + "epoch": 0.01607, + "grad_norm": 0.6368386814210635, + "learning_rate": 0.003, + "loss": 4.3196, + "step": 1607 + }, + { + "epoch": 0.01608, + "grad_norm": 0.6007545339051799, + "learning_rate": 0.003, + "loss": 4.287, + "step": 1608 + }, + { + "epoch": 0.01609, + "grad_norm": 0.5685481620694516, + "learning_rate": 0.003, + "loss": 4.2973, + "step": 1609 + }, + { + "epoch": 0.0161, + "grad_norm": 0.5084184586189073, + "learning_rate": 0.003, + "loss": 4.2592, + "step": 1610 + }, + { + "epoch": 0.01611, + "grad_norm": 0.4840833495127669, + "learning_rate": 0.003, + "loss": 4.2802, + "step": 1611 + }, + { + "epoch": 0.01612, + "grad_norm": 0.48942028074279065, + "learning_rate": 0.003, + "loss": 4.2702, + "step": 1612 + }, + { + "epoch": 0.01613, + "grad_norm": 0.5011285237133881, + "learning_rate": 0.003, + "loss": 4.315, + "step": 1613 + }, + { + "epoch": 0.01614, + "grad_norm": 0.43995362094592044, + "learning_rate": 0.003, + "loss": 4.3364, + "step": 1614 + }, + { + "epoch": 0.01615, + "grad_norm": 0.42434808753845177, + "learning_rate": 0.003, + "loss": 4.3023, + "step": 1615 + }, + { + "epoch": 0.01616, + "grad_norm": 0.40673555464346545, + "learning_rate": 0.003, + "loss": 4.2826, + "step": 1616 + }, + { + "epoch": 0.01617, + "grad_norm": 0.3578337742407294, + "learning_rate": 0.003, + "loss": 4.2706, + "step": 1617 + }, + { + "epoch": 0.01618, + "grad_norm": 0.41081917922167766, + "learning_rate": 0.003, + "loss": 4.2753, + "step": 1618 + }, + { + "epoch": 0.01619, + "grad_norm": 0.47886760935645833, + "learning_rate": 0.003, + "loss": 4.2659, + "step": 1619 + }, + { + "epoch": 0.0162, + "grad_norm": 0.6222321239813, + "learning_rate": 0.003, + "loss": 4.2767, + "step": 1620 + }, + { + "epoch": 0.01621, + "grad_norm": 0.6903491585345826, + "learning_rate": 0.003, + "loss": 4.2786, + "step": 1621 + }, + { + "epoch": 0.01622, + "grad_norm": 0.7573622136296652, + "learning_rate": 0.003, + "loss": 4.3118, + "step": 1622 + }, + { + "epoch": 0.01623, + "grad_norm": 0.9256995453444036, + "learning_rate": 0.003, + "loss": 4.3026, + "step": 1623 + }, + { + "epoch": 0.01624, + "grad_norm": 0.9558305805696413, + "learning_rate": 0.003, + "loss": 4.3216, + "step": 1624 + }, + { + "epoch": 0.01625, + "grad_norm": 0.8114385487348762, + "learning_rate": 0.003, + "loss": 4.2898, + "step": 1625 + }, + { + "epoch": 0.01626, + "grad_norm": 0.8322826717160557, + "learning_rate": 0.003, + "loss": 4.3177, + "step": 1626 + }, + { + "epoch": 0.01627, + "grad_norm": 0.7727713367753223, + "learning_rate": 0.003, + "loss": 4.2816, + "step": 1627 + }, + { + "epoch": 0.01628, + "grad_norm": 0.7109224896248458, + "learning_rate": 0.003, + "loss": 4.2929, + "step": 1628 + }, + { + "epoch": 0.01629, + "grad_norm": 0.560897124973334, + "learning_rate": 0.003, + "loss": 4.2858, + "step": 1629 + }, + { + "epoch": 0.0163, + "grad_norm": 0.6779380557060044, + "learning_rate": 0.003, + "loss": 4.2826, + "step": 1630 + }, + { + "epoch": 0.01631, + "grad_norm": 0.6886638086492523, + "learning_rate": 0.003, + "loss": 4.3011, + "step": 1631 + }, + { + "epoch": 0.01632, + "grad_norm": 0.6589349584439033, + "learning_rate": 0.003, + "loss": 4.3124, + "step": 1632 + }, + { + "epoch": 0.01633, + "grad_norm": 0.7155276302347654, + "learning_rate": 0.003, + "loss": 4.2893, + "step": 1633 + }, + { + "epoch": 0.01634, + "grad_norm": 0.7783909364325312, + "learning_rate": 0.003, + "loss": 4.3094, + "step": 1634 + }, + { + "epoch": 0.01635, + "grad_norm": 0.8288192042951168, + "learning_rate": 0.003, + "loss": 4.2952, + "step": 1635 + }, + { + "epoch": 0.01636, + "grad_norm": 0.8309682765627266, + "learning_rate": 0.003, + "loss": 4.3157, + "step": 1636 + }, + { + "epoch": 0.01637, + "grad_norm": 0.7885672766922769, + "learning_rate": 0.003, + "loss": 4.2985, + "step": 1637 + }, + { + "epoch": 0.01638, + "grad_norm": 0.6820787883112478, + "learning_rate": 0.003, + "loss": 4.3026, + "step": 1638 + }, + { + "epoch": 0.01639, + "grad_norm": 0.5912531628997083, + "learning_rate": 0.003, + "loss": 4.3221, + "step": 1639 + }, + { + "epoch": 0.0164, + "grad_norm": 0.7081315099543188, + "learning_rate": 0.003, + "loss": 4.3062, + "step": 1640 + }, + { + "epoch": 0.01641, + "grad_norm": 0.7390837101488571, + "learning_rate": 0.003, + "loss": 4.279, + "step": 1641 + }, + { + "epoch": 0.01642, + "grad_norm": 0.8019499352702314, + "learning_rate": 0.003, + "loss": 4.3244, + "step": 1642 + }, + { + "epoch": 0.01643, + "grad_norm": 0.7922382978586373, + "learning_rate": 0.003, + "loss": 4.3126, + "step": 1643 + }, + { + "epoch": 0.01644, + "grad_norm": 0.7260441686104165, + "learning_rate": 0.003, + "loss": 4.3282, + "step": 1644 + }, + { + "epoch": 0.01645, + "grad_norm": 0.5765166219346322, + "learning_rate": 0.003, + "loss": 4.2686, + "step": 1645 + }, + { + "epoch": 0.01646, + "grad_norm": 0.4846869130781467, + "learning_rate": 0.003, + "loss": 4.298, + "step": 1646 + }, + { + "epoch": 0.01647, + "grad_norm": 0.4324516045425632, + "learning_rate": 0.003, + "loss": 4.294, + "step": 1647 + }, + { + "epoch": 0.01648, + "grad_norm": 0.45801960619516824, + "learning_rate": 0.003, + "loss": 4.2725, + "step": 1648 + }, + { + "epoch": 0.01649, + "grad_norm": 0.4519054523297515, + "learning_rate": 0.003, + "loss": 4.3034, + "step": 1649 + }, + { + "epoch": 0.0165, + "grad_norm": 0.5030069407293299, + "learning_rate": 0.003, + "loss": 4.2846, + "step": 1650 + }, + { + "epoch": 0.01651, + "grad_norm": 0.7338143580585036, + "learning_rate": 0.003, + "loss": 4.2988, + "step": 1651 + }, + { + "epoch": 0.01652, + "grad_norm": 0.9179472565177694, + "learning_rate": 0.003, + "loss": 4.2688, + "step": 1652 + }, + { + "epoch": 0.01653, + "grad_norm": 0.7974402781604697, + "learning_rate": 0.003, + "loss": 4.307, + "step": 1653 + }, + { + "epoch": 0.01654, + "grad_norm": 0.8094858670947136, + "learning_rate": 0.003, + "loss": 4.2972, + "step": 1654 + }, + { + "epoch": 0.01655, + "grad_norm": 0.8204525750124729, + "learning_rate": 0.003, + "loss": 4.313, + "step": 1655 + }, + { + "epoch": 0.01656, + "grad_norm": 0.6293302149986666, + "learning_rate": 0.003, + "loss": 4.271, + "step": 1656 + }, + { + "epoch": 0.01657, + "grad_norm": 0.5706278835378605, + "learning_rate": 0.003, + "loss": 4.293, + "step": 1657 + }, + { + "epoch": 0.01658, + "grad_norm": 0.6110254736121277, + "learning_rate": 0.003, + "loss": 4.2809, + "step": 1658 + }, + { + "epoch": 0.01659, + "grad_norm": 0.4984603935459828, + "learning_rate": 0.003, + "loss": 4.2824, + "step": 1659 + }, + { + "epoch": 0.0166, + "grad_norm": 0.46804303085612786, + "learning_rate": 0.003, + "loss": 4.2649, + "step": 1660 + }, + { + "epoch": 0.01661, + "grad_norm": 0.48879681023532395, + "learning_rate": 0.003, + "loss": 4.2671, + "step": 1661 + }, + { + "epoch": 0.01662, + "grad_norm": 0.5571765680768687, + "learning_rate": 0.003, + "loss": 4.2719, + "step": 1662 + }, + { + "epoch": 0.01663, + "grad_norm": 0.532882366154707, + "learning_rate": 0.003, + "loss": 4.294, + "step": 1663 + }, + { + "epoch": 0.01664, + "grad_norm": 0.5088338427561463, + "learning_rate": 0.003, + "loss": 4.3094, + "step": 1664 + }, + { + "epoch": 0.01665, + "grad_norm": 0.4963301596331138, + "learning_rate": 0.003, + "loss": 4.2712, + "step": 1665 + }, + { + "epoch": 0.01666, + "grad_norm": 0.5472162068898109, + "learning_rate": 0.003, + "loss": 4.2732, + "step": 1666 + }, + { + "epoch": 0.01667, + "grad_norm": 0.6863444215859392, + "learning_rate": 0.003, + "loss": 4.2951, + "step": 1667 + }, + { + "epoch": 0.01668, + "grad_norm": 0.7945747889757943, + "learning_rate": 0.003, + "loss": 4.3019, + "step": 1668 + }, + { + "epoch": 0.01669, + "grad_norm": 0.7029756676891371, + "learning_rate": 0.003, + "loss": 4.2868, + "step": 1669 + }, + { + "epoch": 0.0167, + "grad_norm": 0.564779057565769, + "learning_rate": 0.003, + "loss": 4.2701, + "step": 1670 + }, + { + "epoch": 0.01671, + "grad_norm": 0.5946383928046679, + "learning_rate": 0.003, + "loss": 4.2602, + "step": 1671 + }, + { + "epoch": 0.01672, + "grad_norm": 0.7328231067257597, + "learning_rate": 0.003, + "loss": 4.2779, + "step": 1672 + }, + { + "epoch": 0.01673, + "grad_norm": 0.7730808965686631, + "learning_rate": 0.003, + "loss": 4.2759, + "step": 1673 + }, + { + "epoch": 0.01674, + "grad_norm": 0.8250670120229329, + "learning_rate": 0.003, + "loss": 4.3013, + "step": 1674 + }, + { + "epoch": 0.01675, + "grad_norm": 0.7145691985141881, + "learning_rate": 0.003, + "loss": 4.2806, + "step": 1675 + }, + { + "epoch": 0.01676, + "grad_norm": 0.6268084931910631, + "learning_rate": 0.003, + "loss": 4.2872, + "step": 1676 + }, + { + "epoch": 0.01677, + "grad_norm": 0.6170941899124056, + "learning_rate": 0.003, + "loss": 4.2772, + "step": 1677 + }, + { + "epoch": 0.01678, + "grad_norm": 0.5951034196601164, + "learning_rate": 0.003, + "loss": 4.2879, + "step": 1678 + }, + { + "epoch": 0.01679, + "grad_norm": 0.5254502211849061, + "learning_rate": 0.003, + "loss": 4.2679, + "step": 1679 + }, + { + "epoch": 0.0168, + "grad_norm": 0.6096842133031645, + "learning_rate": 0.003, + "loss": 4.2892, + "step": 1680 + }, + { + "epoch": 0.01681, + "grad_norm": 0.6218177000062469, + "learning_rate": 0.003, + "loss": 4.2892, + "step": 1681 + }, + { + "epoch": 0.01682, + "grad_norm": 0.5832829705689185, + "learning_rate": 0.003, + "loss": 4.2817, + "step": 1682 + }, + { + "epoch": 0.01683, + "grad_norm": 0.5910768318994952, + "learning_rate": 0.003, + "loss": 4.2959, + "step": 1683 + }, + { + "epoch": 0.01684, + "grad_norm": 0.6050344929193974, + "learning_rate": 0.003, + "loss": 4.2707, + "step": 1684 + }, + { + "epoch": 0.01685, + "grad_norm": 0.6382986956510986, + "learning_rate": 0.003, + "loss": 4.2898, + "step": 1685 + }, + { + "epoch": 0.01686, + "grad_norm": 0.8077022270811443, + "learning_rate": 0.003, + "loss": 4.2942, + "step": 1686 + }, + { + "epoch": 0.01687, + "grad_norm": 0.8169553039156104, + "learning_rate": 0.003, + "loss": 4.2925, + "step": 1687 + }, + { + "epoch": 0.01688, + "grad_norm": 0.8368087154638648, + "learning_rate": 0.003, + "loss": 4.2843, + "step": 1688 + }, + { + "epoch": 0.01689, + "grad_norm": 0.869445348031783, + "learning_rate": 0.003, + "loss": 4.3002, + "step": 1689 + }, + { + "epoch": 0.0169, + "grad_norm": 0.9062293741017381, + "learning_rate": 0.003, + "loss": 4.2906, + "step": 1690 + }, + { + "epoch": 0.01691, + "grad_norm": 0.8487759964259872, + "learning_rate": 0.003, + "loss": 4.3075, + "step": 1691 + }, + { + "epoch": 0.01692, + "grad_norm": 0.8483641489848167, + "learning_rate": 0.003, + "loss": 4.3088, + "step": 1692 + }, + { + "epoch": 0.01693, + "grad_norm": 0.8279189520542338, + "learning_rate": 0.003, + "loss": 4.3369, + "step": 1693 + }, + { + "epoch": 0.01694, + "grad_norm": 0.8089873138093994, + "learning_rate": 0.003, + "loss": 4.2887, + "step": 1694 + }, + { + "epoch": 0.01695, + "grad_norm": 0.9455670720071184, + "learning_rate": 0.003, + "loss": 4.3248, + "step": 1695 + }, + { + "epoch": 0.01696, + "grad_norm": 1.064016692244757, + "learning_rate": 0.003, + "loss": 4.3315, + "step": 1696 + }, + { + "epoch": 0.01697, + "grad_norm": 0.9264587036694925, + "learning_rate": 0.003, + "loss": 4.302, + "step": 1697 + }, + { + "epoch": 0.01698, + "grad_norm": 0.6997503897611733, + "learning_rate": 0.003, + "loss": 4.289, + "step": 1698 + }, + { + "epoch": 0.01699, + "grad_norm": 0.6394457308265625, + "learning_rate": 0.003, + "loss": 4.2969, + "step": 1699 + }, + { + "epoch": 0.017, + "grad_norm": 0.5891910529153762, + "learning_rate": 0.003, + "loss": 4.3094, + "step": 1700 + }, + { + "epoch": 0.01701, + "grad_norm": 0.6435550929707158, + "learning_rate": 0.003, + "loss": 4.2939, + "step": 1701 + }, + { + "epoch": 0.01702, + "grad_norm": 0.6810306253220882, + "learning_rate": 0.003, + "loss": 4.3163, + "step": 1702 + }, + { + "epoch": 0.01703, + "grad_norm": 0.6849363850272787, + "learning_rate": 0.003, + "loss": 4.3034, + "step": 1703 + }, + { + "epoch": 0.01704, + "grad_norm": 0.6708561417744424, + "learning_rate": 0.003, + "loss": 4.2869, + "step": 1704 + }, + { + "epoch": 0.01705, + "grad_norm": 0.5968699215981799, + "learning_rate": 0.003, + "loss": 4.2879, + "step": 1705 + }, + { + "epoch": 0.01706, + "grad_norm": 0.505702133399309, + "learning_rate": 0.003, + "loss": 4.2792, + "step": 1706 + }, + { + "epoch": 0.01707, + "grad_norm": 0.5302615087387699, + "learning_rate": 0.003, + "loss": 4.3011, + "step": 1707 + }, + { + "epoch": 0.01708, + "grad_norm": 0.5210003345198112, + "learning_rate": 0.003, + "loss": 4.2762, + "step": 1708 + }, + { + "epoch": 0.01709, + "grad_norm": 0.46584995061994, + "learning_rate": 0.003, + "loss": 4.2953, + "step": 1709 + }, + { + "epoch": 0.0171, + "grad_norm": 0.42345394468243425, + "learning_rate": 0.003, + "loss": 4.2414, + "step": 1710 + }, + { + "epoch": 0.01711, + "grad_norm": 0.43754438467909607, + "learning_rate": 0.003, + "loss": 4.2717, + "step": 1711 + }, + { + "epoch": 0.01712, + "grad_norm": 0.41267034274673536, + "learning_rate": 0.003, + "loss": 4.2801, + "step": 1712 + }, + { + "epoch": 0.01713, + "grad_norm": 0.3568885732041511, + "learning_rate": 0.003, + "loss": 4.2523, + "step": 1713 + }, + { + "epoch": 0.01714, + "grad_norm": 0.3457637645495226, + "learning_rate": 0.003, + "loss": 4.2876, + "step": 1714 + }, + { + "epoch": 0.01715, + "grad_norm": 0.3722023563902025, + "learning_rate": 0.003, + "loss": 4.2958, + "step": 1715 + }, + { + "epoch": 0.01716, + "grad_norm": 0.3390050605620623, + "learning_rate": 0.003, + "loss": 4.2697, + "step": 1716 + }, + { + "epoch": 0.01717, + "grad_norm": 0.4093688383515523, + "learning_rate": 0.003, + "loss": 4.3028, + "step": 1717 + }, + { + "epoch": 0.01718, + "grad_norm": 0.5445666194239754, + "learning_rate": 0.003, + "loss": 4.2735, + "step": 1718 + }, + { + "epoch": 0.01719, + "grad_norm": 0.8818554274614839, + "learning_rate": 0.003, + "loss": 4.3037, + "step": 1719 + }, + { + "epoch": 0.0172, + "grad_norm": 1.123998698583038, + "learning_rate": 0.003, + "loss": 4.2879, + "step": 1720 + }, + { + "epoch": 0.01721, + "grad_norm": 0.6518188394707584, + "learning_rate": 0.003, + "loss": 4.2815, + "step": 1721 + }, + { + "epoch": 0.01722, + "grad_norm": 0.7473961890284897, + "learning_rate": 0.003, + "loss": 4.281, + "step": 1722 + }, + { + "epoch": 0.01723, + "grad_norm": 0.7241082541176558, + "learning_rate": 0.003, + "loss": 4.2942, + "step": 1723 + }, + { + "epoch": 0.01724, + "grad_norm": 0.5406243852034243, + "learning_rate": 0.003, + "loss": 4.2839, + "step": 1724 + }, + { + "epoch": 0.01725, + "grad_norm": 0.7661047717663526, + "learning_rate": 0.003, + "loss": 4.2758, + "step": 1725 + }, + { + "epoch": 0.01726, + "grad_norm": 0.6519733535814134, + "learning_rate": 0.003, + "loss": 4.2678, + "step": 1726 + }, + { + "epoch": 0.01727, + "grad_norm": 0.5095447102684948, + "learning_rate": 0.003, + "loss": 4.2681, + "step": 1727 + }, + { + "epoch": 0.01728, + "grad_norm": 0.5716779713985857, + "learning_rate": 0.003, + "loss": 4.2703, + "step": 1728 + }, + { + "epoch": 0.01729, + "grad_norm": 0.5506488662519434, + "learning_rate": 0.003, + "loss": 4.2777, + "step": 1729 + }, + { + "epoch": 0.0173, + "grad_norm": 0.4864830783539529, + "learning_rate": 0.003, + "loss": 4.2498, + "step": 1730 + }, + { + "epoch": 0.01731, + "grad_norm": 0.49289174915320866, + "learning_rate": 0.003, + "loss": 4.2691, + "step": 1731 + }, + { + "epoch": 0.01732, + "grad_norm": 0.5258408981010323, + "learning_rate": 0.003, + "loss": 4.2782, + "step": 1732 + }, + { + "epoch": 0.01733, + "grad_norm": 0.5689089752248919, + "learning_rate": 0.003, + "loss": 4.2578, + "step": 1733 + }, + { + "epoch": 0.01734, + "grad_norm": 0.5515499021599949, + "learning_rate": 0.003, + "loss": 4.2704, + "step": 1734 + }, + { + "epoch": 0.01735, + "grad_norm": 0.5101191368780025, + "learning_rate": 0.003, + "loss": 4.2394, + "step": 1735 + }, + { + "epoch": 0.01736, + "grad_norm": 0.5538259418110674, + "learning_rate": 0.003, + "loss": 4.2405, + "step": 1736 + }, + { + "epoch": 0.01737, + "grad_norm": 0.6368154932756357, + "learning_rate": 0.003, + "loss": 4.273, + "step": 1737 + }, + { + "epoch": 0.01738, + "grad_norm": 0.6302527957263243, + "learning_rate": 0.003, + "loss": 4.2633, + "step": 1738 + }, + { + "epoch": 0.01739, + "grad_norm": 0.5659864529447296, + "learning_rate": 0.003, + "loss": 4.2757, + "step": 1739 + }, + { + "epoch": 0.0174, + "grad_norm": 0.6034857544005698, + "learning_rate": 0.003, + "loss": 4.278, + "step": 1740 + }, + { + "epoch": 0.01741, + "grad_norm": 0.7023281656557057, + "learning_rate": 0.003, + "loss": 4.2883, + "step": 1741 + }, + { + "epoch": 0.01742, + "grad_norm": 0.7127714842932543, + "learning_rate": 0.003, + "loss": 4.2766, + "step": 1742 + }, + { + "epoch": 0.01743, + "grad_norm": 0.7178940660267726, + "learning_rate": 0.003, + "loss": 4.2889, + "step": 1743 + }, + { + "epoch": 0.01744, + "grad_norm": 0.770440209657255, + "learning_rate": 0.003, + "loss": 4.2773, + "step": 1744 + }, + { + "epoch": 0.01745, + "grad_norm": 0.7362175602563142, + "learning_rate": 0.003, + "loss": 4.2484, + "step": 1745 + }, + { + "epoch": 0.01746, + "grad_norm": 0.7588375597390518, + "learning_rate": 0.003, + "loss": 4.2719, + "step": 1746 + }, + { + "epoch": 0.01747, + "grad_norm": 0.6839956521205998, + "learning_rate": 0.003, + "loss": 4.267, + "step": 1747 + }, + { + "epoch": 0.01748, + "grad_norm": 0.6088836411369196, + "learning_rate": 0.003, + "loss": 4.2555, + "step": 1748 + }, + { + "epoch": 0.01749, + "grad_norm": 0.548971540706334, + "learning_rate": 0.003, + "loss": 4.2516, + "step": 1749 + }, + { + "epoch": 0.0175, + "grad_norm": 0.5440778124239496, + "learning_rate": 0.003, + "loss": 4.2665, + "step": 1750 + }, + { + "epoch": 0.01751, + "grad_norm": 0.5918941393242653, + "learning_rate": 0.003, + "loss": 4.2593, + "step": 1751 + }, + { + "epoch": 0.01752, + "grad_norm": 0.6455620934648258, + "learning_rate": 0.003, + "loss": 4.3014, + "step": 1752 + }, + { + "epoch": 0.01753, + "grad_norm": 0.6781800073296501, + "learning_rate": 0.003, + "loss": 4.2535, + "step": 1753 + }, + { + "epoch": 0.01754, + "grad_norm": 0.6882980076107725, + "learning_rate": 0.003, + "loss": 4.2853, + "step": 1754 + }, + { + "epoch": 0.01755, + "grad_norm": 0.7961632146248975, + "learning_rate": 0.003, + "loss": 4.233, + "step": 1755 + }, + { + "epoch": 0.01756, + "grad_norm": 0.8494780429534167, + "learning_rate": 0.003, + "loss": 4.314, + "step": 1756 + }, + { + "epoch": 0.01757, + "grad_norm": 0.9294824362906122, + "learning_rate": 0.003, + "loss": 4.2987, + "step": 1757 + }, + { + "epoch": 0.01758, + "grad_norm": 0.852175947314296, + "learning_rate": 0.003, + "loss": 4.3126, + "step": 1758 + }, + { + "epoch": 0.01759, + "grad_norm": 0.7347749740307682, + "learning_rate": 0.003, + "loss": 4.2719, + "step": 1759 + }, + { + "epoch": 0.0176, + "grad_norm": 0.832142747460363, + "learning_rate": 0.003, + "loss": 4.2994, + "step": 1760 + }, + { + "epoch": 0.01761, + "grad_norm": 0.8686282951443787, + "learning_rate": 0.003, + "loss": 4.2965, + "step": 1761 + }, + { + "epoch": 0.01762, + "grad_norm": 0.9408046252281581, + "learning_rate": 0.003, + "loss": 4.3172, + "step": 1762 + }, + { + "epoch": 0.01763, + "grad_norm": 0.8594428065536988, + "learning_rate": 0.003, + "loss": 4.3092, + "step": 1763 + }, + { + "epoch": 0.01764, + "grad_norm": 0.7749341487974934, + "learning_rate": 0.003, + "loss": 4.2754, + "step": 1764 + }, + { + "epoch": 0.01765, + "grad_norm": 0.7897730382355919, + "learning_rate": 0.003, + "loss": 4.3076, + "step": 1765 + }, + { + "epoch": 0.01766, + "grad_norm": 0.7174878978412247, + "learning_rate": 0.003, + "loss": 4.317, + "step": 1766 + }, + { + "epoch": 0.01767, + "grad_norm": 0.7171956437495152, + "learning_rate": 0.003, + "loss": 4.2757, + "step": 1767 + }, + { + "epoch": 0.01768, + "grad_norm": 0.8077487259776668, + "learning_rate": 0.003, + "loss": 4.2866, + "step": 1768 + }, + { + "epoch": 0.01769, + "grad_norm": 0.7100605985326003, + "learning_rate": 0.003, + "loss": 4.2811, + "step": 1769 + }, + { + "epoch": 0.0177, + "grad_norm": 0.578964917508603, + "learning_rate": 0.003, + "loss": 4.2635, + "step": 1770 + }, + { + "epoch": 0.01771, + "grad_norm": 0.6086271828330341, + "learning_rate": 0.003, + "loss": 4.2929, + "step": 1771 + }, + { + "epoch": 0.01772, + "grad_norm": 0.497546957954685, + "learning_rate": 0.003, + "loss": 4.262, + "step": 1772 + }, + { + "epoch": 0.01773, + "grad_norm": 0.490787842661778, + "learning_rate": 0.003, + "loss": 4.2539, + "step": 1773 + }, + { + "epoch": 0.01774, + "grad_norm": 0.40528808868462435, + "learning_rate": 0.003, + "loss": 4.272, + "step": 1774 + }, + { + "epoch": 0.01775, + "grad_norm": 0.3996997018074339, + "learning_rate": 0.003, + "loss": 4.2574, + "step": 1775 + }, + { + "epoch": 0.01776, + "grad_norm": 0.40093115740510726, + "learning_rate": 0.003, + "loss": 4.2491, + "step": 1776 + }, + { + "epoch": 0.01777, + "grad_norm": 0.4577551703461772, + "learning_rate": 0.003, + "loss": 4.27, + "step": 1777 + }, + { + "epoch": 0.01778, + "grad_norm": 0.4894775985846586, + "learning_rate": 0.003, + "loss": 4.262, + "step": 1778 + }, + { + "epoch": 0.01779, + "grad_norm": 0.619715296261353, + "learning_rate": 0.003, + "loss": 4.2821, + "step": 1779 + }, + { + "epoch": 0.0178, + "grad_norm": 0.8092821046637155, + "learning_rate": 0.003, + "loss": 4.2576, + "step": 1780 + }, + { + "epoch": 0.01781, + "grad_norm": 1.007368746628607, + "learning_rate": 0.003, + "loss": 4.2833, + "step": 1781 + }, + { + "epoch": 0.01782, + "grad_norm": 1.0605028910459124, + "learning_rate": 0.003, + "loss": 4.314, + "step": 1782 + }, + { + "epoch": 0.01783, + "grad_norm": 0.8361916169047299, + "learning_rate": 0.003, + "loss": 4.2912, + "step": 1783 + }, + { + "epoch": 0.01784, + "grad_norm": 0.9950086095977337, + "learning_rate": 0.003, + "loss": 4.2963, + "step": 1784 + }, + { + "epoch": 0.01785, + "grad_norm": 1.0228877834258285, + "learning_rate": 0.003, + "loss": 4.2673, + "step": 1785 + }, + { + "epoch": 0.01786, + "grad_norm": 0.8766088407747356, + "learning_rate": 0.003, + "loss": 4.2899, + "step": 1786 + }, + { + "epoch": 0.01787, + "grad_norm": 0.9346024274483619, + "learning_rate": 0.003, + "loss": 4.2991, + "step": 1787 + }, + { + "epoch": 0.01788, + "grad_norm": 0.8655394124739175, + "learning_rate": 0.003, + "loss": 4.3097, + "step": 1788 + }, + { + "epoch": 0.01789, + "grad_norm": 0.7320722132470949, + "learning_rate": 0.003, + "loss": 4.2742, + "step": 1789 + }, + { + "epoch": 0.0179, + "grad_norm": 0.6043898276387406, + "learning_rate": 0.003, + "loss": 4.288, + "step": 1790 + }, + { + "epoch": 0.01791, + "grad_norm": 0.551731430114501, + "learning_rate": 0.003, + "loss": 4.2984, + "step": 1791 + }, + { + "epoch": 0.01792, + "grad_norm": 0.4116411374695503, + "learning_rate": 0.003, + "loss": 4.2944, + "step": 1792 + }, + { + "epoch": 0.01793, + "grad_norm": 0.4368353784765224, + "learning_rate": 0.003, + "loss": 4.2574, + "step": 1793 + }, + { + "epoch": 0.01794, + "grad_norm": 0.4134683249639472, + "learning_rate": 0.003, + "loss": 4.2696, + "step": 1794 + }, + { + "epoch": 0.01795, + "grad_norm": 0.3898886643379656, + "learning_rate": 0.003, + "loss": 4.2661, + "step": 1795 + }, + { + "epoch": 0.01796, + "grad_norm": 0.3690232703108766, + "learning_rate": 0.003, + "loss": 4.2797, + "step": 1796 + }, + { + "epoch": 0.01797, + "grad_norm": 0.3352292075992397, + "learning_rate": 0.003, + "loss": 4.2421, + "step": 1797 + }, + { + "epoch": 0.01798, + "grad_norm": 0.3577790623133374, + "learning_rate": 0.003, + "loss": 4.2547, + "step": 1798 + }, + { + "epoch": 0.01799, + "grad_norm": 0.40125449243674877, + "learning_rate": 0.003, + "loss": 4.269, + "step": 1799 + }, + { + "epoch": 0.018, + "grad_norm": 0.4677938601833102, + "learning_rate": 0.003, + "loss": 4.26, + "step": 1800 + }, + { + "epoch": 0.01801, + "grad_norm": 0.5610552208097703, + "learning_rate": 0.003, + "loss": 4.2554, + "step": 1801 + }, + { + "epoch": 0.01802, + "grad_norm": 0.5864236478232465, + "learning_rate": 0.003, + "loss": 4.2736, + "step": 1802 + }, + { + "epoch": 0.01803, + "grad_norm": 0.540429672022444, + "learning_rate": 0.003, + "loss": 4.2776, + "step": 1803 + }, + { + "epoch": 0.01804, + "grad_norm": 0.60459180760597, + "learning_rate": 0.003, + "loss": 4.2503, + "step": 1804 + }, + { + "epoch": 0.01805, + "grad_norm": 0.6945199909530504, + "learning_rate": 0.003, + "loss": 4.2727, + "step": 1805 + }, + { + "epoch": 0.01806, + "grad_norm": 0.6888751318415884, + "learning_rate": 0.003, + "loss": 4.2338, + "step": 1806 + }, + { + "epoch": 0.01807, + "grad_norm": 0.652641222784165, + "learning_rate": 0.003, + "loss": 4.2674, + "step": 1807 + }, + { + "epoch": 0.01808, + "grad_norm": 0.7163920878946057, + "learning_rate": 0.003, + "loss": 4.2613, + "step": 1808 + }, + { + "epoch": 0.01809, + "grad_norm": 0.7231527059402414, + "learning_rate": 0.003, + "loss": 4.284, + "step": 1809 + }, + { + "epoch": 0.0181, + "grad_norm": 0.7979758654578836, + "learning_rate": 0.003, + "loss": 4.2839, + "step": 1810 + }, + { + "epoch": 0.01811, + "grad_norm": 0.7630566976684018, + "learning_rate": 0.003, + "loss": 4.2692, + "step": 1811 + }, + { + "epoch": 0.01812, + "grad_norm": 0.68601985737994, + "learning_rate": 0.003, + "loss": 4.2545, + "step": 1812 + }, + { + "epoch": 0.01813, + "grad_norm": 0.601926611394952, + "learning_rate": 0.003, + "loss": 4.2949, + "step": 1813 + }, + { + "epoch": 0.01814, + "grad_norm": 0.66852657571521, + "learning_rate": 0.003, + "loss": 4.2647, + "step": 1814 + }, + { + "epoch": 0.01815, + "grad_norm": 0.6050905896281803, + "learning_rate": 0.003, + "loss": 4.2474, + "step": 1815 + }, + { + "epoch": 0.01816, + "grad_norm": 0.531953848915138, + "learning_rate": 0.003, + "loss": 4.2731, + "step": 1816 + }, + { + "epoch": 0.01817, + "grad_norm": 0.6652005043351588, + "learning_rate": 0.003, + "loss": 4.2676, + "step": 1817 + }, + { + "epoch": 0.01818, + "grad_norm": 0.6653492631522442, + "learning_rate": 0.003, + "loss": 4.2669, + "step": 1818 + }, + { + "epoch": 0.01819, + "grad_norm": 0.6430139858317634, + "learning_rate": 0.003, + "loss": 4.2522, + "step": 1819 + }, + { + "epoch": 0.0182, + "grad_norm": 0.7307512465670702, + "learning_rate": 0.003, + "loss": 4.2731, + "step": 1820 + }, + { + "epoch": 0.01821, + "grad_norm": 0.7117919312146421, + "learning_rate": 0.003, + "loss": 4.2608, + "step": 1821 + }, + { + "epoch": 0.01822, + "grad_norm": 0.6849992334976018, + "learning_rate": 0.003, + "loss": 4.2737, + "step": 1822 + }, + { + "epoch": 0.01823, + "grad_norm": 0.6437329965546797, + "learning_rate": 0.003, + "loss": 4.2861, + "step": 1823 + }, + { + "epoch": 0.01824, + "grad_norm": 0.6298055587951418, + "learning_rate": 0.003, + "loss": 4.2663, + "step": 1824 + }, + { + "epoch": 0.01825, + "grad_norm": 0.6105471946808345, + "learning_rate": 0.003, + "loss": 4.2706, + "step": 1825 + }, + { + "epoch": 0.01826, + "grad_norm": 0.5680736127986394, + "learning_rate": 0.003, + "loss": 4.2798, + "step": 1826 + }, + { + "epoch": 0.01827, + "grad_norm": 0.6203032072106797, + "learning_rate": 0.003, + "loss": 4.2493, + "step": 1827 + }, + { + "epoch": 0.01828, + "grad_norm": 0.5646483418870301, + "learning_rate": 0.003, + "loss": 4.2595, + "step": 1828 + }, + { + "epoch": 0.01829, + "grad_norm": 0.43449345506164244, + "learning_rate": 0.003, + "loss": 4.2724, + "step": 1829 + }, + { + "epoch": 0.0183, + "grad_norm": 0.4995474978675237, + "learning_rate": 0.003, + "loss": 4.268, + "step": 1830 + }, + { + "epoch": 0.01831, + "grad_norm": 0.6731917230470822, + "learning_rate": 0.003, + "loss": 4.2625, + "step": 1831 + }, + { + "epoch": 0.01832, + "grad_norm": 1.0735909763241713, + "learning_rate": 0.003, + "loss": 4.2749, + "step": 1832 + }, + { + "epoch": 0.01833, + "grad_norm": 0.9454987640162065, + "learning_rate": 0.003, + "loss": 4.3126, + "step": 1833 + }, + { + "epoch": 0.01834, + "grad_norm": 0.6279857981331142, + "learning_rate": 0.003, + "loss": 4.265, + "step": 1834 + }, + { + "epoch": 0.01835, + "grad_norm": 0.7795333868843384, + "learning_rate": 0.003, + "loss": 4.2708, + "step": 1835 + }, + { + "epoch": 0.01836, + "grad_norm": 0.7540135177559686, + "learning_rate": 0.003, + "loss": 4.269, + "step": 1836 + }, + { + "epoch": 0.01837, + "grad_norm": 0.4873694552023737, + "learning_rate": 0.003, + "loss": 4.2594, + "step": 1837 + }, + { + "epoch": 0.01838, + "grad_norm": 0.6475671331829288, + "learning_rate": 0.003, + "loss": 4.2679, + "step": 1838 + }, + { + "epoch": 0.01839, + "grad_norm": 0.5413345448794652, + "learning_rate": 0.003, + "loss": 4.2849, + "step": 1839 + }, + { + "epoch": 0.0184, + "grad_norm": 0.505876271981352, + "learning_rate": 0.003, + "loss": 4.2815, + "step": 1840 + }, + { + "epoch": 0.01841, + "grad_norm": 0.5559518639320817, + "learning_rate": 0.003, + "loss": 4.2211, + "step": 1841 + }, + { + "epoch": 0.01842, + "grad_norm": 0.565402658078461, + "learning_rate": 0.003, + "loss": 4.2563, + "step": 1842 + }, + { + "epoch": 0.01843, + "grad_norm": 0.5176031889881334, + "learning_rate": 0.003, + "loss": 4.2347, + "step": 1843 + }, + { + "epoch": 0.01844, + "grad_norm": 0.515476660965344, + "learning_rate": 0.003, + "loss": 4.2336, + "step": 1844 + }, + { + "epoch": 0.01845, + "grad_norm": 0.537738729997812, + "learning_rate": 0.003, + "loss": 4.2366, + "step": 1845 + }, + { + "epoch": 0.01846, + "grad_norm": 0.5825872331164712, + "learning_rate": 0.003, + "loss": 4.2629, + "step": 1846 + }, + { + "epoch": 0.01847, + "grad_norm": 0.6172426924095343, + "learning_rate": 0.003, + "loss": 4.261, + "step": 1847 + }, + { + "epoch": 0.01848, + "grad_norm": 0.5675520394460739, + "learning_rate": 0.003, + "loss": 4.2733, + "step": 1848 + }, + { + "epoch": 0.01849, + "grad_norm": 0.5804129655862207, + "learning_rate": 0.003, + "loss": 4.2733, + "step": 1849 + }, + { + "epoch": 0.0185, + "grad_norm": 0.5903509862738283, + "learning_rate": 0.003, + "loss": 4.2637, + "step": 1850 + }, + { + "epoch": 0.01851, + "grad_norm": 0.5789415751735032, + "learning_rate": 0.003, + "loss": 4.2453, + "step": 1851 + }, + { + "epoch": 0.01852, + "grad_norm": 0.6073114122094332, + "learning_rate": 0.003, + "loss": 4.249, + "step": 1852 + }, + { + "epoch": 0.01853, + "grad_norm": 0.6172059367670675, + "learning_rate": 0.003, + "loss": 4.2514, + "step": 1853 + }, + { + "epoch": 0.01854, + "grad_norm": 0.5952391168819953, + "learning_rate": 0.003, + "loss": 4.282, + "step": 1854 + }, + { + "epoch": 0.01855, + "grad_norm": 0.599929499071102, + "learning_rate": 0.003, + "loss": 4.2645, + "step": 1855 + }, + { + "epoch": 0.01856, + "grad_norm": 0.7854590983227943, + "learning_rate": 0.003, + "loss": 4.2582, + "step": 1856 + }, + { + "epoch": 0.01857, + "grad_norm": 0.9479235992175757, + "learning_rate": 0.003, + "loss": 4.2521, + "step": 1857 + }, + { + "epoch": 0.01858, + "grad_norm": 1.0547185212118042, + "learning_rate": 0.003, + "loss": 4.2782, + "step": 1858 + }, + { + "epoch": 0.01859, + "grad_norm": 0.7498301751007965, + "learning_rate": 0.003, + "loss": 4.259, + "step": 1859 + }, + { + "epoch": 0.0186, + "grad_norm": 0.6772314759733408, + "learning_rate": 0.003, + "loss": 4.238, + "step": 1860 + }, + { + "epoch": 0.01861, + "grad_norm": 0.7554457171283483, + "learning_rate": 0.003, + "loss": 4.2858, + "step": 1861 + }, + { + "epoch": 0.01862, + "grad_norm": 0.6601398192741809, + "learning_rate": 0.003, + "loss": 4.2482, + "step": 1862 + }, + { + "epoch": 0.01863, + "grad_norm": 0.5632517970928785, + "learning_rate": 0.003, + "loss": 4.2548, + "step": 1863 + }, + { + "epoch": 0.01864, + "grad_norm": 0.5518520472207163, + "learning_rate": 0.003, + "loss": 4.2848, + "step": 1864 + }, + { + "epoch": 0.01865, + "grad_norm": 0.5191605026436766, + "learning_rate": 0.003, + "loss": 4.2571, + "step": 1865 + }, + { + "epoch": 0.01866, + "grad_norm": 0.487977336215745, + "learning_rate": 0.003, + "loss": 4.2583, + "step": 1866 + }, + { + "epoch": 0.01867, + "grad_norm": 0.38111750510593084, + "learning_rate": 0.003, + "loss": 4.2288, + "step": 1867 + }, + { + "epoch": 0.01868, + "grad_norm": 0.36979378128361096, + "learning_rate": 0.003, + "loss": 4.2458, + "step": 1868 + }, + { + "epoch": 0.01869, + "grad_norm": 0.4056719991155462, + "learning_rate": 0.003, + "loss": 4.259, + "step": 1869 + }, + { + "epoch": 0.0187, + "grad_norm": 0.511550197815793, + "learning_rate": 0.003, + "loss": 4.2534, + "step": 1870 + }, + { + "epoch": 0.01871, + "grad_norm": 0.6908775527772675, + "learning_rate": 0.003, + "loss": 4.2322, + "step": 1871 + }, + { + "epoch": 0.01872, + "grad_norm": 0.8655201254185445, + "learning_rate": 0.003, + "loss": 4.2892, + "step": 1872 + }, + { + "epoch": 0.01873, + "grad_norm": 0.903028038452321, + "learning_rate": 0.003, + "loss": 4.2527, + "step": 1873 + }, + { + "epoch": 0.01874, + "grad_norm": 0.7655189256325052, + "learning_rate": 0.003, + "loss": 4.2534, + "step": 1874 + }, + { + "epoch": 0.01875, + "grad_norm": 0.6845228657861865, + "learning_rate": 0.003, + "loss": 4.2697, + "step": 1875 + }, + { + "epoch": 0.01876, + "grad_norm": 0.7967028641532641, + "learning_rate": 0.003, + "loss": 4.2796, + "step": 1876 + }, + { + "epoch": 0.01877, + "grad_norm": 0.7276633971950639, + "learning_rate": 0.003, + "loss": 4.2792, + "step": 1877 + }, + { + "epoch": 0.01878, + "grad_norm": 0.6745755254962531, + "learning_rate": 0.003, + "loss": 4.2647, + "step": 1878 + }, + { + "epoch": 0.01879, + "grad_norm": 0.6794948462625433, + "learning_rate": 0.003, + "loss": 4.2421, + "step": 1879 + }, + { + "epoch": 0.0188, + "grad_norm": 0.7496570961239313, + "learning_rate": 0.003, + "loss": 4.2581, + "step": 1880 + }, + { + "epoch": 0.01881, + "grad_norm": 0.7216177452798419, + "learning_rate": 0.003, + "loss": 4.2527, + "step": 1881 + }, + { + "epoch": 0.01882, + "grad_norm": 0.6411166720639763, + "learning_rate": 0.003, + "loss": 4.2764, + "step": 1882 + }, + { + "epoch": 0.01883, + "grad_norm": 0.4976595430995637, + "learning_rate": 0.003, + "loss": 4.2463, + "step": 1883 + }, + { + "epoch": 0.01884, + "grad_norm": 0.5806395175516162, + "learning_rate": 0.003, + "loss": 4.2463, + "step": 1884 + }, + { + "epoch": 0.01885, + "grad_norm": 0.5932999040186268, + "learning_rate": 0.003, + "loss": 4.2267, + "step": 1885 + }, + { + "epoch": 0.01886, + "grad_norm": 0.6243294916603833, + "learning_rate": 0.003, + "loss": 4.2637, + "step": 1886 + }, + { + "epoch": 0.01887, + "grad_norm": 0.6594473434297762, + "learning_rate": 0.003, + "loss": 4.2583, + "step": 1887 + }, + { + "epoch": 0.01888, + "grad_norm": 0.5833874852548948, + "learning_rate": 0.003, + "loss": 4.2402, + "step": 1888 + }, + { + "epoch": 0.01889, + "grad_norm": 0.5554783687186964, + "learning_rate": 0.003, + "loss": 4.2547, + "step": 1889 + }, + { + "epoch": 0.0189, + "grad_norm": 0.5267765258339018, + "learning_rate": 0.003, + "loss": 4.2329, + "step": 1890 + }, + { + "epoch": 0.01891, + "grad_norm": 0.5533738473996538, + "learning_rate": 0.003, + "loss": 4.2819, + "step": 1891 + }, + { + "epoch": 0.01892, + "grad_norm": 0.5233002886731548, + "learning_rate": 0.003, + "loss": 4.2568, + "step": 1892 + }, + { + "epoch": 0.01893, + "grad_norm": 0.598656523235247, + "learning_rate": 0.003, + "loss": 4.2415, + "step": 1893 + }, + { + "epoch": 0.01894, + "grad_norm": 0.6329032192123055, + "learning_rate": 0.003, + "loss": 4.2498, + "step": 1894 + }, + { + "epoch": 0.01895, + "grad_norm": 0.7448049826558554, + "learning_rate": 0.003, + "loss": 4.2404, + "step": 1895 + }, + { + "epoch": 0.01896, + "grad_norm": 0.7896808634621466, + "learning_rate": 0.003, + "loss": 4.2666, + "step": 1896 + }, + { + "epoch": 0.01897, + "grad_norm": 0.8670691506268475, + "learning_rate": 0.003, + "loss": 4.267, + "step": 1897 + }, + { + "epoch": 0.01898, + "grad_norm": 0.7564766735493696, + "learning_rate": 0.003, + "loss": 4.2371, + "step": 1898 + }, + { + "epoch": 0.01899, + "grad_norm": 0.6014591021089313, + "learning_rate": 0.003, + "loss": 4.246, + "step": 1899 + }, + { + "epoch": 0.019, + "grad_norm": 0.6487238549786433, + "learning_rate": 0.003, + "loss": 4.2427, + "step": 1900 + }, + { + "epoch": 0.01901, + "grad_norm": 0.7335622998657767, + "learning_rate": 0.003, + "loss": 4.2939, + "step": 1901 + }, + { + "epoch": 0.01902, + "grad_norm": 0.727489215463111, + "learning_rate": 0.003, + "loss": 4.2706, + "step": 1902 + }, + { + "epoch": 0.01903, + "grad_norm": 0.6444511274857245, + "learning_rate": 0.003, + "loss": 4.2217, + "step": 1903 + }, + { + "epoch": 0.01904, + "grad_norm": 0.5669107570524424, + "learning_rate": 0.003, + "loss": 4.2738, + "step": 1904 + }, + { + "epoch": 0.01905, + "grad_norm": 0.47567496733321213, + "learning_rate": 0.003, + "loss": 4.2471, + "step": 1905 + }, + { + "epoch": 0.01906, + "grad_norm": 0.568871434515019, + "learning_rate": 0.003, + "loss": 4.2665, + "step": 1906 + }, + { + "epoch": 0.01907, + "grad_norm": 0.6437840413516579, + "learning_rate": 0.003, + "loss": 4.2605, + "step": 1907 + }, + { + "epoch": 0.01908, + "grad_norm": 0.8102395542085408, + "learning_rate": 0.003, + "loss": 4.2806, + "step": 1908 + }, + { + "epoch": 0.01909, + "grad_norm": 0.9019612301377007, + "learning_rate": 0.003, + "loss": 4.2522, + "step": 1909 + }, + { + "epoch": 0.0191, + "grad_norm": 0.8060528165222425, + "learning_rate": 0.003, + "loss": 4.2682, + "step": 1910 + }, + { + "epoch": 0.01911, + "grad_norm": 0.6514562643306322, + "learning_rate": 0.003, + "loss": 4.2341, + "step": 1911 + }, + { + "epoch": 0.01912, + "grad_norm": 0.7512556786074466, + "learning_rate": 0.003, + "loss": 4.2481, + "step": 1912 + }, + { + "epoch": 0.01913, + "grad_norm": 0.7206930768558686, + "learning_rate": 0.003, + "loss": 4.2688, + "step": 1913 + }, + { + "epoch": 0.01914, + "grad_norm": 0.6802559754696879, + "learning_rate": 0.003, + "loss": 4.2815, + "step": 1914 + }, + { + "epoch": 0.01915, + "grad_norm": 0.6329709283705148, + "learning_rate": 0.003, + "loss": 4.2674, + "step": 1915 + }, + { + "epoch": 0.01916, + "grad_norm": 0.6539184714683666, + "learning_rate": 0.003, + "loss": 4.2462, + "step": 1916 + }, + { + "epoch": 0.01917, + "grad_norm": 0.710899415708157, + "learning_rate": 0.003, + "loss": 4.2674, + "step": 1917 + }, + { + "epoch": 0.01918, + "grad_norm": 0.7167946113405885, + "learning_rate": 0.003, + "loss": 4.2562, + "step": 1918 + }, + { + "epoch": 0.01919, + "grad_norm": 0.7533071012807427, + "learning_rate": 0.003, + "loss": 4.2594, + "step": 1919 + }, + { + "epoch": 0.0192, + "grad_norm": 0.6703396884694851, + "learning_rate": 0.003, + "loss": 4.2612, + "step": 1920 + }, + { + "epoch": 0.01921, + "grad_norm": 0.5834644623189628, + "learning_rate": 0.003, + "loss": 4.2441, + "step": 1921 + }, + { + "epoch": 0.01922, + "grad_norm": 0.6481277683352221, + "learning_rate": 0.003, + "loss": 4.2625, + "step": 1922 + }, + { + "epoch": 0.01923, + "grad_norm": 0.6044219599617415, + "learning_rate": 0.003, + "loss": 4.2661, + "step": 1923 + }, + { + "epoch": 0.01924, + "grad_norm": 0.5337323934986105, + "learning_rate": 0.003, + "loss": 4.2643, + "step": 1924 + }, + { + "epoch": 0.01925, + "grad_norm": 0.5129044588459396, + "learning_rate": 0.003, + "loss": 4.251, + "step": 1925 + }, + { + "epoch": 0.01926, + "grad_norm": 0.48201171188935904, + "learning_rate": 0.003, + "loss": 4.2563, + "step": 1926 + }, + { + "epoch": 0.01927, + "grad_norm": 0.4295757083815179, + "learning_rate": 0.003, + "loss": 4.2456, + "step": 1927 + }, + { + "epoch": 0.01928, + "grad_norm": 0.5221994128637466, + "learning_rate": 0.003, + "loss": 4.2241, + "step": 1928 + }, + { + "epoch": 0.01929, + "grad_norm": 0.6151786259894082, + "learning_rate": 0.003, + "loss": 4.246, + "step": 1929 + }, + { + "epoch": 0.0193, + "grad_norm": 0.717185888834206, + "learning_rate": 0.003, + "loss": 4.2399, + "step": 1930 + }, + { + "epoch": 0.01931, + "grad_norm": 0.865419932280181, + "learning_rate": 0.003, + "loss": 4.2485, + "step": 1931 + }, + { + "epoch": 0.01932, + "grad_norm": 1.015142018105296, + "learning_rate": 0.003, + "loss": 4.2679, + "step": 1932 + }, + { + "epoch": 0.01933, + "grad_norm": 1.0909531598947217, + "learning_rate": 0.003, + "loss": 4.2582, + "step": 1933 + }, + { + "epoch": 0.01934, + "grad_norm": 0.7995257446738059, + "learning_rate": 0.003, + "loss": 4.2554, + "step": 1934 + }, + { + "epoch": 0.01935, + "grad_norm": 0.6434815333532291, + "learning_rate": 0.003, + "loss": 4.2669, + "step": 1935 + }, + { + "epoch": 0.01936, + "grad_norm": 0.6569925784458591, + "learning_rate": 0.003, + "loss": 4.2489, + "step": 1936 + }, + { + "epoch": 0.01937, + "grad_norm": 0.5996128457789688, + "learning_rate": 0.003, + "loss": 4.2577, + "step": 1937 + }, + { + "epoch": 0.01938, + "grad_norm": 0.6329170311784792, + "learning_rate": 0.003, + "loss": 4.2615, + "step": 1938 + }, + { + "epoch": 0.01939, + "grad_norm": 0.6354665323417178, + "learning_rate": 0.003, + "loss": 4.2808, + "step": 1939 + }, + { + "epoch": 0.0194, + "grad_norm": 0.6008328437037136, + "learning_rate": 0.003, + "loss": 4.2581, + "step": 1940 + }, + { + "epoch": 0.01941, + "grad_norm": 0.5343827591468221, + "learning_rate": 0.003, + "loss": 4.2378, + "step": 1941 + }, + { + "epoch": 0.01942, + "grad_norm": 0.5512416106954112, + "learning_rate": 0.003, + "loss": 4.2611, + "step": 1942 + }, + { + "epoch": 0.01943, + "grad_norm": 0.5866960320410021, + "learning_rate": 0.003, + "loss": 4.2341, + "step": 1943 + }, + { + "epoch": 0.01944, + "grad_norm": 0.6240489288808161, + "learning_rate": 0.003, + "loss": 4.2751, + "step": 1944 + }, + { + "epoch": 0.01945, + "grad_norm": 0.6243606216006328, + "learning_rate": 0.003, + "loss": 4.2437, + "step": 1945 + }, + { + "epoch": 0.01946, + "grad_norm": 0.5880226151273782, + "learning_rate": 0.003, + "loss": 4.2379, + "step": 1946 + }, + { + "epoch": 0.01947, + "grad_norm": 0.5934159043950513, + "learning_rate": 0.003, + "loss": 4.2471, + "step": 1947 + }, + { + "epoch": 0.01948, + "grad_norm": 0.5378813319639308, + "learning_rate": 0.003, + "loss": 4.2659, + "step": 1948 + }, + { + "epoch": 0.01949, + "grad_norm": 0.5777955568924656, + "learning_rate": 0.003, + "loss": 4.2595, + "step": 1949 + }, + { + "epoch": 0.0195, + "grad_norm": 0.6588236649280669, + "learning_rate": 0.003, + "loss": 4.2493, + "step": 1950 + }, + { + "epoch": 0.01951, + "grad_norm": 0.6662863362190039, + "learning_rate": 0.003, + "loss": 4.2457, + "step": 1951 + }, + { + "epoch": 0.01952, + "grad_norm": 0.6956758803553106, + "learning_rate": 0.003, + "loss": 4.2563, + "step": 1952 + }, + { + "epoch": 0.01953, + "grad_norm": 0.7309670323591311, + "learning_rate": 0.003, + "loss": 4.2663, + "step": 1953 + }, + { + "epoch": 0.01954, + "grad_norm": 0.7102958436956044, + "learning_rate": 0.003, + "loss": 4.2656, + "step": 1954 + }, + { + "epoch": 0.01955, + "grad_norm": 0.748927101392194, + "learning_rate": 0.003, + "loss": 4.2596, + "step": 1955 + }, + { + "epoch": 0.01956, + "grad_norm": 0.6357959051502906, + "learning_rate": 0.003, + "loss": 4.2507, + "step": 1956 + }, + { + "epoch": 0.01957, + "grad_norm": 0.5776415679220375, + "learning_rate": 0.003, + "loss": 4.2172, + "step": 1957 + }, + { + "epoch": 0.01958, + "grad_norm": 0.5010308008271186, + "learning_rate": 0.003, + "loss": 4.2108, + "step": 1958 + }, + { + "epoch": 0.01959, + "grad_norm": 0.500419502823399, + "learning_rate": 0.003, + "loss": 4.2342, + "step": 1959 + }, + { + "epoch": 0.0196, + "grad_norm": 0.5017850552651191, + "learning_rate": 0.003, + "loss": 4.229, + "step": 1960 + }, + { + "epoch": 0.01961, + "grad_norm": 0.49043553705552145, + "learning_rate": 0.003, + "loss": 4.2267, + "step": 1961 + }, + { + "epoch": 0.01962, + "grad_norm": 0.49806058358446914, + "learning_rate": 0.003, + "loss": 4.2301, + "step": 1962 + }, + { + "epoch": 0.01963, + "grad_norm": 0.5386284819858854, + "learning_rate": 0.003, + "loss": 4.2201, + "step": 1963 + }, + { + "epoch": 0.01964, + "grad_norm": 0.6462166130085853, + "learning_rate": 0.003, + "loss": 4.2492, + "step": 1964 + }, + { + "epoch": 0.01965, + "grad_norm": 0.8617584950578717, + "learning_rate": 0.003, + "loss": 4.2512, + "step": 1965 + }, + { + "epoch": 0.01966, + "grad_norm": 1.064420589029421, + "learning_rate": 0.003, + "loss": 4.2624, + "step": 1966 + }, + { + "epoch": 0.01967, + "grad_norm": 0.8064456905773513, + "learning_rate": 0.003, + "loss": 4.2378, + "step": 1967 + }, + { + "epoch": 0.01968, + "grad_norm": 0.6120027249682956, + "learning_rate": 0.003, + "loss": 4.2516, + "step": 1968 + }, + { + "epoch": 0.01969, + "grad_norm": 0.7508122971760963, + "learning_rate": 0.003, + "loss": 4.2526, + "step": 1969 + }, + { + "epoch": 0.0197, + "grad_norm": 0.7405449826366749, + "learning_rate": 0.003, + "loss": 4.2514, + "step": 1970 + }, + { + "epoch": 0.01971, + "grad_norm": 0.7056278637839583, + "learning_rate": 0.003, + "loss": 4.2377, + "step": 1971 + }, + { + "epoch": 0.01972, + "grad_norm": 0.7105683738164339, + "learning_rate": 0.003, + "loss": 4.2407, + "step": 1972 + }, + { + "epoch": 0.01973, + "grad_norm": 0.7508003521914575, + "learning_rate": 0.003, + "loss": 4.2264, + "step": 1973 + }, + { + "epoch": 0.01974, + "grad_norm": 0.7061088132196116, + "learning_rate": 0.003, + "loss": 4.2641, + "step": 1974 + }, + { + "epoch": 0.01975, + "grad_norm": 0.6140071978685933, + "learning_rate": 0.003, + "loss": 4.2539, + "step": 1975 + }, + { + "epoch": 0.01976, + "grad_norm": 0.6214170582947839, + "learning_rate": 0.003, + "loss": 4.243, + "step": 1976 + }, + { + "epoch": 0.01977, + "grad_norm": 0.6032091701282252, + "learning_rate": 0.003, + "loss": 4.2759, + "step": 1977 + }, + { + "epoch": 0.01978, + "grad_norm": 0.6890579485446906, + "learning_rate": 0.003, + "loss": 4.2782, + "step": 1978 + }, + { + "epoch": 0.01979, + "grad_norm": 0.7078287139411981, + "learning_rate": 0.003, + "loss": 4.269, + "step": 1979 + }, + { + "epoch": 0.0198, + "grad_norm": 0.726915367409026, + "learning_rate": 0.003, + "loss": 4.2503, + "step": 1980 + }, + { + "epoch": 0.01981, + "grad_norm": 0.7277727772925232, + "learning_rate": 0.003, + "loss": 4.2394, + "step": 1981 + }, + { + "epoch": 0.01982, + "grad_norm": 0.6618110470758236, + "learning_rate": 0.003, + "loss": 4.262, + "step": 1982 + }, + { + "epoch": 0.01983, + "grad_norm": 0.6282900232413579, + "learning_rate": 0.003, + "loss": 4.2497, + "step": 1983 + }, + { + "epoch": 0.01984, + "grad_norm": 0.6651788332059134, + "learning_rate": 0.003, + "loss": 4.2635, + "step": 1984 + }, + { + "epoch": 0.01985, + "grad_norm": 0.6195263920647736, + "learning_rate": 0.003, + "loss": 4.2484, + "step": 1985 + }, + { + "epoch": 0.01986, + "grad_norm": 0.6562857171469267, + "learning_rate": 0.003, + "loss": 4.2792, + "step": 1986 + }, + { + "epoch": 0.01987, + "grad_norm": 0.6274806569391715, + "learning_rate": 0.003, + "loss": 4.2331, + "step": 1987 + }, + { + "epoch": 0.01988, + "grad_norm": 0.6627977107268054, + "learning_rate": 0.003, + "loss": 4.2466, + "step": 1988 + }, + { + "epoch": 0.01989, + "grad_norm": 0.6286439849240877, + "learning_rate": 0.003, + "loss": 4.2344, + "step": 1989 + }, + { + "epoch": 0.0199, + "grad_norm": 0.5786457654364954, + "learning_rate": 0.003, + "loss": 4.246, + "step": 1990 + }, + { + "epoch": 0.01991, + "grad_norm": 0.5167049375276446, + "learning_rate": 0.003, + "loss": 4.2238, + "step": 1991 + }, + { + "epoch": 0.01992, + "grad_norm": 0.50421095170271, + "learning_rate": 0.003, + "loss": 4.261, + "step": 1992 + }, + { + "epoch": 0.01993, + "grad_norm": 0.4699320041008658, + "learning_rate": 0.003, + "loss": 4.2447, + "step": 1993 + }, + { + "epoch": 0.01994, + "grad_norm": 0.5186369446769569, + "learning_rate": 0.003, + "loss": 4.2629, + "step": 1994 + }, + { + "epoch": 0.01995, + "grad_norm": 0.5867398105885127, + "learning_rate": 0.003, + "loss": 4.2547, + "step": 1995 + }, + { + "epoch": 0.01996, + "grad_norm": 0.8031924386202848, + "learning_rate": 0.003, + "loss": 4.2326, + "step": 1996 + }, + { + "epoch": 0.01997, + "grad_norm": 0.9756949043315064, + "learning_rate": 0.003, + "loss": 4.2746, + "step": 1997 + }, + { + "epoch": 0.01998, + "grad_norm": 0.8340147112264455, + "learning_rate": 0.003, + "loss": 4.2546, + "step": 1998 + }, + { + "epoch": 0.01999, + "grad_norm": 0.6652469206537822, + "learning_rate": 0.003, + "loss": 4.2393, + "step": 1999 + }, + { + "epoch": 0.02, + "grad_norm": 0.7695645348180246, + "learning_rate": 0.003, + "loss": 4.2318, + "step": 2000 + } + ], + "logging_steps": 1, + "max_steps": 100000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.9287285710848e+16, + "train_batch_size": 1024, + "trial_name": null, + "trial_params": null +}