diff --git "a/checkpoint-1316/trainer_state.json" "b/checkpoint-1316/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1316/trainer_state.json" @@ -0,0 +1,9253 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9993355481727575, + "eval_steps": 500, + "global_step": 1316, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007593735168485999, + "grad_norm": 4.0, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.9271, + "step": 1 + }, + { + "epoch": 0.0015187470336971997, + "grad_norm": 3.359375, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.6784, + "step": 2 + }, + { + "epoch": 0.0022781205505457997, + "grad_norm": 3.1875, + "learning_rate": 6.000000000000001e-07, + "loss": 1.7025, + "step": 3 + }, + { + "epoch": 0.0030374940673943995, + "grad_norm": 3.640625, + "learning_rate": 8.000000000000001e-07, + "loss": 1.8366, + "step": 4 + }, + { + "epoch": 0.0037968675842429997, + "grad_norm": 3.859375, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.8837, + "step": 5 + }, + { + "epoch": 0.0045562411010915994, + "grad_norm": 3.546875, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.7767, + "step": 6 + }, + { + "epoch": 0.0053156146179402, + "grad_norm": 3.21875, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.6786, + "step": 7 + }, + { + "epoch": 0.006074988134788799, + "grad_norm": 3.96875, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.873, + "step": 8 + }, + { + "epoch": 0.006834361651637399, + "grad_norm": 3.703125, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.8584, + "step": 9 + }, + { + "epoch": 0.007593735168485999, + "grad_norm": 3.40625, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.7817, + "step": 10 + }, + { + "epoch": 0.0083531086853346, + "grad_norm": 3.25, + "learning_rate": 2.2e-06, + "loss": 1.7579, + "step": 11 + }, + { + "epoch": 0.009112482202183199, + "grad_norm": 3.953125, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.9496, + "step": 12 + }, + { + "epoch": 0.009871855719031798, + "grad_norm": 3.515625, + "learning_rate": 2.6e-06, + "loss": 1.7234, + "step": 13 + }, + { + "epoch": 0.0106312292358804, + "grad_norm": 4.375, + "learning_rate": 2.8000000000000003e-06, + "loss": 2.003, + "step": 14 + }, + { + "epoch": 0.011390602752728999, + "grad_norm": 4.125, + "learning_rate": 3e-06, + "loss": 1.8606, + "step": 15 + }, + { + "epoch": 0.012149976269577598, + "grad_norm": 3.71875, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.8039, + "step": 16 + }, + { + "epoch": 0.012909349786426199, + "grad_norm": 3.96875, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.936, + "step": 17 + }, + { + "epoch": 0.013668723303274798, + "grad_norm": 3.453125, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.7465, + "step": 18 + }, + { + "epoch": 0.014428096820123398, + "grad_norm": 3.5625, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.659, + "step": 19 + }, + { + "epoch": 0.015187470336971999, + "grad_norm": 3.59375, + "learning_rate": 4.000000000000001e-06, + "loss": 1.7962, + "step": 20 + }, + { + "epoch": 0.015946843853820596, + "grad_norm": 3.375, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.6802, + "step": 21 + }, + { + "epoch": 0.0167062173706692, + "grad_norm": 3.875, + "learning_rate": 4.4e-06, + "loss": 1.8444, + "step": 22 + }, + { + "epoch": 0.0174655908875178, + "grad_norm": 3.53125, + "learning_rate": 4.600000000000001e-06, + "loss": 1.7685, + 
"step": 23 + }, + { + "epoch": 0.018224964404366398, + "grad_norm": 3.421875, + "learning_rate": 4.800000000000001e-06, + "loss": 1.6768, + "step": 24 + }, + { + "epoch": 0.018984337921214997, + "grad_norm": 3.6875, + "learning_rate": 5e-06, + "loss": 1.8115, + "step": 25 + }, + { + "epoch": 0.019743711438063596, + "grad_norm": 3.671875, + "learning_rate": 5.2e-06, + "loss": 1.7844, + "step": 26 + }, + { + "epoch": 0.020503084954912196, + "grad_norm": 3.75, + "learning_rate": 5.400000000000001e-06, + "loss": 1.8313, + "step": 27 + }, + { + "epoch": 0.0212624584717608, + "grad_norm": 3.953125, + "learning_rate": 5.600000000000001e-06, + "loss": 1.8675, + "step": 28 + }, + { + "epoch": 0.022021831988609398, + "grad_norm": 3.71875, + "learning_rate": 5.8e-06, + "loss": 1.7895, + "step": 29 + }, + { + "epoch": 0.022781205505457997, + "grad_norm": 4.03125, + "learning_rate": 6e-06, + "loss": 1.8702, + "step": 30 + }, + { + "epoch": 0.023540579022306597, + "grad_norm": 3.640625, + "learning_rate": 6.200000000000001e-06, + "loss": 1.6666, + "step": 31 + }, + { + "epoch": 0.024299952539155196, + "grad_norm": 4.125, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.9699, + "step": 32 + }, + { + "epoch": 0.025059326056003795, + "grad_norm": 3.578125, + "learning_rate": 6.600000000000001e-06, + "loss": 1.6828, + "step": 33 + }, + { + "epoch": 0.025818699572852398, + "grad_norm": 3.65625, + "learning_rate": 6.800000000000001e-06, + "loss": 1.8098, + "step": 34 + }, + { + "epoch": 0.026578073089700997, + "grad_norm": 3.484375, + "learning_rate": 7e-06, + "loss": 1.6943, + "step": 35 + }, + { + "epoch": 0.027337446606549597, + "grad_norm": 3.40625, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.6835, + "step": 36 + }, + { + "epoch": 0.028096820123398196, + "grad_norm": 3.53125, + "learning_rate": 7.4e-06, + "loss": 1.7776, + "step": 37 + }, + { + "epoch": 0.028856193640246795, + "grad_norm": 3.828125, + "learning_rate": 7.600000000000001e-06, + "loss": 1.8554, + "step": 38 + }, + { + "epoch": 0.029615567157095395, + "grad_norm": 4.28125, + "learning_rate": 7.800000000000002e-06, + "loss": 2.0373, + "step": 39 + }, + { + "epoch": 0.030374940673943997, + "grad_norm": 3.4375, + "learning_rate": 8.000000000000001e-06, + "loss": 1.7848, + "step": 40 + }, + { + "epoch": 0.031134314190792597, + "grad_norm": 3.71875, + "learning_rate": 8.2e-06, + "loss": 1.8114, + "step": 41 + }, + { + "epoch": 0.03189368770764119, + "grad_norm": 4.21875, + "learning_rate": 8.400000000000001e-06, + "loss": 1.9938, + "step": 42 + }, + { + "epoch": 0.0326530612244898, + "grad_norm": 3.515625, + "learning_rate": 8.6e-06, + "loss": 1.6792, + "step": 43 + }, + { + "epoch": 0.0334124347413384, + "grad_norm": 3.6875, + "learning_rate": 8.8e-06, + "loss": 1.8027, + "step": 44 + }, + { + "epoch": 0.034171808258187, + "grad_norm": 4.0625, + "learning_rate": 9e-06, + "loss": 1.8655, + "step": 45 + }, + { + "epoch": 0.0349311817750356, + "grad_norm": 3.75, + "learning_rate": 9.200000000000002e-06, + "loss": 1.8402, + "step": 46 + }, + { + "epoch": 0.035690555291884196, + "grad_norm": 3.75, + "learning_rate": 9.4e-06, + "loss": 1.8507, + "step": 47 + }, + { + "epoch": 0.036449928808732796, + "grad_norm": 3.515625, + "learning_rate": 9.600000000000001e-06, + "loss": 1.7811, + "step": 48 + }, + { + "epoch": 0.037209302325581395, + "grad_norm": 3.796875, + "learning_rate": 9.800000000000001e-06, + "loss": 1.873, + "step": 49 + }, + { + "epoch": 0.037968675842429994, + "grad_norm": 3.40625, + "learning_rate": 1e-05, + "loss": 
1.7869, + "step": 50 + }, + { + "epoch": 0.038728049359278593, + "grad_norm": 3.890625, + "learning_rate": 1.02e-05, + "loss": 1.847, + "step": 51 + }, + { + "epoch": 0.03948742287612719, + "grad_norm": 4.0, + "learning_rate": 1.04e-05, + "loss": 1.9245, + "step": 52 + }, + { + "epoch": 0.04024679639297579, + "grad_norm": 4.09375, + "learning_rate": 1.0600000000000002e-05, + "loss": 2.0196, + "step": 53 + }, + { + "epoch": 0.04100616990982439, + "grad_norm": 3.609375, + "learning_rate": 1.0800000000000002e-05, + "loss": 1.7158, + "step": 54 + }, + { + "epoch": 0.041765543426673, + "grad_norm": 3.453125, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.7212, + "step": 55 + }, + { + "epoch": 0.0425249169435216, + "grad_norm": 4.0625, + "learning_rate": 1.1200000000000001e-05, + "loss": 1.9365, + "step": 56 + }, + { + "epoch": 0.043284290460370196, + "grad_norm": 3.65625, + "learning_rate": 1.14e-05, + "loss": 1.8307, + "step": 57 + }, + { + "epoch": 0.044043663977218796, + "grad_norm": 3.296875, + "learning_rate": 1.16e-05, + "loss": 1.6974, + "step": 58 + }, + { + "epoch": 0.044803037494067395, + "grad_norm": 3.328125, + "learning_rate": 1.18e-05, + "loss": 1.6723, + "step": 59 + }, + { + "epoch": 0.045562411010915994, + "grad_norm": 3.25, + "learning_rate": 1.2e-05, + "loss": 1.6466, + "step": 60 + }, + { + "epoch": 0.046321784527764594, + "grad_norm": 3.59375, + "learning_rate": 1.22e-05, + "loss": 1.7542, + "step": 61 + }, + { + "epoch": 0.04708115804461319, + "grad_norm": 3.546875, + "learning_rate": 1.2400000000000002e-05, + "loss": 1.8449, + "step": 62 + }, + { + "epoch": 0.04784053156146179, + "grad_norm": 3.140625, + "learning_rate": 1.2600000000000001e-05, + "loss": 1.6788, + "step": 63 + }, + { + "epoch": 0.04859990507831039, + "grad_norm": 3.421875, + "learning_rate": 1.2800000000000001e-05, + "loss": 1.8164, + "step": 64 + }, + { + "epoch": 0.04935927859515899, + "grad_norm": 3.359375, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.7674, + "step": 65 + }, + { + "epoch": 0.05011865211200759, + "grad_norm": 3.296875, + "learning_rate": 1.3200000000000002e-05, + "loss": 1.711, + "step": 66 + }, + { + "epoch": 0.0508780256288562, + "grad_norm": 3.109375, + "learning_rate": 1.3400000000000002e-05, + "loss": 1.6482, + "step": 67 + }, + { + "epoch": 0.051637399145704796, + "grad_norm": 3.1875, + "learning_rate": 1.3600000000000002e-05, + "loss": 1.6734, + "step": 68 + }, + { + "epoch": 0.052396772662553395, + "grad_norm": 3.546875, + "learning_rate": 1.38e-05, + "loss": 1.793, + "step": 69 + }, + { + "epoch": 0.053156146179401995, + "grad_norm": 3.65625, + "learning_rate": 1.4e-05, + "loss": 1.8647, + "step": 70 + }, + { + "epoch": 0.053915519696250594, + "grad_norm": 3.53125, + "learning_rate": 1.4200000000000001e-05, + "loss": 1.8249, + "step": 71 + }, + { + "epoch": 0.05467489321309919, + "grad_norm": 3.796875, + "learning_rate": 1.4400000000000001e-05, + "loss": 1.88, + "step": 72 + }, + { + "epoch": 0.05543426672994779, + "grad_norm": 3.984375, + "learning_rate": 1.46e-05, + "loss": 1.9618, + "step": 73 + }, + { + "epoch": 0.05619364024679639, + "grad_norm": 3.46875, + "learning_rate": 1.48e-05, + "loss": 1.8017, + "step": 74 + }, + { + "epoch": 0.05695301376364499, + "grad_norm": 3.40625, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.6894, + "step": 75 + }, + { + "epoch": 0.05771238728049359, + "grad_norm": 3.375, + "learning_rate": 1.5200000000000002e-05, + "loss": 1.7385, + "step": 76 + }, + { + "epoch": 0.05847176079734219, + "grad_norm": 3.21875, + 
"learning_rate": 1.54e-05, + "loss": 1.631, + "step": 77 + }, + { + "epoch": 0.05923113431419079, + "grad_norm": 3.421875, + "learning_rate": 1.5600000000000003e-05, + "loss": 1.7345, + "step": 78 + }, + { + "epoch": 0.059990507831039395, + "grad_norm": 3.25, + "learning_rate": 1.58e-05, + "loss": 1.6198, + "step": 79 + }, + { + "epoch": 0.060749881347887995, + "grad_norm": 3.703125, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.7695, + "step": 80 + }, + { + "epoch": 0.061509254864736594, + "grad_norm": 3.546875, + "learning_rate": 1.62e-05, + "loss": 1.7828, + "step": 81 + }, + { + "epoch": 0.062268628381585193, + "grad_norm": 4.0625, + "learning_rate": 1.64e-05, + "loss": 1.8845, + "step": 82 + }, + { + "epoch": 0.06302800189843379, + "grad_norm": 3.1875, + "learning_rate": 1.66e-05, + "loss": 1.698, + "step": 83 + }, + { + "epoch": 0.06378737541528239, + "grad_norm": 3.28125, + "learning_rate": 1.6800000000000002e-05, + "loss": 1.6712, + "step": 84 + }, + { + "epoch": 0.064546748932131, + "grad_norm": 3.375, + "learning_rate": 1.7e-05, + "loss": 1.7673, + "step": 85 + }, + { + "epoch": 0.0653061224489796, + "grad_norm": 3.609375, + "learning_rate": 1.72e-05, + "loss": 1.8005, + "step": 86 + }, + { + "epoch": 0.0660654959658282, + "grad_norm": 3.5, + "learning_rate": 1.7400000000000003e-05, + "loss": 1.7674, + "step": 87 + }, + { + "epoch": 0.0668248694826768, + "grad_norm": 3.484375, + "learning_rate": 1.76e-05, + "loss": 1.7665, + "step": 88 + }, + { + "epoch": 0.0675842429995254, + "grad_norm": 3.578125, + "learning_rate": 1.7800000000000002e-05, + "loss": 1.7815, + "step": 89 + }, + { + "epoch": 0.068343616516374, + "grad_norm": 3.25, + "learning_rate": 1.8e-05, + "loss": 1.6855, + "step": 90 + }, + { + "epoch": 0.0691029900332226, + "grad_norm": 3.1875, + "learning_rate": 1.8200000000000002e-05, + "loss": 1.6461, + "step": 91 + }, + { + "epoch": 0.0698623635500712, + "grad_norm": 3.09375, + "learning_rate": 1.8400000000000003e-05, + "loss": 1.6388, + "step": 92 + }, + { + "epoch": 0.07062173706691979, + "grad_norm": 3.46875, + "learning_rate": 1.86e-05, + "loss": 1.7505, + "step": 93 + }, + { + "epoch": 0.07138111058376839, + "grad_norm": 2.84375, + "learning_rate": 1.88e-05, + "loss": 1.6048, + "step": 94 + }, + { + "epoch": 0.07214048410061699, + "grad_norm": 3.34375, + "learning_rate": 1.9e-05, + "loss": 1.6765, + "step": 95 + }, + { + "epoch": 0.07289985761746559, + "grad_norm": 3.171875, + "learning_rate": 1.9200000000000003e-05, + "loss": 1.634, + "step": 96 + }, + { + "epoch": 0.07365923113431419, + "grad_norm": 3.0625, + "learning_rate": 1.94e-05, + "loss": 1.6586, + "step": 97 + }, + { + "epoch": 0.07441860465116279, + "grad_norm": 3.140625, + "learning_rate": 1.9600000000000002e-05, + "loss": 1.6871, + "step": 98 + }, + { + "epoch": 0.07517797816801139, + "grad_norm": 3.703125, + "learning_rate": 1.98e-05, + "loss": 1.7815, + "step": 99 + }, + { + "epoch": 0.07593735168485999, + "grad_norm": 3.671875, + "learning_rate": 2e-05, + "loss": 1.8349, + "step": 100 + }, + { + "epoch": 0.07669672520170859, + "grad_norm": 3.625, + "learning_rate": 1.9999966626453647e-05, + "loss": 1.8303, + "step": 101 + }, + { + "epoch": 0.07745609871855719, + "grad_norm": 2.921875, + "learning_rate": 1.9999866506037346e-05, + "loss": 1.5889, + "step": 102 + }, + { + "epoch": 0.07821547223540579, + "grad_norm": 2.984375, + "learning_rate": 1.9999699639419373e-05, + "loss": 1.5841, + "step": 103 + }, + { + "epoch": 0.07897484575225439, + "grad_norm": 2.96875, + "learning_rate": 
1.999946602771351e-05, + "loss": 1.6492, + "step": 104 + }, + { + "epoch": 0.07973421926910298, + "grad_norm": 3.203125, + "learning_rate": 1.999916567247905e-05, + "loss": 1.6682, + "step": 105 + }, + { + "epoch": 0.08049359278595158, + "grad_norm": 2.6875, + "learning_rate": 1.9998798575720776e-05, + "loss": 1.522, + "step": 106 + }, + { + "epoch": 0.08125296630280018, + "grad_norm": 3.171875, + "learning_rate": 1.9998364739888954e-05, + "loss": 1.6903, + "step": 107 + }, + { + "epoch": 0.08201233981964878, + "grad_norm": 2.765625, + "learning_rate": 1.9997864167879313e-05, + "loss": 1.5823, + "step": 108 + }, + { + "epoch": 0.0827717133364974, + "grad_norm": 2.953125, + "learning_rate": 1.9997296863033018e-05, + "loss": 1.6105, + "step": 109 + }, + { + "epoch": 0.083531086853346, + "grad_norm": 2.90625, + "learning_rate": 1.9996662829136676e-05, + "loss": 1.5877, + "step": 110 + }, + { + "epoch": 0.0842904603701946, + "grad_norm": 3.515625, + "learning_rate": 1.999596207042227e-05, + "loss": 1.7453, + "step": 111 + }, + { + "epoch": 0.0850498338870432, + "grad_norm": 3.3125, + "learning_rate": 1.999519459156716e-05, + "loss": 1.7015, + "step": 112 + }, + { + "epoch": 0.0858092074038918, + "grad_norm": 3.09375, + "learning_rate": 1.999436039769405e-05, + "loss": 1.6773, + "step": 113 + }, + { + "epoch": 0.08656858092074039, + "grad_norm": 2.84375, + "learning_rate": 1.9993459494370938e-05, + "loss": 1.6287, + "step": 114 + }, + { + "epoch": 0.08732795443758899, + "grad_norm": 3.40625, + "learning_rate": 1.9992491887611095e-05, + "loss": 1.7393, + "step": 115 + }, + { + "epoch": 0.08808732795443759, + "grad_norm": 4.03125, + "learning_rate": 1.999145758387301e-05, + "loss": 1.9157, + "step": 116 + }, + { + "epoch": 0.08884670147128619, + "grad_norm": 2.734375, + "learning_rate": 1.9990356590060363e-05, + "loss": 1.6195, + "step": 117 + }, + { + "epoch": 0.08960607498813479, + "grad_norm": 2.71875, + "learning_rate": 1.998918891352197e-05, + "loss": 1.6428, + "step": 118 + }, + { + "epoch": 0.09036544850498339, + "grad_norm": 3.1875, + "learning_rate": 1.9987954562051724e-05, + "loss": 1.6772, + "step": 119 + }, + { + "epoch": 0.09112482202183199, + "grad_norm": 2.84375, + "learning_rate": 1.998665354388857e-05, + "loss": 1.5625, + "step": 120 + }, + { + "epoch": 0.09188419553868059, + "grad_norm": 2.984375, + "learning_rate": 1.9985285867716423e-05, + "loss": 1.6915, + "step": 121 + }, + { + "epoch": 0.09264356905552919, + "grad_norm": 2.828125, + "learning_rate": 1.9983851542664125e-05, + "loss": 1.6413, + "step": 122 + }, + { + "epoch": 0.09340294257237779, + "grad_norm": 2.65625, + "learning_rate": 1.998235057830538e-05, + "loss": 1.5844, + "step": 123 + }, + { + "epoch": 0.09416231608922639, + "grad_norm": 2.59375, + "learning_rate": 1.9980782984658682e-05, + "loss": 1.561, + "step": 124 + }, + { + "epoch": 0.09492168960607499, + "grad_norm": 2.921875, + "learning_rate": 1.997914877218727e-05, + "loss": 1.6305, + "step": 125 + }, + { + "epoch": 0.09568106312292358, + "grad_norm": 2.25, + "learning_rate": 1.9977447951799035e-05, + "loss": 1.4409, + "step": 126 + }, + { + "epoch": 0.09644043663977218, + "grad_norm": 2.484375, + "learning_rate": 1.9975680534846457e-05, + "loss": 1.5723, + "step": 127 + }, + { + "epoch": 0.09719981015662078, + "grad_norm": 3.453125, + "learning_rate": 1.9973846533126533e-05, + "loss": 1.7338, + "step": 128 + }, + { + "epoch": 0.09795918367346938, + "grad_norm": 2.703125, + "learning_rate": 1.997194595888069e-05, + "loss": 1.6383, + "step": 129 + }, + { + 
"epoch": 0.09871855719031798, + "grad_norm": 2.4375, + "learning_rate": 1.996997882479471e-05, + "loss": 1.5887, + "step": 130 + }, + { + "epoch": 0.09947793070716658, + "grad_norm": 2.4375, + "learning_rate": 1.9967945143998636e-05, + "loss": 1.5525, + "step": 131 + }, + { + "epoch": 0.10023730422401518, + "grad_norm": 2.359375, + "learning_rate": 1.99658449300667e-05, + "loss": 1.4995, + "step": 132 + }, + { + "epoch": 0.1009966777408638, + "grad_norm": 2.140625, + "learning_rate": 1.996367819701722e-05, + "loss": 1.5085, + "step": 133 + }, + { + "epoch": 0.1017560512577124, + "grad_norm": 2.46875, + "learning_rate": 1.996144495931251e-05, + "loss": 1.5708, + "step": 134 + }, + { + "epoch": 0.10251542477456099, + "grad_norm": 2.71875, + "learning_rate": 1.995914523185878e-05, + "loss": 1.623, + "step": 135 + }, + { + "epoch": 0.10327479829140959, + "grad_norm": 2.1875, + "learning_rate": 1.9956779030006038e-05, + "loss": 1.5378, + "step": 136 + }, + { + "epoch": 0.10403417180825819, + "grad_norm": 2.5, + "learning_rate": 1.9954346369548002e-05, + "loss": 1.5672, + "step": 137 + }, + { + "epoch": 0.10479354532510679, + "grad_norm": 2.078125, + "learning_rate": 1.995184726672197e-05, + "loss": 1.5316, + "step": 138 + }, + { + "epoch": 0.10555291884195539, + "grad_norm": 2.25, + "learning_rate": 1.994928173820873e-05, + "loss": 1.5776, + "step": 139 + }, + { + "epoch": 0.10631229235880399, + "grad_norm": 2.34375, + "learning_rate": 1.994664980113243e-05, + "loss": 1.6079, + "step": 140 + }, + { + "epoch": 0.10707166587565259, + "grad_norm": 2.296875, + "learning_rate": 1.9943951473060488e-05, + "loss": 1.5903, + "step": 141 + }, + { + "epoch": 0.10783103939250119, + "grad_norm": 2.53125, + "learning_rate": 1.9941186772003463e-05, + "loss": 1.6456, + "step": 142 + }, + { + "epoch": 0.10859041290934979, + "grad_norm": 2.171875, + "learning_rate": 1.9938355716414933e-05, + "loss": 1.5053, + "step": 143 + }, + { + "epoch": 0.10934978642619839, + "grad_norm": 2.09375, + "learning_rate": 1.9935458325191365e-05, + "loss": 1.5925, + "step": 144 + }, + { + "epoch": 0.11010915994304699, + "grad_norm": 2.171875, + "learning_rate": 1.9932494617672007e-05, + "loss": 1.6033, + "step": 145 + }, + { + "epoch": 0.11086853345989559, + "grad_norm": 2.046875, + "learning_rate": 1.992946461363874e-05, + "loss": 1.553, + "step": 146 + }, + { + "epoch": 0.11162790697674418, + "grad_norm": 2.21875, + "learning_rate": 1.9926368333315964e-05, + "loss": 1.5962, + "step": 147 + }, + { + "epoch": 0.11238728049359278, + "grad_norm": 2.203125, + "learning_rate": 1.992320579737045e-05, + "loss": 1.6159, + "step": 148 + }, + { + "epoch": 0.11314665401044138, + "grad_norm": 2.0625, + "learning_rate": 1.991997702691121e-05, + "loss": 1.4709, + "step": 149 + }, + { + "epoch": 0.11390602752728998, + "grad_norm": 2.375, + "learning_rate": 1.9916682043489337e-05, + "loss": 1.6076, + "step": 150 + }, + { + "epoch": 0.11466540104413858, + "grad_norm": 1.984375, + "learning_rate": 1.9913320869097897e-05, + "loss": 1.4864, + "step": 151 + }, + { + "epoch": 0.11542477456098718, + "grad_norm": 1.7734375, + "learning_rate": 1.9909893526171745e-05, + "loss": 1.4559, + "step": 152 + }, + { + "epoch": 0.11618414807783578, + "grad_norm": 1.9921875, + "learning_rate": 1.990640003758741e-05, + "loss": 1.5585, + "step": 153 + }, + { + "epoch": 0.11694352159468438, + "grad_norm": 1.796875, + "learning_rate": 1.9902840426662897e-05, + "loss": 1.4656, + "step": 154 + }, + { + "epoch": 0.11770289511153298, + "grad_norm": 1.9296875, + 
"learning_rate": 1.9899214717157588e-05, + "loss": 1.5357, + "step": 155 + }, + { + "epoch": 0.11846226862838158, + "grad_norm": 1.9296875, + "learning_rate": 1.9895522933272028e-05, + "loss": 1.5101, + "step": 156 + }, + { + "epoch": 0.11922164214523019, + "grad_norm": 1.9140625, + "learning_rate": 1.989176509964781e-05, + "loss": 1.5287, + "step": 157 + }, + { + "epoch": 0.11998101566207879, + "grad_norm": 1.9765625, + "learning_rate": 1.988794124136738e-05, + "loss": 1.6104, + "step": 158 + }, + { + "epoch": 0.12074038917892739, + "grad_norm": 1.9296875, + "learning_rate": 1.9884051383953876e-05, + "loss": 1.5313, + "step": 159 + }, + { + "epoch": 0.12149976269577599, + "grad_norm": 1.703125, + "learning_rate": 1.9880095553370967e-05, + "loss": 1.4602, + "step": 160 + }, + { + "epoch": 0.12225913621262459, + "grad_norm": 1.59375, + "learning_rate": 1.9876073776022676e-05, + "loss": 1.4071, + "step": 161 + }, + { + "epoch": 0.12301850972947319, + "grad_norm": 1.6953125, + "learning_rate": 1.987198607875319e-05, + "loss": 1.4707, + "step": 162 + }, + { + "epoch": 0.12377788324632179, + "grad_norm": 1.8125, + "learning_rate": 1.9867832488846702e-05, + "loss": 1.4729, + "step": 163 + }, + { + "epoch": 0.12453725676317039, + "grad_norm": 1.6328125, + "learning_rate": 1.9863613034027224e-05, + "loss": 1.4967, + "step": 164 + }, + { + "epoch": 0.12529663028001897, + "grad_norm": 1.6171875, + "learning_rate": 1.9859327742458387e-05, + "loss": 1.4463, + "step": 165 + }, + { + "epoch": 0.12605600379686757, + "grad_norm": 1.65625, + "learning_rate": 1.985497664274326e-05, + "loss": 1.4763, + "step": 166 + }, + { + "epoch": 0.12681537731371617, + "grad_norm": 1.6953125, + "learning_rate": 1.9850559763924176e-05, + "loss": 1.5175, + "step": 167 + }, + { + "epoch": 0.12757475083056477, + "grad_norm": 1.609375, + "learning_rate": 1.9846077135482513e-05, + "loss": 1.4363, + "step": 168 + }, + { + "epoch": 0.1283341243474134, + "grad_norm": 1.4609375, + "learning_rate": 1.9841528787338513e-05, + "loss": 1.3922, + "step": 169 + }, + { + "epoch": 0.129093497864262, + "grad_norm": 1.578125, + "learning_rate": 1.983691474985108e-05, + "loss": 1.4937, + "step": 170 + }, + { + "epoch": 0.1298528713811106, + "grad_norm": 1.421875, + "learning_rate": 1.983223505381757e-05, + "loss": 1.4381, + "step": 171 + }, + { + "epoch": 0.1306122448979592, + "grad_norm": 1.6484375, + "learning_rate": 1.9827489730473597e-05, + "loss": 1.5019, + "step": 172 + }, + { + "epoch": 0.1313716184148078, + "grad_norm": 1.4140625, + "learning_rate": 1.982267881149281e-05, + "loss": 1.3798, + "step": 173 + }, + { + "epoch": 0.1321309919316564, + "grad_norm": 1.6796875, + "learning_rate": 1.9817802328986696e-05, + "loss": 1.5623, + "step": 174 + }, + { + "epoch": 0.132890365448505, + "grad_norm": 1.5390625, + "learning_rate": 1.9812860315504362e-05, + "loss": 1.4497, + "step": 175 + }, + { + "epoch": 0.1336497389653536, + "grad_norm": 1.4921875, + "learning_rate": 1.9807852804032306e-05, + "loss": 1.4442, + "step": 176 + }, + { + "epoch": 0.1344091124822022, + "grad_norm": 1.734375, + "learning_rate": 1.9802779827994214e-05, + "loss": 1.5552, + "step": 177 + }, + { + "epoch": 0.1351684859990508, + "grad_norm": 1.421875, + "learning_rate": 1.9797641421250725e-05, + "loss": 1.4411, + "step": 178 + }, + { + "epoch": 0.1359278595158994, + "grad_norm": 1.3515625, + "learning_rate": 1.9792437618099215e-05, + "loss": 1.4569, + "step": 179 + }, + { + "epoch": 0.136687233032748, + "grad_norm": 1.40625, + "learning_rate": 1.9787168453273546e-05, 
+ "loss": 1.4257, + "step": 180 + }, + { + "epoch": 0.1374466065495966, + "grad_norm": 1.359375, + "learning_rate": 1.9781833961943874e-05, + "loss": 1.417, + "step": 181 + }, + { + "epoch": 0.1382059800664452, + "grad_norm": 1.5625, + "learning_rate": 1.9776434179716365e-05, + "loss": 1.4831, + "step": 182 + }, + { + "epoch": 0.1389653535832938, + "grad_norm": 1.265625, + "learning_rate": 1.977096914263301e-05, + "loss": 1.3927, + "step": 183 + }, + { + "epoch": 0.1397247271001424, + "grad_norm": 1.3671875, + "learning_rate": 1.9765438887171327e-05, + "loss": 1.431, + "step": 184 + }, + { + "epoch": 0.140484100616991, + "grad_norm": 1.4609375, + "learning_rate": 1.975984345024418e-05, + "loss": 1.4798, + "step": 185 + }, + { + "epoch": 0.14124347413383959, + "grad_norm": 1.484375, + "learning_rate": 1.975418286919947e-05, + "loss": 1.4939, + "step": 186 + }, + { + "epoch": 0.14200284765068819, + "grad_norm": 1.390625, + "learning_rate": 1.9748457181819937e-05, + "loss": 1.4784, + "step": 187 + }, + { + "epoch": 0.14276222116753678, + "grad_norm": 1.2421875, + "learning_rate": 1.9742666426322877e-05, + "loss": 1.3947, + "step": 188 + }, + { + "epoch": 0.14352159468438538, + "grad_norm": 1.2109375, + "learning_rate": 1.97368106413599e-05, + "loss": 1.3783, + "step": 189 + }, + { + "epoch": 0.14428096820123398, + "grad_norm": 1.0859375, + "learning_rate": 1.9730889866016668e-05, + "loss": 1.3301, + "step": 190 + }, + { + "epoch": 0.14504034171808258, + "grad_norm": 1.1953125, + "learning_rate": 1.9724904139812636e-05, + "loss": 1.4403, + "step": 191 + }, + { + "epoch": 0.14579971523493118, + "grad_norm": 1.1484375, + "learning_rate": 1.9718853502700783e-05, + "loss": 1.4301, + "step": 192 + }, + { + "epoch": 0.14655908875177978, + "grad_norm": 1.0859375, + "learning_rate": 1.9712737995067357e-05, + "loss": 1.3473, + "step": 193 + }, + { + "epoch": 0.14731846226862838, + "grad_norm": 1.078125, + "learning_rate": 1.970655765773159e-05, + "loss": 1.3557, + "step": 194 + }, + { + "epoch": 0.14807783578547698, + "grad_norm": 1.0703125, + "learning_rate": 1.9700312531945444e-05, + "loss": 1.3979, + "step": 195 + }, + { + "epoch": 0.14883720930232558, + "grad_norm": 1.375, + "learning_rate": 1.9694002659393306e-05, + "loss": 1.5305, + "step": 196 + }, + { + "epoch": 0.14959658281917418, + "grad_norm": 1.1875, + "learning_rate": 1.9687628082191748e-05, + "loss": 1.5078, + "step": 197 + }, + { + "epoch": 0.15035595633602278, + "grad_norm": 1.265625, + "learning_rate": 1.9681188842889222e-05, + "loss": 1.4817, + "step": 198 + }, + { + "epoch": 0.15111532985287138, + "grad_norm": 1.0234375, + "learning_rate": 1.9674684984465774e-05, + "loss": 1.3599, + "step": 199 + }, + { + "epoch": 0.15187470336971998, + "grad_norm": 1.0625, + "learning_rate": 1.966811655033277e-05, + "loss": 1.384, + "step": 200 + }, + { + "epoch": 0.15263407688656858, + "grad_norm": 1.3515625, + "learning_rate": 1.9661483584332592e-05, + "loss": 1.514, + "step": 201 + }, + { + "epoch": 0.15339345040341718, + "grad_norm": 1.1328125, + "learning_rate": 1.9654786130738372e-05, + "loss": 1.3908, + "step": 202 + }, + { + "epoch": 0.15415282392026577, + "grad_norm": 1.015625, + "learning_rate": 1.9648024234253654e-05, + "loss": 1.336, + "step": 203 + }, + { + "epoch": 0.15491219743711437, + "grad_norm": 1.0078125, + "learning_rate": 1.9641197940012136e-05, + "loss": 1.3723, + "step": 204 + }, + { + "epoch": 0.15567157095396297, + "grad_norm": 1.015625, + "learning_rate": 1.963430729357735e-05, + "loss": 1.3784, + "step": 205 + }, + { + 
"epoch": 0.15643094447081157, + "grad_norm": 0.9375, + "learning_rate": 1.9627352340942355e-05, + "loss": 1.3541, + "step": 206 + }, + { + "epoch": 0.15719031798766017, + "grad_norm": 1.0546875, + "learning_rate": 1.9620333128529436e-05, + "loss": 1.3969, + "step": 207 + }, + { + "epoch": 0.15794969150450877, + "grad_norm": 1.1953125, + "learning_rate": 1.96132497031898e-05, + "loss": 1.4611, + "step": 208 + }, + { + "epoch": 0.15870906502135737, + "grad_norm": 0.96484375, + "learning_rate": 1.9606102112203243e-05, + "loss": 1.3631, + "step": 209 + }, + { + "epoch": 0.15946843853820597, + "grad_norm": 0.9375, + "learning_rate": 1.9598890403277867e-05, + "loss": 1.3605, + "step": 210 + }, + { + "epoch": 0.16022781205505457, + "grad_norm": 1.078125, + "learning_rate": 1.9591614624549724e-05, + "loss": 1.4721, + "step": 211 + }, + { + "epoch": 0.16098718557190317, + "grad_norm": 1.0234375, + "learning_rate": 1.958427482458253e-05, + "loss": 1.429, + "step": 212 + }, + { + "epoch": 0.16174655908875177, + "grad_norm": 0.94921875, + "learning_rate": 1.9576871052367307e-05, + "loss": 1.3866, + "step": 213 + }, + { + "epoch": 0.16250593260560037, + "grad_norm": 0.9140625, + "learning_rate": 1.956940335732209e-05, + "loss": 1.4103, + "step": 214 + }, + { + "epoch": 0.16326530612244897, + "grad_norm": 0.953125, + "learning_rate": 1.956187178929157e-05, + "loss": 1.3547, + "step": 215 + }, + { + "epoch": 0.16402467963929757, + "grad_norm": 1.0078125, + "learning_rate": 1.9554276398546767e-05, + "loss": 1.4262, + "step": 216 + }, + { + "epoch": 0.1647840531561462, + "grad_norm": 0.89453125, + "learning_rate": 1.9546617235784716e-05, + "loss": 1.3589, + "step": 217 + }, + { + "epoch": 0.1655434266729948, + "grad_norm": 0.91796875, + "learning_rate": 1.95388943521281e-05, + "loss": 1.3694, + "step": 218 + }, + { + "epoch": 0.1663028001898434, + "grad_norm": 0.859375, + "learning_rate": 1.953110779912492e-05, + "loss": 1.3515, + "step": 219 + }, + { + "epoch": 0.167062173706692, + "grad_norm": 0.96875, + "learning_rate": 1.9523257628748148e-05, + "loss": 1.419, + "step": 220 + }, + { + "epoch": 0.1678215472235406, + "grad_norm": 0.8515625, + "learning_rate": 1.9515343893395394e-05, + "loss": 1.3665, + "step": 221 + }, + { + "epoch": 0.1685809207403892, + "grad_norm": 0.7734375, + "learning_rate": 1.9507366645888544e-05, + "loss": 1.3448, + "step": 222 + }, + { + "epoch": 0.1693402942572378, + "grad_norm": 0.7578125, + "learning_rate": 1.9499325939473403e-05, + "loss": 1.3186, + "step": 223 + }, + { + "epoch": 0.1700996677740864, + "grad_norm": 0.6875, + "learning_rate": 1.9491221827819348e-05, + "loss": 1.2722, + "step": 224 + }, + { + "epoch": 0.170859041290935, + "grad_norm": 0.9375, + "learning_rate": 1.948305436501897e-05, + "loss": 1.4339, + "step": 225 + }, + { + "epoch": 0.1716184148077836, + "grad_norm": 0.796875, + "learning_rate": 1.9474823605587705e-05, + "loss": 1.3838, + "step": 226 + }, + { + "epoch": 0.1723777883246322, + "grad_norm": 0.98828125, + "learning_rate": 1.9466529604463484e-05, + "loss": 1.4411, + "step": 227 + }, + { + "epoch": 0.17313716184148079, + "grad_norm": 0.78515625, + "learning_rate": 1.9458172417006347e-05, + "loss": 1.3107, + "step": 228 + }, + { + "epoch": 0.17389653535832938, + "grad_norm": 0.86328125, + "learning_rate": 1.9449752098998097e-05, + "loss": 1.4422, + "step": 229 + }, + { + "epoch": 0.17465590887517798, + "grad_norm": 0.80078125, + "learning_rate": 1.9441268706641907e-05, + "loss": 1.3728, + "step": 230 + }, + { + "epoch": 0.17541528239202658, + 
"grad_norm": 0.9453125, + "learning_rate": 1.9432722296561954e-05, + "loss": 1.4489, + "step": 231 + }, + { + "epoch": 0.17617465590887518, + "grad_norm": 0.78125, + "learning_rate": 1.942411292580304e-05, + "loss": 1.3594, + "step": 232 + }, + { + "epoch": 0.17693402942572378, + "grad_norm": 0.7421875, + "learning_rate": 1.941544065183021e-05, + "loss": 1.3176, + "step": 233 + }, + { + "epoch": 0.17769340294257238, + "grad_norm": 0.71875, + "learning_rate": 1.9406705532528373e-05, + "loss": 1.3331, + "step": 234 + }, + { + "epoch": 0.17845277645942098, + "grad_norm": 0.73828125, + "learning_rate": 1.9397907626201915e-05, + "loss": 1.3217, + "step": 235 + }, + { + "epoch": 0.17921214997626958, + "grad_norm": 0.7578125, + "learning_rate": 1.9389046991574298e-05, + "loss": 1.3825, + "step": 236 + }, + { + "epoch": 0.17997152349311818, + "grad_norm": 0.78515625, + "learning_rate": 1.938012368778768e-05, + "loss": 1.3604, + "step": 237 + }, + { + "epoch": 0.18073089700996678, + "grad_norm": 0.6875, + "learning_rate": 1.9371137774402528e-05, + "loss": 1.3345, + "step": 238 + }, + { + "epoch": 0.18149027052681538, + "grad_norm": 0.87890625, + "learning_rate": 1.9362089311397194e-05, + "loss": 1.417, + "step": 239 + }, + { + "epoch": 0.18224964404366398, + "grad_norm": 0.63671875, + "learning_rate": 1.935297835916754e-05, + "loss": 1.2646, + "step": 240 + }, + { + "epoch": 0.18300901756051258, + "grad_norm": 0.67578125, + "learning_rate": 1.9343804978526525e-05, + "loss": 1.3089, + "step": 241 + }, + { + "epoch": 0.18376839107736118, + "grad_norm": 0.6328125, + "learning_rate": 1.9334569230703794e-05, + "loss": 1.2812, + "step": 242 + }, + { + "epoch": 0.18452776459420978, + "grad_norm": 0.76171875, + "learning_rate": 1.9325271177345284e-05, + "loss": 1.3355, + "step": 243 + }, + { + "epoch": 0.18528713811105837, + "grad_norm": 0.6484375, + "learning_rate": 1.9315910880512792e-05, + "loss": 1.3089, + "step": 244 + }, + { + "epoch": 0.18604651162790697, + "grad_norm": 0.703125, + "learning_rate": 1.9306488402683582e-05, + "loss": 1.3573, + "step": 245 + }, + { + "epoch": 0.18680588514475557, + "grad_norm": 0.62890625, + "learning_rate": 1.929700380674995e-05, + "loss": 1.2955, + "step": 246 + }, + { + "epoch": 0.18756525866160417, + "grad_norm": 0.6015625, + "learning_rate": 1.9287457156018824e-05, + "loss": 1.2819, + "step": 247 + }, + { + "epoch": 0.18832463217845277, + "grad_norm": 0.60546875, + "learning_rate": 1.927784851421132e-05, + "loss": 1.2677, + "step": 248 + }, + { + "epoch": 0.18908400569530137, + "grad_norm": 0.75, + "learning_rate": 1.926817794546232e-05, + "loss": 1.3524, + "step": 249 + }, + { + "epoch": 0.18984337921214997, + "grad_norm": 0.859375, + "learning_rate": 1.9258445514320064e-05, + "loss": 1.4673, + "step": 250 + }, + { + "epoch": 0.19060275272899857, + "grad_norm": 0.70703125, + "learning_rate": 1.9248651285745708e-05, + "loss": 1.3484, + "step": 251 + }, + { + "epoch": 0.19136212624584717, + "grad_norm": 0.7109375, + "learning_rate": 1.9238795325112867e-05, + "loss": 1.3565, + "step": 252 + }, + { + "epoch": 0.19212149976269577, + "grad_norm": 0.625, + "learning_rate": 1.9228877698207227e-05, + "loss": 1.3004, + "step": 253 + }, + { + "epoch": 0.19288087327954437, + "grad_norm": 0.66796875, + "learning_rate": 1.921889847122605e-05, + "loss": 1.3457, + "step": 254 + }, + { + "epoch": 0.19364024679639297, + "grad_norm": 0.69921875, + "learning_rate": 1.9208857710777785e-05, + "loss": 1.314, + "step": 255 + }, + { + "epoch": 0.19439962031324157, + "grad_norm": 
0.8046875, + "learning_rate": 1.9198755483881585e-05, + "loss": 1.4202, + "step": 256 + }, + { + "epoch": 0.19515899383009017, + "grad_norm": 0.59375, + "learning_rate": 1.9188591857966875e-05, + "loss": 1.3255, + "step": 257 + }, + { + "epoch": 0.19591836734693877, + "grad_norm": 0.828125, + "learning_rate": 1.917836690087291e-05, + "loss": 1.4397, + "step": 258 + }, + { + "epoch": 0.19667774086378736, + "grad_norm": 0.640625, + "learning_rate": 1.91680806808483e-05, + "loss": 1.3296, + "step": 259 + }, + { + "epoch": 0.19743711438063596, + "grad_norm": 0.5859375, + "learning_rate": 1.9157733266550577e-05, + "loss": 1.2916, + "step": 260 + }, + { + "epoch": 0.19819648789748456, + "grad_norm": 0.64453125, + "learning_rate": 1.914732472704572e-05, + "loss": 1.3308, + "step": 261 + }, + { + "epoch": 0.19895586141433316, + "grad_norm": 0.6484375, + "learning_rate": 1.9136855131807705e-05, + "loss": 1.3426, + "step": 262 + }, + { + "epoch": 0.19971523493118176, + "grad_norm": 0.5390625, + "learning_rate": 1.9126324550718036e-05, + "loss": 1.2745, + "step": 263 + }, + { + "epoch": 0.20047460844803036, + "grad_norm": 0.6015625, + "learning_rate": 1.911573305406528e-05, + "loss": 1.3073, + "step": 264 + }, + { + "epoch": 0.201233981964879, + "grad_norm": 0.578125, + "learning_rate": 1.9105080712544603e-05, + "loss": 1.2674, + "step": 265 + }, + { + "epoch": 0.2019933554817276, + "grad_norm": 0.53515625, + "learning_rate": 1.909436759725728e-05, + "loss": 1.3087, + "step": 266 + }, + { + "epoch": 0.2027527289985762, + "grad_norm": 0.56640625, + "learning_rate": 1.908359377971025e-05, + "loss": 1.284, + "step": 267 + }, + { + "epoch": 0.2035121025154248, + "grad_norm": 0.5625, + "learning_rate": 1.9072759331815602e-05, + "loss": 1.2451, + "step": 268 + }, + { + "epoch": 0.20427147603227339, + "grad_norm": 0.5703125, + "learning_rate": 1.9061864325890132e-05, + "loss": 1.2624, + "step": 269 + }, + { + "epoch": 0.20503084954912199, + "grad_norm": 0.515625, + "learning_rate": 1.9050908834654834e-05, + "loss": 1.2392, + "step": 270 + }, + { + "epoch": 0.20579022306597058, + "grad_norm": 0.546875, + "learning_rate": 1.9039892931234434e-05, + "loss": 1.2405, + "step": 271 + }, + { + "epoch": 0.20654959658281918, + "grad_norm": 0.5625, + "learning_rate": 1.902881668915688e-05, + "loss": 1.2509, + "step": 272 + }, + { + "epoch": 0.20730897009966778, + "grad_norm": 0.5625, + "learning_rate": 1.9017680182352866e-05, + "loss": 1.3047, + "step": 273 + }, + { + "epoch": 0.20806834361651638, + "grad_norm": 0.68359375, + "learning_rate": 1.9006483485155338e-05, + "loss": 1.3492, + "step": 274 + }, + { + "epoch": 0.20882771713336498, + "grad_norm": 0.54296875, + "learning_rate": 1.8995226672298993e-05, + "loss": 1.2451, + "step": 275 + }, + { + "epoch": 0.20958709065021358, + "grad_norm": 0.62890625, + "learning_rate": 1.898390981891979e-05, + "loss": 1.3577, + "step": 276 + }, + { + "epoch": 0.21034646416706218, + "grad_norm": 0.58984375, + "learning_rate": 1.897253300055443e-05, + "loss": 1.3152, + "step": 277 + }, + { + "epoch": 0.21110583768391078, + "grad_norm": 0.58203125, + "learning_rate": 1.896109629313987e-05, + "loss": 1.3153, + "step": 278 + }, + { + "epoch": 0.21186521120075938, + "grad_norm": 0.60546875, + "learning_rate": 1.8949599773012808e-05, + "loss": 1.3153, + "step": 279 + }, + { + "epoch": 0.21262458471760798, + "grad_norm": 0.578125, + "learning_rate": 1.8938043516909173e-05, + "loss": 1.2932, + "step": 280 + }, + { + "epoch": 0.21338395823445658, + "grad_norm": 0.50390625, + 
"learning_rate": 1.892642760196361e-05, + "loss": 1.2294, + "step": 281 + }, + { + "epoch": 0.21414333175130518, + "grad_norm": 0.64453125, + "learning_rate": 1.891475210570898e-05, + "loss": 1.3246, + "step": 282 + }, + { + "epoch": 0.21490270526815378, + "grad_norm": 0.51953125, + "learning_rate": 1.890301710607582e-05, + "loss": 1.2312, + "step": 283 + }, + { + "epoch": 0.21566207878500238, + "grad_norm": 0.4609375, + "learning_rate": 1.8891222681391853e-05, + "loss": 1.2243, + "step": 284 + }, + { + "epoch": 0.21642145230185098, + "grad_norm": 0.5234375, + "learning_rate": 1.8879368910381423e-05, + "loss": 1.2593, + "step": 285 + }, + { + "epoch": 0.21718082581869957, + "grad_norm": 0.640625, + "learning_rate": 1.8867455872165006e-05, + "loss": 1.3375, + "step": 286 + }, + { + "epoch": 0.21794019933554817, + "grad_norm": 0.51171875, + "learning_rate": 1.8855483646258677e-05, + "loss": 1.2492, + "step": 287 + }, + { + "epoch": 0.21869957285239677, + "grad_norm": 0.61328125, + "learning_rate": 1.8843452312573557e-05, + "loss": 1.3306, + "step": 288 + }, + { + "epoch": 0.21945894636924537, + "grad_norm": 0.5546875, + "learning_rate": 1.8831361951415298e-05, + "loss": 1.2743, + "step": 289 + }, + { + "epoch": 0.22021831988609397, + "grad_norm": 0.6875, + "learning_rate": 1.881921264348355e-05, + "loss": 1.3699, + "step": 290 + }, + { + "epoch": 0.22097769340294257, + "grad_norm": 0.63671875, + "learning_rate": 1.880700446987141e-05, + "loss": 1.3548, + "step": 291 + }, + { + "epoch": 0.22173706691979117, + "grad_norm": 0.671875, + "learning_rate": 1.879473751206489e-05, + "loss": 1.3974, + "step": 292 + }, + { + "epoch": 0.22249644043663977, + "grad_norm": 0.5625, + "learning_rate": 1.8782411851942365e-05, + "loss": 1.29, + "step": 293 + }, + { + "epoch": 0.22325581395348837, + "grad_norm": 0.53125, + "learning_rate": 1.877002757177403e-05, + "loss": 1.2906, + "step": 294 + }, + { + "epoch": 0.22401518747033697, + "grad_norm": 0.462890625, + "learning_rate": 1.8757584754221363e-05, + "loss": 1.2135, + "step": 295 + }, + { + "epoch": 0.22477456098718557, + "grad_norm": 0.52734375, + "learning_rate": 1.8745083482336547e-05, + "loss": 1.3045, + "step": 296 + }, + { + "epoch": 0.22553393450403417, + "grad_norm": 0.5703125, + "learning_rate": 1.8732523839561934e-05, + "loss": 1.2641, + "step": 297 + }, + { + "epoch": 0.22629330802088277, + "grad_norm": 0.4921875, + "learning_rate": 1.8719905909729493e-05, + "loss": 1.2492, + "step": 298 + }, + { + "epoch": 0.22705268153773137, + "grad_norm": 0.5078125, + "learning_rate": 1.8707229777060242e-05, + "loss": 1.2867, + "step": 299 + }, + { + "epoch": 0.22781205505457996, + "grad_norm": 0.5390625, + "learning_rate": 1.869449552616367e-05, + "loss": 1.2946, + "step": 300 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.53125, + "learning_rate": 1.8681703242037208e-05, + "loss": 1.3014, + "step": 301 + }, + { + "epoch": 0.22933080208827716, + "grad_norm": 0.470703125, + "learning_rate": 1.8668853010065633e-05, + "loss": 1.2937, + "step": 302 + }, + { + "epoch": 0.23009017560512576, + "grad_norm": 0.57421875, + "learning_rate": 1.86559449160205e-05, + "loss": 1.2866, + "step": 303 + }, + { + "epoch": 0.23084954912197436, + "grad_norm": 0.51953125, + "learning_rate": 1.8642979046059595e-05, + "loss": 1.2542, + "step": 304 + }, + { + "epoch": 0.23160892263882296, + "grad_norm": 0.5078125, + "learning_rate": 1.8629955486726324e-05, + "loss": 1.2718, + "step": 305 + }, + { + "epoch": 0.23236829615567156, + "grad_norm": 0.5234375, + "learning_rate": 
1.861687432494916e-05, + "loss": 1.2645, + "step": 306 + }, + { + "epoch": 0.23312766967252016, + "grad_norm": 0.54296875, + "learning_rate": 1.8603735648041054e-05, + "loss": 1.2895, + "step": 307 + }, + { + "epoch": 0.23388704318936876, + "grad_norm": 0.578125, + "learning_rate": 1.8590539543698852e-05, + "loss": 1.322, + "step": 308 + }, + { + "epoch": 0.23464641670621736, + "grad_norm": 0.49609375, + "learning_rate": 1.8577286100002723e-05, + "loss": 1.2584, + "step": 309 + }, + { + "epoch": 0.23540579022306596, + "grad_norm": 0.52734375, + "learning_rate": 1.856397540541554e-05, + "loss": 1.2814, + "step": 310 + }, + { + "epoch": 0.23616516373991456, + "grad_norm": 0.52734375, + "learning_rate": 1.855060754878233e-05, + "loss": 1.2865, + "step": 311 + }, + { + "epoch": 0.23692453725676316, + "grad_norm": 0.466796875, + "learning_rate": 1.853718261932964e-05, + "loss": 1.2597, + "step": 312 + }, + { + "epoch": 0.23768391077361178, + "grad_norm": 0.5078125, + "learning_rate": 1.852370070666498e-05, + "loss": 1.2556, + "step": 313 + }, + { + "epoch": 0.23844328429046038, + "grad_norm": 0.55078125, + "learning_rate": 1.8510161900776186e-05, + "loss": 1.304, + "step": 314 + }, + { + "epoch": 0.23920265780730898, + "grad_norm": 0.439453125, + "learning_rate": 1.8496566292030864e-05, + "loss": 1.2148, + "step": 315 + }, + { + "epoch": 0.23996203132415758, + "grad_norm": 0.515625, + "learning_rate": 1.8482913971175737e-05, + "loss": 1.2887, + "step": 316 + }, + { + "epoch": 0.24072140484100618, + "grad_norm": 0.57421875, + "learning_rate": 1.846920502933609e-05, + "loss": 1.3276, + "step": 317 + }, + { + "epoch": 0.24148077835785478, + "grad_norm": 0.474609375, + "learning_rate": 1.8455439558015117e-05, + "loss": 1.2681, + "step": 318 + }, + { + "epoch": 0.24224015187470338, + "grad_norm": 0.55078125, + "learning_rate": 1.8441617649093334e-05, + "loss": 1.2898, + "step": 319 + }, + { + "epoch": 0.24299952539155198, + "grad_norm": 0.5234375, + "learning_rate": 1.8427739394827976e-05, + "loss": 1.2785, + "step": 320 + }, + { + "epoch": 0.24375889890840058, + "grad_norm": 0.4765625, + "learning_rate": 1.8413804887852343e-05, + "loss": 1.1799, + "step": 321 + }, + { + "epoch": 0.24451827242524918, + "grad_norm": 0.45703125, + "learning_rate": 1.839981422117523e-05, + "loss": 1.1951, + "step": 322 + }, + { + "epoch": 0.24527764594209778, + "grad_norm": 0.546875, + "learning_rate": 1.8385767488180255e-05, + "loss": 1.3233, + "step": 323 + }, + { + "epoch": 0.24603701945894638, + "grad_norm": 0.451171875, + "learning_rate": 1.8371664782625287e-05, + "loss": 1.2204, + "step": 324 + }, + { + "epoch": 0.24679639297579498, + "grad_norm": 0.478515625, + "learning_rate": 1.8357506198641784e-05, + "loss": 1.2763, + "step": 325 + }, + { + "epoch": 0.24755576649264358, + "grad_norm": 0.578125, + "learning_rate": 1.8343291830734176e-05, + "loss": 1.3397, + "step": 326 + }, + { + "epoch": 0.24831514000949217, + "grad_norm": 0.52734375, + "learning_rate": 1.8329021773779242e-05, + "loss": 1.3029, + "step": 327 + }, + { + "epoch": 0.24907451352634077, + "grad_norm": 0.54296875, + "learning_rate": 1.8314696123025456e-05, + "loss": 1.2977, + "step": 328 + }, + { + "epoch": 0.24983388704318937, + "grad_norm": 0.5078125, + "learning_rate": 1.8300314974092372e-05, + "loss": 1.2915, + "step": 329 + }, + { + "epoch": 0.25059326056003794, + "grad_norm": 0.478515625, + "learning_rate": 1.8285878422969982e-05, + "loss": 1.2278, + "step": 330 + }, + { + "epoch": 0.25135263407688657, + "grad_norm": 0.421875, + 
"learning_rate": 1.827138656601807e-05, + "loss": 1.2337, + "step": 331 + }, + { + "epoch": 0.25211200759373514, + "grad_norm": 0.52734375, + "learning_rate": 1.825683949996556e-05, + "loss": 1.2978, + "step": 332 + }, + { + "epoch": 0.25287138111058377, + "grad_norm": 0.5390625, + "learning_rate": 1.8242237321909895e-05, + "loss": 1.2512, + "step": 333 + }, + { + "epoch": 0.25363075462743234, + "grad_norm": 0.49609375, + "learning_rate": 1.8227580129316368e-05, + "loss": 1.2702, + "step": 334 + }, + { + "epoch": 0.25439012814428097, + "grad_norm": 0.4609375, + "learning_rate": 1.821286802001747e-05, + "loss": 1.2253, + "step": 335 + }, + { + "epoch": 0.25514950166112954, + "grad_norm": 0.5390625, + "learning_rate": 1.819810109221227e-05, + "loss": 1.2708, + "step": 336 + }, + { + "epoch": 0.25590887517797817, + "grad_norm": 0.44140625, + "learning_rate": 1.81832794444657e-05, + "loss": 1.2157, + "step": 337 + }, + { + "epoch": 0.2566682486948268, + "grad_norm": 0.5703125, + "learning_rate": 1.8168403175707958e-05, + "loss": 1.3529, + "step": 338 + }, + { + "epoch": 0.25742762221167537, + "grad_norm": 0.390625, + "learning_rate": 1.815347238523381e-05, + "loss": 1.1796, + "step": 339 + }, + { + "epoch": 0.258186995728524, + "grad_norm": 0.466796875, + "learning_rate": 1.813848717270195e-05, + "loss": 1.2568, + "step": 340 + }, + { + "epoch": 0.25894636924537257, + "grad_norm": 0.490234375, + "learning_rate": 1.812344763813431e-05, + "loss": 1.2732, + "step": 341 + }, + { + "epoch": 0.2597057427622212, + "grad_norm": 0.44140625, + "learning_rate": 1.8108353881915403e-05, + "loss": 1.2737, + "step": 342 + }, + { + "epoch": 0.26046511627906976, + "grad_norm": 0.45703125, + "learning_rate": 1.8093206004791673e-05, + "loss": 1.2281, + "step": 343 + }, + { + "epoch": 0.2612244897959184, + "grad_norm": 0.546875, + "learning_rate": 1.8078004107870797e-05, + "loss": 1.3148, + "step": 344 + }, + { + "epoch": 0.26198386331276696, + "grad_norm": 0.44921875, + "learning_rate": 1.806274829262101e-05, + "loss": 1.2584, + "step": 345 + }, + { + "epoch": 0.2627432368296156, + "grad_norm": 0.5078125, + "learning_rate": 1.8047438660870447e-05, + "loss": 1.2665, + "step": 346 + }, + { + "epoch": 0.26350261034646416, + "grad_norm": 0.51171875, + "learning_rate": 1.803207531480645e-05, + "loss": 1.2892, + "step": 347 + }, + { + "epoch": 0.2642619838633128, + "grad_norm": 0.51953125, + "learning_rate": 1.8016658356974885e-05, + "loss": 1.2782, + "step": 348 + }, + { + "epoch": 0.26502135738016136, + "grad_norm": 0.46484375, + "learning_rate": 1.800118789027947e-05, + "loss": 1.2857, + "step": 349 + }, + { + "epoch": 0.26578073089701, + "grad_norm": 0.455078125, + "learning_rate": 1.798566401798106e-05, + "loss": 1.2529, + "step": 350 + }, + { + "epoch": 0.26654010441385856, + "grad_norm": 0.466796875, + "learning_rate": 1.7970086843697e-05, + "loss": 1.2445, + "step": 351 + }, + { + "epoch": 0.2672994779307072, + "grad_norm": 0.439453125, + "learning_rate": 1.7954456471400393e-05, + "loss": 1.2143, + "step": 352 + }, + { + "epoch": 0.26805885144755576, + "grad_norm": 0.421875, + "learning_rate": 1.793877300541944e-05, + "loss": 1.2444, + "step": 353 + }, + { + "epoch": 0.2688182249644044, + "grad_norm": 0.474609375, + "learning_rate": 1.7923036550436706e-05, + "loss": 1.2674, + "step": 354 + }, + { + "epoch": 0.26957759848125296, + "grad_norm": 0.5, + "learning_rate": 1.7907247211488456e-05, + "loss": 1.2926, + "step": 355 + }, + { + "epoch": 0.2703369719981016, + "grad_norm": 0.439453125, + "learning_rate": 
1.789140509396394e-05, + "loss": 1.2125, + "step": 356 + }, + { + "epoch": 0.27109634551495015, + "grad_norm": 0.443359375, + "learning_rate": 1.7875510303604678e-05, + "loss": 1.1936, + "step": 357 + }, + { + "epoch": 0.2718557190317988, + "grad_norm": 0.462890625, + "learning_rate": 1.7859562946503787e-05, + "loss": 1.2251, + "step": 358 + }, + { + "epoch": 0.27261509254864735, + "grad_norm": 0.470703125, + "learning_rate": 1.784356312910523e-05, + "loss": 1.2829, + "step": 359 + }, + { + "epoch": 0.273374466065496, + "grad_norm": 0.44140625, + "learning_rate": 1.7827510958203147e-05, + "loss": 1.2277, + "step": 360 + }, + { + "epoch": 0.27413383958234455, + "grad_norm": 0.486328125, + "learning_rate": 1.78114065409411e-05, + "loss": 1.2715, + "step": 361 + }, + { + "epoch": 0.2748932130991932, + "grad_norm": 0.47265625, + "learning_rate": 1.7795249984811397e-05, + "loss": 1.2467, + "step": 362 + }, + { + "epoch": 0.27565258661604175, + "grad_norm": 0.455078125, + "learning_rate": 1.7779041397654355e-05, + "loss": 1.2529, + "step": 363 + }, + { + "epoch": 0.2764119601328904, + "grad_norm": 0.5, + "learning_rate": 1.7762780887657576e-05, + "loss": 1.2749, + "step": 364 + }, + { + "epoch": 0.27717133364973895, + "grad_norm": 0.40234375, + "learning_rate": 1.7746468563355243e-05, + "loss": 1.1978, + "step": 365 + }, + { + "epoch": 0.2779307071665876, + "grad_norm": 0.451171875, + "learning_rate": 1.773010453362737e-05, + "loss": 1.244, + "step": 366 + }, + { + "epoch": 0.27869008068343615, + "grad_norm": 0.53515625, + "learning_rate": 1.7713688907699107e-05, + "loss": 1.3013, + "step": 367 + }, + { + "epoch": 0.2794494542002848, + "grad_norm": 0.482421875, + "learning_rate": 1.769722179513998e-05, + "loss": 1.2608, + "step": 368 + }, + { + "epoch": 0.28020882771713335, + "grad_norm": 0.412109375, + "learning_rate": 1.7680703305863177e-05, + "loss": 1.1853, + "step": 369 + }, + { + "epoch": 0.280968201233982, + "grad_norm": 0.44140625, + "learning_rate": 1.7664133550124815e-05, + "loss": 1.2565, + "step": 370 + }, + { + "epoch": 0.28172757475083055, + "grad_norm": 0.41796875, + "learning_rate": 1.7647512638523193e-05, + "loss": 1.1891, + "step": 371 + }, + { + "epoch": 0.28248694826767917, + "grad_norm": 0.4375, + "learning_rate": 1.7630840681998068e-05, + "loss": 1.231, + "step": 372 + }, + { + "epoch": 0.28324632178452774, + "grad_norm": 0.490234375, + "learning_rate": 1.7614117791829897e-05, + "loss": 1.2935, + "step": 373 + }, + { + "epoch": 0.28400569530137637, + "grad_norm": 0.52734375, + "learning_rate": 1.759734407963911e-05, + "loss": 1.2953, + "step": 374 + }, + { + "epoch": 0.28476506881822494, + "grad_norm": 0.482421875, + "learning_rate": 1.7580519657385368e-05, + "loss": 1.2782, + "step": 375 + }, + { + "epoch": 0.28552444233507357, + "grad_norm": 0.55859375, + "learning_rate": 1.7563644637366786e-05, + "loss": 1.333, + "step": 376 + }, + { + "epoch": 0.28628381585192214, + "grad_norm": 0.515625, + "learning_rate": 1.754671913221923e-05, + "loss": 1.2813, + "step": 377 + }, + { + "epoch": 0.28704318936877077, + "grad_norm": 0.498046875, + "learning_rate": 1.752974325491551e-05, + "loss": 1.2581, + "step": 378 + }, + { + "epoch": 0.28780256288561934, + "grad_norm": 0.400390625, + "learning_rate": 1.7512717118764687e-05, + "loss": 1.2302, + "step": 379 + }, + { + "epoch": 0.28856193640246797, + "grad_norm": 0.466796875, + "learning_rate": 1.7495640837411265e-05, + "loss": 1.2359, + "step": 380 + }, + { + "epoch": 0.28932130991931654, + "grad_norm": 0.51953125, + "learning_rate": 
1.747851452483445e-05, + "loss": 1.2548, + "step": 381 + }, + { + "epoch": 0.29008068343616517, + "grad_norm": 0.47265625, + "learning_rate": 1.7461338295347404e-05, + "loss": 1.2752, + "step": 382 + }, + { + "epoch": 0.29084005695301374, + "grad_norm": 0.392578125, + "learning_rate": 1.7444112263596474e-05, + "loss": 1.2092, + "step": 383 + }, + { + "epoch": 0.29159943046986236, + "grad_norm": 0.51171875, + "learning_rate": 1.74268365445604e-05, + "loss": 1.3045, + "step": 384 + }, + { + "epoch": 0.292358803986711, + "grad_norm": 0.46484375, + "learning_rate": 1.7409511253549592e-05, + "loss": 1.2586, + "step": 385 + }, + { + "epoch": 0.29311817750355956, + "grad_norm": 0.439453125, + "learning_rate": 1.7392136506205332e-05, + "loss": 1.1966, + "step": 386 + }, + { + "epoch": 0.2938775510204082, + "grad_norm": 0.408203125, + "learning_rate": 1.7374712418498997e-05, + "loss": 1.1853, + "step": 387 + }, + { + "epoch": 0.29463692453725676, + "grad_norm": 0.439453125, + "learning_rate": 1.735723910673132e-05, + "loss": 1.2408, + "step": 388 + }, + { + "epoch": 0.2953962980541054, + "grad_norm": 0.412109375, + "learning_rate": 1.7339716687531564e-05, + "loss": 1.163, + "step": 389 + }, + { + "epoch": 0.29615567157095396, + "grad_norm": 0.50390625, + "learning_rate": 1.7322145277856793e-05, + "loss": 1.2941, + "step": 390 + }, + { + "epoch": 0.2969150450878026, + "grad_norm": 0.419921875, + "learning_rate": 1.7304524994991056e-05, + "loss": 1.2504, + "step": 391 + }, + { + "epoch": 0.29767441860465116, + "grad_norm": 0.470703125, + "learning_rate": 1.7286855956544616e-05, + "loss": 1.2842, + "step": 392 + }, + { + "epoch": 0.2984337921214998, + "grad_norm": 0.41796875, + "learning_rate": 1.726913828045317e-05, + "loss": 1.2403, + "step": 393 + }, + { + "epoch": 0.29919316563834836, + "grad_norm": 0.498046875, + "learning_rate": 1.725137208497705e-05, + "loss": 1.254, + "step": 394 + }, + { + "epoch": 0.299952539155197, + "grad_norm": 0.4609375, + "learning_rate": 1.7233557488700453e-05, + "loss": 1.2395, + "step": 395 + }, + { + "epoch": 0.30071191267204556, + "grad_norm": 0.53125, + "learning_rate": 1.7215694610530624e-05, + "loss": 1.2705, + "step": 396 + }, + { + "epoch": 0.3014712861888942, + "grad_norm": 0.416015625, + "learning_rate": 1.7197783569697084e-05, + "loss": 1.2212, + "step": 397 + }, + { + "epoch": 0.30223065970574275, + "grad_norm": 0.51171875, + "learning_rate": 1.7179824485750824e-05, + "loss": 1.2975, + "step": 398 + }, + { + "epoch": 0.3029900332225914, + "grad_norm": 0.43359375, + "learning_rate": 1.7161817478563504e-05, + "loss": 1.2402, + "step": 399 + }, + { + "epoch": 0.30374940673943995, + "grad_norm": 0.423828125, + "learning_rate": 1.7143762668326667e-05, + "loss": 1.2287, + "step": 400 + }, + { + "epoch": 0.3045087802562886, + "grad_norm": 0.451171875, + "learning_rate": 1.712566017555092e-05, + "loss": 1.2097, + "step": 401 + }, + { + "epoch": 0.30526815377313715, + "grad_norm": 0.51953125, + "learning_rate": 1.7107510121065138e-05, + "loss": 1.3114, + "step": 402 + }, + { + "epoch": 0.3060275272899858, + "grad_norm": 0.423828125, + "learning_rate": 1.7089312626015663e-05, + "loss": 1.2468, + "step": 403 + }, + { + "epoch": 0.30678690080683435, + "grad_norm": 0.384765625, + "learning_rate": 1.7071067811865477e-05, + "loss": 1.1837, + "step": 404 + }, + { + "epoch": 0.307546274323683, + "grad_norm": 0.470703125, + "learning_rate": 1.7052775800393415e-05, + "loss": 1.238, + "step": 405 + }, + { + "epoch": 0.30830564784053155, + "grad_norm": 0.39453125, + 
"learning_rate": 1.703443671369333e-05, + "loss": 1.217, + "step": 406 + }, + { + "epoch": 0.3090650213573802, + "grad_norm": 0.384765625, + "learning_rate": 1.7016050674173304e-05, + "loss": 1.2202, + "step": 407 + }, + { + "epoch": 0.30982439487422875, + "grad_norm": 0.45703125, + "learning_rate": 1.69976178045548e-05, + "loss": 1.2238, + "step": 408 + }, + { + "epoch": 0.3105837683910774, + "grad_norm": 0.435546875, + "learning_rate": 1.6979138227871858e-05, + "loss": 1.2318, + "step": 409 + }, + { + "epoch": 0.31134314190792595, + "grad_norm": 0.453125, + "learning_rate": 1.696061206747029e-05, + "loss": 1.2208, + "step": 410 + }, + { + "epoch": 0.3121025154247746, + "grad_norm": 0.453125, + "learning_rate": 1.6942039447006823e-05, + "loss": 1.2223, + "step": 411 + }, + { + "epoch": 0.31286188894162315, + "grad_norm": 0.33984375, + "learning_rate": 1.6923420490448298e-05, + "loss": 1.1626, + "step": 412 + }, + { + "epoch": 0.3136212624584718, + "grad_norm": 0.5625, + "learning_rate": 1.6904755322070846e-05, + "loss": 1.2768, + "step": 413 + }, + { + "epoch": 0.31438063597532034, + "grad_norm": 0.462890625, + "learning_rate": 1.688604406645903e-05, + "loss": 1.2694, + "step": 414 + }, + { + "epoch": 0.31514000949216897, + "grad_norm": 0.39453125, + "learning_rate": 1.686728684850505e-05, + "loss": 1.1856, + "step": 415 + }, + { + "epoch": 0.31589938300901754, + "grad_norm": 0.380859375, + "learning_rate": 1.6848483793407874e-05, + "loss": 1.2184, + "step": 416 + }, + { + "epoch": 0.31665875652586617, + "grad_norm": 0.361328125, + "learning_rate": 1.6829635026672432e-05, + "loss": 1.1899, + "step": 417 + }, + { + "epoch": 0.31741813004271474, + "grad_norm": 0.44921875, + "learning_rate": 1.6810740674108763e-05, + "loss": 1.2078, + "step": 418 + }, + { + "epoch": 0.31817750355956337, + "grad_norm": 0.46484375, + "learning_rate": 1.6791800861831176e-05, + "loss": 1.2226, + "step": 419 + }, + { + "epoch": 0.31893687707641194, + "grad_norm": 0.404296875, + "learning_rate": 1.6772815716257414e-05, + "loss": 1.2044, + "step": 420 + }, + { + "epoch": 0.31969625059326057, + "grad_norm": 0.44921875, + "learning_rate": 1.6753785364107796e-05, + "loss": 1.2699, + "step": 421 + }, + { + "epoch": 0.32045562411010914, + "grad_norm": 0.37109375, + "learning_rate": 1.6734709932404404e-05, + "loss": 1.1732, + "step": 422 + }, + { + "epoch": 0.32121499762695777, + "grad_norm": 0.48046875, + "learning_rate": 1.6715589548470187e-05, + "loss": 1.2655, + "step": 423 + }, + { + "epoch": 0.32197437114380634, + "grad_norm": 0.40625, + "learning_rate": 1.6696424339928153e-05, + "loss": 1.2044, + "step": 424 + }, + { + "epoch": 0.32273374466065496, + "grad_norm": 0.427734375, + "learning_rate": 1.6677214434700495e-05, + "loss": 1.2083, + "step": 425 + }, + { + "epoch": 0.32349311817750354, + "grad_norm": 0.4453125, + "learning_rate": 1.665795996100775e-05, + "loss": 1.2273, + "step": 426 + }, + { + "epoch": 0.32425249169435216, + "grad_norm": 0.5, + "learning_rate": 1.663866104736793e-05, + "loss": 1.2407, + "step": 427 + }, + { + "epoch": 0.32501186521120073, + "grad_norm": 0.390625, + "learning_rate": 1.6619317822595666e-05, + "loss": 1.2166, + "step": 428 + }, + { + "epoch": 0.32577123872804936, + "grad_norm": 0.42578125, + "learning_rate": 1.6599930415801374e-05, + "loss": 1.238, + "step": 429 + }, + { + "epoch": 0.32653061224489793, + "grad_norm": 0.390625, + "learning_rate": 1.658049895639034e-05, + "loss": 1.1813, + "step": 430 + }, + { + "epoch": 0.32728998576174656, + "grad_norm": 0.423828125, + 
"learning_rate": 1.6561023574061925e-05, + "loss": 1.2264, + "step": 431 + }, + { + "epoch": 0.32804935927859513, + "grad_norm": 0.40625, + "learning_rate": 1.6541504398808633e-05, + "loss": 1.2364, + "step": 432 + }, + { + "epoch": 0.32880873279544376, + "grad_norm": 0.44921875, + "learning_rate": 1.6521941560915284e-05, + "loss": 1.2339, + "step": 433 + }, + { + "epoch": 0.3295681063122924, + "grad_norm": 0.4765625, + "learning_rate": 1.6502335190958135e-05, + "loss": 1.2952, + "step": 434 + }, + { + "epoch": 0.33032747982914096, + "grad_norm": 0.380859375, + "learning_rate": 1.648268541980401e-05, + "loss": 1.195, + "step": 435 + }, + { + "epoch": 0.3310868533459896, + "grad_norm": 0.490234375, + "learning_rate": 1.646299237860941e-05, + "loss": 1.2866, + "step": 436 + }, + { + "epoch": 0.33184622686283816, + "grad_norm": 0.392578125, + "learning_rate": 1.6443256198819665e-05, + "loss": 1.2219, + "step": 437 + }, + { + "epoch": 0.3326056003796868, + "grad_norm": 0.427734375, + "learning_rate": 1.6423477012168038e-05, + "loss": 1.2458, + "step": 438 + }, + { + "epoch": 0.33336497389653535, + "grad_norm": 0.384765625, + "learning_rate": 1.640365495067485e-05, + "loss": 1.21, + "step": 439 + }, + { + "epoch": 0.334124347413384, + "grad_norm": 0.416015625, + "learning_rate": 1.638379014664659e-05, + "loss": 1.2286, + "step": 440 + }, + { + "epoch": 0.33488372093023255, + "grad_norm": 0.40625, + "learning_rate": 1.636388273267506e-05, + "loss": 1.1945, + "step": 441 + }, + { + "epoch": 0.3356430944470812, + "grad_norm": 0.5078125, + "learning_rate": 1.6343932841636455e-05, + "loss": 1.3204, + "step": 442 + }, + { + "epoch": 0.33640246796392975, + "grad_norm": 0.423828125, + "learning_rate": 1.63239406066905e-05, + "loss": 1.2361, + "step": 443 + }, + { + "epoch": 0.3371618414807784, + "grad_norm": 0.40234375, + "learning_rate": 1.6303906161279554e-05, + "loss": 1.1951, + "step": 444 + }, + { + "epoch": 0.33792121499762695, + "grad_norm": 0.43359375, + "learning_rate": 1.6283829639127705e-05, + "loss": 1.2686, + "step": 445 + }, + { + "epoch": 0.3386805885144756, + "grad_norm": 0.482421875, + "learning_rate": 1.6263711174239914e-05, + "loss": 1.264, + "step": 446 + }, + { + "epoch": 0.33943996203132415, + "grad_norm": 0.4375, + "learning_rate": 1.6243550900901076e-05, + "loss": 1.2668, + "step": 447 + }, + { + "epoch": 0.3401993355481728, + "grad_norm": 0.408203125, + "learning_rate": 1.6223348953675163e-05, + "loss": 1.1683, + "step": 448 + }, + { + "epoch": 0.34095870906502135, + "grad_norm": 0.408203125, + "learning_rate": 1.6203105467404284e-05, + "loss": 1.2147, + "step": 449 + }, + { + "epoch": 0.34171808258187, + "grad_norm": 0.400390625, + "learning_rate": 1.6182820577207842e-05, + "loss": 1.2178, + "step": 450 + }, + { + "epoch": 0.34247745609871855, + "grad_norm": 0.408203125, + "learning_rate": 1.6162494418481574e-05, + "loss": 1.2321, + "step": 451 + }, + { + "epoch": 0.3432368296155672, + "grad_norm": 0.447265625, + "learning_rate": 1.6142127126896682e-05, + "loss": 1.2495, + "step": 452 + }, + { + "epoch": 0.34399620313241575, + "grad_norm": 0.38671875, + "learning_rate": 1.612171883839891e-05, + "loss": 1.1807, + "step": 453 + }, + { + "epoch": 0.3447555766492644, + "grad_norm": 0.38671875, + "learning_rate": 1.6101269689207656e-05, + "loss": 1.1941, + "step": 454 + }, + { + "epoch": 0.34551495016611294, + "grad_norm": 0.369140625, + "learning_rate": 1.6080779815815043e-05, + "loss": 1.2159, + "step": 455 + }, + { + "epoch": 0.34627432368296157, + "grad_norm": 0.412109375, + 
"learning_rate": 1.6060249354985023e-05, + "loss": 1.222, + "step": 456 + }, + { + "epoch": 0.34703369719981014, + "grad_norm": 0.44140625, + "learning_rate": 1.603967844375245e-05, + "loss": 1.2526, + "step": 457 + }, + { + "epoch": 0.34779307071665877, + "grad_norm": 0.3671875, + "learning_rate": 1.6019067219422178e-05, + "loss": 1.1691, + "step": 458 + }, + { + "epoch": 0.34855244423350734, + "grad_norm": 0.390625, + "learning_rate": 1.5998415819568135e-05, + "loss": 1.1933, + "step": 459 + }, + { + "epoch": 0.34931181775035597, + "grad_norm": 0.50390625, + "learning_rate": 1.597772438203241e-05, + "loss": 1.2525, + "step": 460 + }, + { + "epoch": 0.35007119126720454, + "grad_norm": 0.38671875, + "learning_rate": 1.5956993044924334e-05, + "loss": 1.2022, + "step": 461 + }, + { + "epoch": 0.35083056478405317, + "grad_norm": 0.470703125, + "learning_rate": 1.593622194661956e-05, + "loss": 1.2853, + "step": 462 + }, + { + "epoch": 0.35158993830090174, + "grad_norm": 0.466796875, + "learning_rate": 1.5915411225759122e-05, + "loss": 1.3113, + "step": 463 + }, + { + "epoch": 0.35234931181775037, + "grad_norm": 0.462890625, + "learning_rate": 1.5894561021248535e-05, + "loss": 1.246, + "step": 464 + }, + { + "epoch": 0.35310868533459894, + "grad_norm": 0.376953125, + "learning_rate": 1.5873671472256854e-05, + "loss": 1.1929, + "step": 465 + }, + { + "epoch": 0.35386805885144756, + "grad_norm": 0.416015625, + "learning_rate": 1.5852742718215743e-05, + "loss": 1.2469, + "step": 466 + }, + { + "epoch": 0.35462743236829614, + "grad_norm": 0.35546875, + "learning_rate": 1.5831774898818558e-05, + "loss": 1.1592, + "step": 467 + }, + { + "epoch": 0.35538680588514476, + "grad_norm": 0.40625, + "learning_rate": 1.5810768154019386e-05, + "loss": 1.2145, + "step": 468 + }, + { + "epoch": 0.35614617940199333, + "grad_norm": 0.400390625, + "learning_rate": 1.5789722624032143e-05, + "loss": 1.1859, + "step": 469 + }, + { + "epoch": 0.35690555291884196, + "grad_norm": 0.423828125, + "learning_rate": 1.576863844932963e-05, + "loss": 1.2184, + "step": 470 + }, + { + "epoch": 0.35766492643569053, + "grad_norm": 0.435546875, + "learning_rate": 1.5747515770642582e-05, + "loss": 1.2126, + "step": 471 + }, + { + "epoch": 0.35842429995253916, + "grad_norm": 0.443359375, + "learning_rate": 1.5726354728958736e-05, + "loss": 1.2569, + "step": 472 + }, + { + "epoch": 0.35918367346938773, + "grad_norm": 0.39453125, + "learning_rate": 1.570515546552189e-05, + "loss": 1.2173, + "step": 473 + }, + { + "epoch": 0.35994304698623636, + "grad_norm": 0.400390625, + "learning_rate": 1.568391812183097e-05, + "loss": 1.1995, + "step": 474 + }, + { + "epoch": 0.36070242050308493, + "grad_norm": 0.40234375, + "learning_rate": 1.566264283963907e-05, + "loss": 1.238, + "step": 475 + }, + { + "epoch": 0.36146179401993356, + "grad_norm": 0.400390625, + "learning_rate": 1.5641329760952514e-05, + "loss": 1.2179, + "step": 476 + }, + { + "epoch": 0.36222116753678213, + "grad_norm": 0.41015625, + "learning_rate": 1.5619979028029898e-05, + "loss": 1.2148, + "step": 477 + }, + { + "epoch": 0.36298054105363076, + "grad_norm": 0.3828125, + "learning_rate": 1.5598590783381165e-05, + "loss": 1.201, + "step": 478 + }, + { + "epoch": 0.36373991457047933, + "grad_norm": 0.40234375, + "learning_rate": 1.5577165169766627e-05, + "loss": 1.2383, + "step": 479 + }, + { + "epoch": 0.36449928808732796, + "grad_norm": 0.396484375, + "learning_rate": 1.5555702330196024e-05, + "loss": 1.2399, + "step": 480 + }, + { + "epoch": 0.3652586616041765, + "grad_norm": 
0.41015625, + "learning_rate": 1.5534202407927574e-05, + "loss": 1.2565, + "step": 481 + }, + { + "epoch": 0.36601803512102515, + "grad_norm": 0.41796875, + "learning_rate": 1.5512665546467008e-05, + "loss": 1.2256, + "step": 482 + }, + { + "epoch": 0.3667774086378738, + "grad_norm": 0.38671875, + "learning_rate": 1.549109188956661e-05, + "loss": 1.1796, + "step": 483 + }, + { + "epoch": 0.36753678215472235, + "grad_norm": 0.404296875, + "learning_rate": 1.5469481581224274e-05, + "loss": 1.2004, + "step": 484 + }, + { + "epoch": 0.368296155671571, + "grad_norm": 0.369140625, + "learning_rate": 1.5447834765682515e-05, + "loss": 1.1787, + "step": 485 + }, + { + "epoch": 0.36905552918841955, + "grad_norm": 0.337890625, + "learning_rate": 1.5426151587427548e-05, + "loss": 1.1656, + "step": 486 + }, + { + "epoch": 0.3698149027052682, + "grad_norm": 0.376953125, + "learning_rate": 1.540443219118827e-05, + "loss": 1.1887, + "step": 487 + }, + { + "epoch": 0.37057427622211675, + "grad_norm": 0.427734375, + "learning_rate": 1.5382676721935344e-05, + "loss": 1.2309, + "step": 488 + }, + { + "epoch": 0.3713336497389654, + "grad_norm": 0.365234375, + "learning_rate": 1.5360885324880205e-05, + "loss": 1.1869, + "step": 489 + }, + { + "epoch": 0.37209302325581395, + "grad_norm": 0.4296875, + "learning_rate": 1.5339058145474086e-05, + "loss": 1.2477, + "step": 490 + }, + { + "epoch": 0.3728523967726626, + "grad_norm": 0.412109375, + "learning_rate": 1.5317195329407067e-05, + "loss": 1.2257, + "step": 491 + }, + { + "epoch": 0.37361177028951115, + "grad_norm": 0.4140625, + "learning_rate": 1.529529702260709e-05, + "loss": 1.2565, + "step": 492 + }, + { + "epoch": 0.3743711438063598, + "grad_norm": 0.41015625, + "learning_rate": 1.5273363371238983e-05, + "loss": 1.1869, + "step": 493 + }, + { + "epoch": 0.37513051732320835, + "grad_norm": 0.3828125, + "learning_rate": 1.5251394521703496e-05, + "loss": 1.2229, + "step": 494 + }, + { + "epoch": 0.375889890840057, + "grad_norm": 0.427734375, + "learning_rate": 1.5229390620636309e-05, + "loss": 1.2105, + "step": 495 + }, + { + "epoch": 0.37664926435690554, + "grad_norm": 0.37890625, + "learning_rate": 1.5207351814907068e-05, + "loss": 1.2271, + "step": 496 + }, + { + "epoch": 0.37740863787375417, + "grad_norm": 0.361328125, + "learning_rate": 1.5185278251618391e-05, + "loss": 1.1995, + "step": 497 + }, + { + "epoch": 0.37816801139060274, + "grad_norm": 0.318359375, + "learning_rate": 1.51631700781049e-05, + "loss": 1.1512, + "step": 498 + }, + { + "epoch": 0.37892738490745137, + "grad_norm": 0.384765625, + "learning_rate": 1.5141027441932217e-05, + "loss": 1.2129, + "step": 499 + }, + { + "epoch": 0.37968675842429994, + "grad_norm": 0.44921875, + "learning_rate": 1.5118850490896012e-05, + "loss": 1.2336, + "step": 500 + }, + { + "epoch": 0.38044613194114857, + "grad_norm": 0.388671875, + "learning_rate": 1.5096639373020976e-05, + "loss": 1.1947, + "step": 501 + }, + { + "epoch": 0.38120550545799714, + "grad_norm": 0.373046875, + "learning_rate": 1.5074394236559871e-05, + "loss": 1.2024, + "step": 502 + }, + { + "epoch": 0.38196487897484577, + "grad_norm": 0.3828125, + "learning_rate": 1.5052115229992512e-05, + "loss": 1.2024, + "step": 503 + }, + { + "epoch": 0.38272425249169434, + "grad_norm": 0.41796875, + "learning_rate": 1.5029802502024788e-05, + "loss": 1.2601, + "step": 504 + }, + { + "epoch": 0.38348362600854297, + "grad_norm": 0.373046875, + "learning_rate": 1.5007456201587676e-05, + "loss": 1.2082, + "step": 505 + }, + { + "epoch": 
0.38424299952539154, + "grad_norm": 0.357421875, + "learning_rate": 1.4985076477836232e-05, + "loss": 1.1751, + "step": 506 + }, + { + "epoch": 0.38500237304224016, + "grad_norm": 0.34375, + "learning_rate": 1.4962663480148606e-05, + "loss": 1.1682, + "step": 507 + }, + { + "epoch": 0.38576174655908874, + "grad_norm": 0.400390625, + "learning_rate": 1.4940217358125042e-05, + "loss": 1.222, + "step": 508 + }, + { + "epoch": 0.38652112007593736, + "grad_norm": 0.376953125, + "learning_rate": 1.4917738261586878e-05, + "loss": 1.1834, + "step": 509 + }, + { + "epoch": 0.38728049359278593, + "grad_norm": 0.38671875, + "learning_rate": 1.489522634057555e-05, + "loss": 1.1874, + "step": 510 + }, + { + "epoch": 0.38803986710963456, + "grad_norm": 0.41015625, + "learning_rate": 1.4872681745351582e-05, + "loss": 1.2168, + "step": 511 + }, + { + "epoch": 0.38879924062648313, + "grad_norm": 0.44921875, + "learning_rate": 1.4850104626393598e-05, + "loss": 1.2838, + "step": 512 + }, + { + "epoch": 0.38955861414333176, + "grad_norm": 0.39453125, + "learning_rate": 1.4827495134397298e-05, + "loss": 1.1814, + "step": 513 + }, + { + "epoch": 0.39031798766018033, + "grad_norm": 0.421875, + "learning_rate": 1.4804853420274471e-05, + "loss": 1.2424, + "step": 514 + }, + { + "epoch": 0.39107736117702896, + "grad_norm": 0.48046875, + "learning_rate": 1.4782179635151978e-05, + "loss": 1.2785, + "step": 515 + }, + { + "epoch": 0.39183673469387753, + "grad_norm": 0.41015625, + "learning_rate": 1.4759473930370738e-05, + "loss": 1.2162, + "step": 516 + }, + { + "epoch": 0.39259610821072616, + "grad_norm": 0.3828125, + "learning_rate": 1.473673645748473e-05, + "loss": 1.2142, + "step": 517 + }, + { + "epoch": 0.39335548172757473, + "grad_norm": 0.3984375, + "learning_rate": 1.4713967368259981e-05, + "loss": 1.2056, + "step": 518 + }, + { + "epoch": 0.39411485524442336, + "grad_norm": 0.427734375, + "learning_rate": 1.469116681467353e-05, + "loss": 1.2555, + "step": 519 + }, + { + "epoch": 0.39487422876127193, + "grad_norm": 0.353515625, + "learning_rate": 1.4668334948912455e-05, + "loss": 1.1837, + "step": 520 + }, + { + "epoch": 0.39563360227812056, + "grad_norm": 0.390625, + "learning_rate": 1.4645471923372818e-05, + "loss": 1.192, + "step": 521 + }, + { + "epoch": 0.3963929757949691, + "grad_norm": 0.3828125, + "learning_rate": 1.4622577890658668e-05, + "loss": 1.2303, + "step": 522 + }, + { + "epoch": 0.39715234931181775, + "grad_norm": 0.439453125, + "learning_rate": 1.4599653003581016e-05, + "loss": 1.2871, + "step": 523 + }, + { + "epoch": 0.3979117228286663, + "grad_norm": 0.404296875, + "learning_rate": 1.4576697415156818e-05, + "loss": 1.2274, + "step": 524 + }, + { + "epoch": 0.39867109634551495, + "grad_norm": 0.408203125, + "learning_rate": 1.4553711278607953e-05, + "loss": 1.2148, + "step": 525 + }, + { + "epoch": 0.3994304698623635, + "grad_norm": 0.298828125, + "learning_rate": 1.4530694747360203e-05, + "loss": 1.123, + "step": 526 + }, + { + "epoch": 0.40018984337921215, + "grad_norm": 0.380859375, + "learning_rate": 1.4507647975042221e-05, + "loss": 1.1685, + "step": 527 + }, + { + "epoch": 0.4009492168960607, + "grad_norm": 0.388671875, + "learning_rate": 1.4484571115484508e-05, + "loss": 1.2304, + "step": 528 + }, + { + "epoch": 0.40170859041290935, + "grad_norm": 0.375, + "learning_rate": 1.44614643227184e-05, + "loss": 1.1826, + "step": 529 + }, + { + "epoch": 0.402467963929758, + "grad_norm": 0.369140625, + "learning_rate": 1.4438327750975009e-05, + "loss": 1.2434, + "step": 530 + }, + { + 
"epoch": 0.40322733744660655, + "grad_norm": 0.3671875, + "learning_rate": 1.4415161554684239e-05, + "loss": 1.177, + "step": 531 + }, + { + "epoch": 0.4039867109634552, + "grad_norm": 0.357421875, + "learning_rate": 1.4391965888473705e-05, + "loss": 1.1952, + "step": 532 + }, + { + "epoch": 0.40474608448030375, + "grad_norm": 0.4609375, + "learning_rate": 1.436874090716774e-05, + "loss": 1.2767, + "step": 533 + }, + { + "epoch": 0.4055054579971524, + "grad_norm": 0.408203125, + "learning_rate": 1.434548676578634e-05, + "loss": 1.2334, + "step": 534 + }, + { + "epoch": 0.40626483151400095, + "grad_norm": 0.376953125, + "learning_rate": 1.432220361954414e-05, + "loss": 1.1755, + "step": 535 + }, + { + "epoch": 0.4070242050308496, + "grad_norm": 0.32421875, + "learning_rate": 1.429889162384937e-05, + "loss": 1.1615, + "step": 536 + }, + { + "epoch": 0.40778357854769814, + "grad_norm": 0.408203125, + "learning_rate": 1.4275550934302822e-05, + "loss": 1.2221, + "step": 537 + }, + { + "epoch": 0.40854295206454677, + "grad_norm": 0.357421875, + "learning_rate": 1.4252181706696817e-05, + "loss": 1.2065, + "step": 538 + }, + { + "epoch": 0.40930232558139534, + "grad_norm": 0.388671875, + "learning_rate": 1.4228784097014156e-05, + "loss": 1.2361, + "step": 539 + }, + { + "epoch": 0.41006169909824397, + "grad_norm": 0.349609375, + "learning_rate": 1.4205358261427076e-05, + "loss": 1.1413, + "step": 540 + }, + { + "epoch": 0.41082107261509254, + "grad_norm": 0.34765625, + "learning_rate": 1.4181904356296225e-05, + "loss": 1.1597, + "step": 541 + }, + { + "epoch": 0.41158044613194117, + "grad_norm": 0.33984375, + "learning_rate": 1.4158422538169596e-05, + "loss": 1.1972, + "step": 542 + }, + { + "epoch": 0.41233981964878974, + "grad_norm": 0.365234375, + "learning_rate": 1.4134912963781501e-05, + "loss": 1.1908, + "step": 543 + }, + { + "epoch": 0.41309919316563837, + "grad_norm": 0.35546875, + "learning_rate": 1.4111375790051511e-05, + "loss": 1.2195, + "step": 544 + }, + { + "epoch": 0.41385856668248694, + "grad_norm": 0.439453125, + "learning_rate": 1.4087811174083422e-05, + "loss": 1.2675, + "step": 545 + }, + { + "epoch": 0.41461794019933557, + "grad_norm": 0.38671875, + "learning_rate": 1.4064219273164192e-05, + "loss": 1.2397, + "step": 546 + }, + { + "epoch": 0.41537731371618414, + "grad_norm": 0.37109375, + "learning_rate": 1.40406002447629e-05, + "loss": 1.1723, + "step": 547 + }, + { + "epoch": 0.41613668723303276, + "grad_norm": 0.361328125, + "learning_rate": 1.4016954246529697e-05, + "loss": 1.1875, + "step": 548 + }, + { + "epoch": 0.41689606074988134, + "grad_norm": 0.3984375, + "learning_rate": 1.3993281436294743e-05, + "loss": 1.1678, + "step": 549 + }, + { + "epoch": 0.41765543426672996, + "grad_norm": 0.44140625, + "learning_rate": 1.3969581972067166e-05, + "loss": 1.2402, + "step": 550 + }, + { + "epoch": 0.41841480778357854, + "grad_norm": 0.40625, + "learning_rate": 1.3945856012034003e-05, + "loss": 1.2136, + "step": 551 + }, + { + "epoch": 0.41917418130042716, + "grad_norm": 0.40625, + "learning_rate": 1.392210371455913e-05, + "loss": 1.1965, + "step": 552 + }, + { + "epoch": 0.41993355481727573, + "grad_norm": 0.4140625, + "learning_rate": 1.3898325238182235e-05, + "loss": 1.1927, + "step": 553 + }, + { + "epoch": 0.42069292833412436, + "grad_norm": 0.3671875, + "learning_rate": 1.3874520741617734e-05, + "loss": 1.2102, + "step": 554 + }, + { + "epoch": 0.42145230185097293, + "grad_norm": 0.41796875, + "learning_rate": 1.3850690383753718e-05, + "loss": 1.2486, + "step": 555 + 
}, + { + "epoch": 0.42221167536782156, + "grad_norm": 0.359375, + "learning_rate": 1.3826834323650899e-05, + "loss": 1.1525, + "step": 556 + }, + { + "epoch": 0.42297104888467013, + "grad_norm": 0.4140625, + "learning_rate": 1.3802952720541543e-05, + "loss": 1.2107, + "step": 557 + }, + { + "epoch": 0.42373042240151876, + "grad_norm": 0.41796875, + "learning_rate": 1.377904573382841e-05, + "loss": 1.22, + "step": 558 + }, + { + "epoch": 0.42448979591836733, + "grad_norm": 0.34765625, + "learning_rate": 1.3755113523083679e-05, + "loss": 1.1559, + "step": 559 + }, + { + "epoch": 0.42524916943521596, + "grad_norm": 0.361328125, + "learning_rate": 1.3731156248047903e-05, + "loss": 1.2233, + "step": 560 + }, + { + "epoch": 0.42600854295206453, + "grad_norm": 0.314453125, + "learning_rate": 1.3707174068628927e-05, + "loss": 1.1299, + "step": 561 + }, + { + "epoch": 0.42676791646891316, + "grad_norm": 0.361328125, + "learning_rate": 1.3683167144900833e-05, + "loss": 1.182, + "step": 562 + }, + { + "epoch": 0.4275272899857617, + "grad_norm": 0.400390625, + "learning_rate": 1.3659135637102845e-05, + "loss": 1.2002, + "step": 563 + }, + { + "epoch": 0.42828666350261035, + "grad_norm": 0.375, + "learning_rate": 1.3635079705638298e-05, + "loss": 1.2027, + "step": 564 + }, + { + "epoch": 0.4290460370194589, + "grad_norm": 0.359375, + "learning_rate": 1.3610999511073544e-05, + "loss": 1.1353, + "step": 565 + }, + { + "epoch": 0.42980541053630755, + "grad_norm": 0.349609375, + "learning_rate": 1.3586895214136875e-05, + "loss": 1.1544, + "step": 566 + }, + { + "epoch": 0.4305647840531561, + "grad_norm": 0.318359375, + "learning_rate": 1.3562766975717468e-05, + "loss": 1.1621, + "step": 567 + }, + { + "epoch": 0.43132415757000475, + "grad_norm": 0.30859375, + "learning_rate": 1.3538614956864297e-05, + "loss": 1.1351, + "step": 568 + }, + { + "epoch": 0.4320835310868533, + "grad_norm": 0.3828125, + "learning_rate": 1.3514439318785067e-05, + "loss": 1.2011, + "step": 569 + }, + { + "epoch": 0.43284290460370195, + "grad_norm": 0.34375, + "learning_rate": 1.3490240222845139e-05, + "loss": 1.1835, + "step": 570 + }, + { + "epoch": 0.4336022781205505, + "grad_norm": 0.392578125, + "learning_rate": 1.3466017830566433e-05, + "loss": 1.1919, + "step": 571 + }, + { + "epoch": 0.43436165163739915, + "grad_norm": 0.33203125, + "learning_rate": 1.3441772303626387e-05, + "loss": 1.1314, + "step": 572 + }, + { + "epoch": 0.4351210251542477, + "grad_norm": 0.34375, + "learning_rate": 1.3417503803856835e-05, + "loss": 1.1481, + "step": 573 + }, + { + "epoch": 0.43588039867109635, + "grad_norm": 0.36328125, + "learning_rate": 1.3393212493242964e-05, + "loss": 1.2217, + "step": 574 + }, + { + "epoch": 0.4366397721879449, + "grad_norm": 0.39453125, + "learning_rate": 1.3368898533922202e-05, + "loss": 1.1553, + "step": 575 + }, + { + "epoch": 0.43739914570479355, + "grad_norm": 0.41015625, + "learning_rate": 1.3344562088183166e-05, + "loss": 1.2189, + "step": 576 + }, + { + "epoch": 0.4381585192216421, + "grad_norm": 0.330078125, + "learning_rate": 1.3320203318464552e-05, + "loss": 1.1301, + "step": 577 + }, + { + "epoch": 0.43891789273849074, + "grad_norm": 0.40234375, + "learning_rate": 1.3295822387354071e-05, + "loss": 1.2088, + "step": 578 + }, + { + "epoch": 0.43967726625533937, + "grad_norm": 0.337890625, + "learning_rate": 1.3271419457587344e-05, + "loss": 1.1475, + "step": 579 + }, + { + "epoch": 0.44043663977218794, + "grad_norm": 0.33203125, + "learning_rate": 1.3246994692046837e-05, + "loss": 1.16, + "step": 580 + 
}, + { + "epoch": 0.44119601328903657, + "grad_norm": 0.384765625, + "learning_rate": 1.3222548253760756e-05, + "loss": 1.1764, + "step": 581 + }, + { + "epoch": 0.44195538680588514, + "grad_norm": 0.41015625, + "learning_rate": 1.319808030590197e-05, + "loss": 1.206, + "step": 582 + }, + { + "epoch": 0.44271476032273377, + "grad_norm": 0.32421875, + "learning_rate": 1.3173591011786917e-05, + "loss": 1.1696, + "step": 583 + }, + { + "epoch": 0.44347413383958234, + "grad_norm": 0.359375, + "learning_rate": 1.3149080534874519e-05, + "loss": 1.1935, + "step": 584 + }, + { + "epoch": 0.44423350735643097, + "grad_norm": 0.384765625, + "learning_rate": 1.3124549038765078e-05, + "loss": 1.1915, + "step": 585 + }, + { + "epoch": 0.44499288087327954, + "grad_norm": 0.326171875, + "learning_rate": 1.3099996687199203e-05, + "loss": 1.159, + "step": 586 + }, + { + "epoch": 0.44575225439012817, + "grad_norm": 0.39453125, + "learning_rate": 1.3075423644056699e-05, + "loss": 1.2283, + "step": 587 + }, + { + "epoch": 0.44651162790697674, + "grad_norm": 0.361328125, + "learning_rate": 1.305083007335549e-05, + "loss": 1.1949, + "step": 588 + }, + { + "epoch": 0.44727100142382537, + "grad_norm": 0.3359375, + "learning_rate": 1.3026216139250505e-05, + "loss": 1.1641, + "step": 589 + }, + { + "epoch": 0.44803037494067394, + "grad_norm": 0.375, + "learning_rate": 1.3001582006032601e-05, + "loss": 1.2071, + "step": 590 + }, + { + "epoch": 0.44878974845752256, + "grad_norm": 0.3671875, + "learning_rate": 1.2976927838127453e-05, + "loss": 1.16, + "step": 591 + }, + { + "epoch": 0.44954912197437114, + "grad_norm": 0.38671875, + "learning_rate": 1.2952253800094467e-05, + "loss": 1.2239, + "step": 592 + }, + { + "epoch": 0.45030849549121976, + "grad_norm": 0.361328125, + "learning_rate": 1.2927560056625672e-05, + "loss": 1.1955, + "step": 593 + }, + { + "epoch": 0.45106786900806833, + "grad_norm": 0.34375, + "learning_rate": 1.2902846772544625e-05, + "loss": 1.1833, + "step": 594 + }, + { + "epoch": 0.45182724252491696, + "grad_norm": 0.36328125, + "learning_rate": 1.2878114112805315e-05, + "loss": 1.212, + "step": 595 + }, + { + "epoch": 0.45258661604176553, + "grad_norm": 0.38671875, + "learning_rate": 1.2853362242491054e-05, + "loss": 1.1979, + "step": 596 + }, + { + "epoch": 0.45334598955861416, + "grad_norm": 0.3203125, + "learning_rate": 1.2828591326813382e-05, + "loss": 1.1222, + "step": 597 + }, + { + "epoch": 0.45410536307546273, + "grad_norm": 0.375, + "learning_rate": 1.2803801531110956e-05, + "loss": 1.1922, + "step": 598 + }, + { + "epoch": 0.45486473659231136, + "grad_norm": 0.361328125, + "learning_rate": 1.2778993020848457e-05, + "loss": 1.1596, + "step": 599 + }, + { + "epoch": 0.45562411010915993, + "grad_norm": 0.392578125, + "learning_rate": 1.2754165961615482e-05, + "loss": 1.2171, + "step": 600 + }, + { + "epoch": 0.45638348362600856, + "grad_norm": 0.376953125, + "learning_rate": 1.2729320519125426e-05, + "loss": 1.1937, + "step": 601 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.34375, + "learning_rate": 1.2704456859214397e-05, + "loss": 1.1604, + "step": 602 + }, + { + "epoch": 0.45790223065970576, + "grad_norm": 0.380859375, + "learning_rate": 1.2679575147840102e-05, + "loss": 1.1724, + "step": 603 + }, + { + "epoch": 0.4586616041765543, + "grad_norm": 0.353515625, + "learning_rate": 1.2654675551080724e-05, + "loss": 1.1699, + "step": 604 + }, + { + "epoch": 0.45942097769340295, + "grad_norm": 0.3203125, + "learning_rate": 1.2629758235133838e-05, + "loss": 1.1697, + "step": 605 + 
}, + { + "epoch": 0.4601803512102515, + "grad_norm": 0.376953125, + "learning_rate": 1.2604823366315273e-05, + "loss": 1.1973, + "step": 606 + }, + { + "epoch": 0.46093972472710015, + "grad_norm": 0.451171875, + "learning_rate": 1.2579871111058042e-05, + "loss": 1.2494, + "step": 607 + }, + { + "epoch": 0.4616990982439487, + "grad_norm": 0.3359375, + "learning_rate": 1.2554901635911188e-05, + "loss": 1.1515, + "step": 608 + }, + { + "epoch": 0.46245847176079735, + "grad_norm": 0.353515625, + "learning_rate": 1.2529915107538698e-05, + "loss": 1.1638, + "step": 609 + }, + { + "epoch": 0.4632178452776459, + "grad_norm": 0.40234375, + "learning_rate": 1.2504911692718387e-05, + "loss": 1.2225, + "step": 610 + }, + { + "epoch": 0.46397721879449455, + "grad_norm": 0.365234375, + "learning_rate": 1.2479891558340777e-05, + "loss": 1.1996, + "step": 611 + }, + { + "epoch": 0.4647365923113431, + "grad_norm": 0.466796875, + "learning_rate": 1.2454854871407993e-05, + "loss": 1.2728, + "step": 612 + }, + { + "epoch": 0.46549596582819175, + "grad_norm": 0.31640625, + "learning_rate": 1.242980179903264e-05, + "loss": 1.1579, + "step": 613 + }, + { + "epoch": 0.4662553393450403, + "grad_norm": 0.380859375, + "learning_rate": 1.2404732508436693e-05, + "loss": 1.2026, + "step": 614 + }, + { + "epoch": 0.46701471286188895, + "grad_norm": 0.4140625, + "learning_rate": 1.2379647166950381e-05, + "loss": 1.1719, + "step": 615 + }, + { + "epoch": 0.4677740863787375, + "grad_norm": 0.353515625, + "learning_rate": 1.2354545942011058e-05, + "loss": 1.1853, + "step": 616 + }, + { + "epoch": 0.46853345989558615, + "grad_norm": 0.318359375, + "learning_rate": 1.2329429001162114e-05, + "loss": 1.1524, + "step": 617 + }, + { + "epoch": 0.4692928334124347, + "grad_norm": 0.419921875, + "learning_rate": 1.2304296512051814e-05, + "loss": 1.2056, + "step": 618 + }, + { + "epoch": 0.47005220692928334, + "grad_norm": 0.33984375, + "learning_rate": 1.2279148642432229e-05, + "loss": 1.187, + "step": 619 + }, + { + "epoch": 0.4708115804461319, + "grad_norm": 0.330078125, + "learning_rate": 1.2253985560158064e-05, + "loss": 1.1578, + "step": 620 + }, + { + "epoch": 0.47157095396298054, + "grad_norm": 0.3203125, + "learning_rate": 1.2228807433185588e-05, + "loss": 1.1355, + "step": 621 + }, + { + "epoch": 0.4723303274798291, + "grad_norm": 0.326171875, + "learning_rate": 1.2203614429571475e-05, + "loss": 1.1617, + "step": 622 + }, + { + "epoch": 0.47308970099667774, + "grad_norm": 0.416015625, + "learning_rate": 1.2178406717471702e-05, + "loss": 1.1254, + "step": 623 + }, + { + "epoch": 0.4738490745135263, + "grad_norm": 0.3828125, + "learning_rate": 1.2153184465140413e-05, + "loss": 1.1904, + "step": 624 + }, + { + "epoch": 0.47460844803037494, + "grad_norm": 0.337890625, + "learning_rate": 1.2127947840928816e-05, + "loss": 1.158, + "step": 625 + }, + { + "epoch": 0.47536782154722357, + "grad_norm": 0.36328125, + "learning_rate": 1.2102697013284035e-05, + "loss": 1.1188, + "step": 626 + }, + { + "epoch": 0.47612719506407214, + "grad_norm": 0.333984375, + "learning_rate": 1.207743215074801e-05, + "loss": 1.1458, + "step": 627 + }, + { + "epoch": 0.47688656858092077, + "grad_norm": 0.33203125, + "learning_rate": 1.2052153421956343e-05, + "loss": 1.1472, + "step": 628 + }, + { + "epoch": 0.47764594209776934, + "grad_norm": 0.3828125, + "learning_rate": 1.2026860995637211e-05, + "loss": 1.2092, + "step": 629 + }, + { + "epoch": 0.47840531561461797, + "grad_norm": 0.3828125, + "learning_rate": 1.2001555040610197e-05, + "loss": 1.1966, + 
"step": 630 + }, + { + "epoch": 0.47916468913146654, + "grad_norm": 0.3203125, + "learning_rate": 1.1976235725785202e-05, + "loss": 1.094, + "step": 631 + }, + { + "epoch": 0.47992406264831516, + "grad_norm": 0.34765625, + "learning_rate": 1.1950903220161286e-05, + "loss": 1.1493, + "step": 632 + }, + { + "epoch": 0.48068343616516374, + "grad_norm": 0.39453125, + "learning_rate": 1.1925557692825558e-05, + "loss": 1.2334, + "step": 633 + }, + { + "epoch": 0.48144280968201236, + "grad_norm": 0.310546875, + "learning_rate": 1.1900199312952047e-05, + "loss": 1.1418, + "step": 634 + }, + { + "epoch": 0.48220218319886093, + "grad_norm": 0.359375, + "learning_rate": 1.1874828249800565e-05, + "loss": 1.144, + "step": 635 + }, + { + "epoch": 0.48296155671570956, + "grad_norm": 0.353515625, + "learning_rate": 1.1849444672715587e-05, + "loss": 1.1465, + "step": 636 + }, + { + "epoch": 0.48372093023255813, + "grad_norm": 0.404296875, + "learning_rate": 1.1824048751125101e-05, + "loss": 1.2054, + "step": 637 + }, + { + "epoch": 0.48448030374940676, + "grad_norm": 0.310546875, + "learning_rate": 1.1798640654539511e-05, + "loss": 1.1376, + "step": 638 + }, + { + "epoch": 0.48523967726625533, + "grad_norm": 0.30859375, + "learning_rate": 1.1773220552550463e-05, + "loss": 1.1574, + "step": 639 + }, + { + "epoch": 0.48599905078310396, + "grad_norm": 0.4140625, + "learning_rate": 1.1747788614829758e-05, + "loss": 1.2302, + "step": 640 + }, + { + "epoch": 0.48675842429995253, + "grad_norm": 0.3046875, + "learning_rate": 1.1722345011128183e-05, + "loss": 1.1259, + "step": 641 + }, + { + "epoch": 0.48751779781680116, + "grad_norm": 0.32421875, + "learning_rate": 1.1696889911274394e-05, + "loss": 1.1542, + "step": 642 + }, + { + "epoch": 0.48827717133364973, + "grad_norm": 0.37109375, + "learning_rate": 1.1671423485173783e-05, + "loss": 1.23, + "step": 643 + }, + { + "epoch": 0.48903654485049836, + "grad_norm": 0.44921875, + "learning_rate": 1.164594590280734e-05, + "loss": 1.2568, + "step": 644 + }, + { + "epoch": 0.4897959183673469, + "grad_norm": 0.341796875, + "learning_rate": 1.162045733423052e-05, + "loss": 1.1619, + "step": 645 + }, + { + "epoch": 0.49055529188419555, + "grad_norm": 0.40234375, + "learning_rate": 1.159495794957211e-05, + "loss": 1.2003, + "step": 646 + }, + { + "epoch": 0.4913146654010441, + "grad_norm": 0.412109375, + "learning_rate": 1.1569447919033086e-05, + "loss": 1.2507, + "step": 647 + }, + { + "epoch": 0.49207403891789275, + "grad_norm": 0.337890625, + "learning_rate": 1.1543927412885489e-05, + "loss": 1.1381, + "step": 648 + }, + { + "epoch": 0.4928334124347413, + "grad_norm": 0.3515625, + "learning_rate": 1.1518396601471273e-05, + "loss": 1.1715, + "step": 649 + }, + { + "epoch": 0.49359278595158995, + "grad_norm": 0.359375, + "learning_rate": 1.149285565520119e-05, + "loss": 1.1947, + "step": 650 + }, + { + "epoch": 0.4943521594684385, + "grad_norm": 0.3515625, + "learning_rate": 1.1467304744553618e-05, + "loss": 1.1499, + "step": 651 + }, + { + "epoch": 0.49511153298528715, + "grad_norm": 0.35546875, + "learning_rate": 1.1441744040073469e-05, + "loss": 1.1873, + "step": 652 + }, + { + "epoch": 0.4958709065021357, + "grad_norm": 0.3203125, + "learning_rate": 1.1416173712371008e-05, + "loss": 1.1398, + "step": 653 + }, + { + "epoch": 0.49663028001898435, + "grad_norm": 0.388671875, + "learning_rate": 1.1390593932120742e-05, + "loss": 1.2044, + "step": 654 + }, + { + "epoch": 0.4973896535358329, + "grad_norm": 0.349609375, + "learning_rate": 1.1365004870060266e-05, + "loss": 
1.1856, + "step": 655 + }, + { + "epoch": 0.49814902705268155, + "grad_norm": 0.345703125, + "learning_rate": 1.1339406696989128e-05, + "loss": 1.1601, + "step": 656 + }, + { + "epoch": 0.4989084005695301, + "grad_norm": 0.408203125, + "learning_rate": 1.1313799583767693e-05, + "loss": 1.2261, + "step": 657 + }, + { + "epoch": 0.49966777408637875, + "grad_norm": 0.3515625, + "learning_rate": 1.1288183701315996e-05, + "loss": 1.1504, + "step": 658 + }, + { + "epoch": 0.5004271476032274, + "grad_norm": 0.361328125, + "learning_rate": 1.1262559220612602e-05, + "loss": 1.1967, + "step": 659 + }, + { + "epoch": 0.5011865211200759, + "grad_norm": 0.359375, + "learning_rate": 1.123692631269348e-05, + "loss": 1.1724, + "step": 660 + }, + { + "epoch": 0.5019458946369245, + "grad_norm": 0.326171875, + "learning_rate": 1.1211285148650826e-05, + "loss": 1.158, + "step": 661 + }, + { + "epoch": 0.5027052681537731, + "grad_norm": 0.36328125, + "learning_rate": 1.1185635899631963e-05, + "loss": 1.1994, + "step": 662 + }, + { + "epoch": 0.5034646416706218, + "grad_norm": 0.3515625, + "learning_rate": 1.1159978736838169e-05, + "loss": 1.1844, + "step": 663 + }, + { + "epoch": 0.5042240151874703, + "grad_norm": 0.322265625, + "learning_rate": 1.1134313831523547e-05, + "loss": 1.151, + "step": 664 + }, + { + "epoch": 0.5049833887043189, + "grad_norm": 0.390625, + "learning_rate": 1.1108641354993876e-05, + "loss": 1.1455, + "step": 665 + }, + { + "epoch": 0.5057427622211675, + "grad_norm": 0.373046875, + "learning_rate": 1.1082961478605476e-05, + "loss": 1.1656, + "step": 666 + }, + { + "epoch": 0.5065021357380162, + "grad_norm": 0.328125, + "learning_rate": 1.1057274373764056e-05, + "loss": 1.141, + "step": 667 + }, + { + "epoch": 0.5072615092548647, + "grad_norm": 0.302734375, + "learning_rate": 1.103158021192357e-05, + "loss": 1.136, + "step": 668 + }, + { + "epoch": 0.5080208827717133, + "grad_norm": 0.361328125, + "learning_rate": 1.1005879164585083e-05, + "loss": 1.1902, + "step": 669 + }, + { + "epoch": 0.5087802562885619, + "grad_norm": 0.345703125, + "learning_rate": 1.098017140329561e-05, + "loss": 1.1535, + "step": 670 + }, + { + "epoch": 0.5095396298054106, + "grad_norm": 0.390625, + "learning_rate": 1.0954457099646981e-05, + "loss": 1.1909, + "step": 671 + }, + { + "epoch": 0.5102990033222591, + "grad_norm": 0.40625, + "learning_rate": 1.0928736425274702e-05, + "loss": 1.1445, + "step": 672 + }, + { + "epoch": 0.5110583768391077, + "grad_norm": 0.326171875, + "learning_rate": 1.0903009551856795e-05, + "loss": 1.1776, + "step": 673 + }, + { + "epoch": 0.5118177503559563, + "grad_norm": 0.330078125, + "learning_rate": 1.0877276651112662e-05, + "loss": 1.1799, + "step": 674 + }, + { + "epoch": 0.512577123872805, + "grad_norm": 0.451171875, + "learning_rate": 1.0851537894801935e-05, + "loss": 1.2681, + "step": 675 + }, + { + "epoch": 0.5133364973896536, + "grad_norm": 0.392578125, + "learning_rate": 1.0825793454723325e-05, + "loss": 1.1858, + "step": 676 + }, + { + "epoch": 0.5140958709065021, + "grad_norm": 0.388671875, + "learning_rate": 1.0800043502713486e-05, + "loss": 1.2268, + "step": 677 + }, + { + "epoch": 0.5148552444233507, + "grad_norm": 0.375, + "learning_rate": 1.0774288210645862e-05, + "loss": 1.1628, + "step": 678 + }, + { + "epoch": 0.5156146179401994, + "grad_norm": 0.400390625, + "learning_rate": 1.0748527750429545e-05, + "loss": 1.2508, + "step": 679 + }, + { + "epoch": 0.516373991457048, + "grad_norm": 0.373046875, + "learning_rate": 1.0722762294008107e-05, + "loss": 1.1958, + 
"step": 680 + }, + { + "epoch": 0.5171333649738965, + "grad_norm": 0.326171875, + "learning_rate": 1.069699201335849e-05, + "loss": 1.13, + "step": 681 + }, + { + "epoch": 0.5178927384907451, + "grad_norm": 0.365234375, + "learning_rate": 1.0671217080489816e-05, + "loss": 1.2132, + "step": 682 + }, + { + "epoch": 0.5186521120075938, + "grad_norm": 0.408203125, + "learning_rate": 1.0645437667442273e-05, + "loss": 1.2433, + "step": 683 + }, + { + "epoch": 0.5194114855244424, + "grad_norm": 0.328125, + "learning_rate": 1.0619653946285948e-05, + "loss": 1.1013, + "step": 684 + }, + { + "epoch": 0.5201708590412909, + "grad_norm": 0.365234375, + "learning_rate": 1.0593866089119683e-05, + "loss": 1.171, + "step": 685 + }, + { + "epoch": 0.5209302325581395, + "grad_norm": 0.375, + "learning_rate": 1.0568074268069928e-05, + "loss": 1.1771, + "step": 686 + }, + { + "epoch": 0.5216896060749882, + "grad_norm": 0.396484375, + "learning_rate": 1.0542278655289588e-05, + "loss": 1.1808, + "step": 687 + }, + { + "epoch": 0.5224489795918368, + "grad_norm": 0.357421875, + "learning_rate": 1.0516479422956882e-05, + "loss": 1.1398, + "step": 688 + }, + { + "epoch": 0.5232083531086853, + "grad_norm": 0.38671875, + "learning_rate": 1.0490676743274181e-05, + "loss": 1.1954, + "step": 689 + }, + { + "epoch": 0.5239677266255339, + "grad_norm": 0.37890625, + "learning_rate": 1.0464870788466875e-05, + "loss": 1.1792, + "step": 690 + }, + { + "epoch": 0.5247271001423826, + "grad_norm": 0.3359375, + "learning_rate": 1.0439061730782207e-05, + "loss": 1.1585, + "step": 691 + }, + { + "epoch": 0.5254864736592312, + "grad_norm": 0.3203125, + "learning_rate": 1.0413249742488132e-05, + "loss": 1.1658, + "step": 692 + }, + { + "epoch": 0.5262458471760797, + "grad_norm": 0.337890625, + "learning_rate": 1.0387434995872174e-05, + "loss": 1.1443, + "step": 693 + }, + { + "epoch": 0.5270052206929283, + "grad_norm": 0.376953125, + "learning_rate": 1.0361617663240253e-05, + "loss": 1.176, + "step": 694 + }, + { + "epoch": 0.527764594209777, + "grad_norm": 0.345703125, + "learning_rate": 1.0335797916915568e-05, + "loss": 1.2121, + "step": 695 + }, + { + "epoch": 0.5285239677266256, + "grad_norm": 0.38671875, + "learning_rate": 1.0309975929237408e-05, + "loss": 1.209, + "step": 696 + }, + { + "epoch": 0.5292833412434741, + "grad_norm": 0.3203125, + "learning_rate": 1.0284151872560042e-05, + "loss": 1.1629, + "step": 697 + }, + { + "epoch": 0.5300427147603227, + "grad_norm": 0.376953125, + "learning_rate": 1.0258325919251537e-05, + "loss": 1.1606, + "step": 698 + }, + { + "epoch": 0.5308020882771713, + "grad_norm": 0.30078125, + "learning_rate": 1.0232498241692625e-05, + "loss": 1.1405, + "step": 699 + }, + { + "epoch": 0.53156146179402, + "grad_norm": 0.41796875, + "learning_rate": 1.0206669012275546e-05, + "loss": 1.1829, + "step": 700 + }, + { + "epoch": 0.5323208353108685, + "grad_norm": 0.33984375, + "learning_rate": 1.018083840340289e-05, + "loss": 1.1182, + "step": 701 + }, + { + "epoch": 0.5330802088277171, + "grad_norm": 0.380859375, + "learning_rate": 1.0155006587486468e-05, + "loss": 1.2416, + "step": 702 + }, + { + "epoch": 0.5338395823445657, + "grad_norm": 0.349609375, + "learning_rate": 1.0129173736946143e-05, + "loss": 1.1733, + "step": 703 + }, + { + "epoch": 0.5345989558614144, + "grad_norm": 0.333984375, + "learning_rate": 1.0103340024208674e-05, + "loss": 1.1117, + "step": 704 + }, + { + "epoch": 0.5353583293782629, + "grad_norm": 0.353515625, + "learning_rate": 1.007750562170659e-05, + "loss": 1.2096, + "step": 705 
+ }, + { + "epoch": 0.5361177028951115, + "grad_norm": 0.3515625, + "learning_rate": 1.0051670701877011e-05, + "loss": 1.1615, + "step": 706 + }, + { + "epoch": 0.5368770764119601, + "grad_norm": 0.322265625, + "learning_rate": 1.0025835437160523e-05, + "loss": 1.181, + "step": 707 + }, + { + "epoch": 0.5376364499288088, + "grad_norm": 0.40625, + "learning_rate": 1e-05, + "loss": 1.2599, + "step": 708 + }, + { + "epoch": 0.5383958234456573, + "grad_norm": 0.306640625, + "learning_rate": 9.97416456283948e-06, + "loss": 1.1557, + "step": 709 + }, + { + "epoch": 0.5391551969625059, + "grad_norm": 0.333984375, + "learning_rate": 9.948329298122989e-06, + "loss": 1.1486, + "step": 710 + }, + { + "epoch": 0.5399145704793545, + "grad_norm": 0.32421875, + "learning_rate": 9.922494378293414e-06, + "loss": 1.146, + "step": 711 + }, + { + "epoch": 0.5406739439962032, + "grad_norm": 0.375, + "learning_rate": 9.89665997579133e-06, + "loss": 1.1826, + "step": 712 + }, + { + "epoch": 0.5414333175130517, + "grad_norm": 0.353515625, + "learning_rate": 9.870826263053859e-06, + "loss": 1.1607, + "step": 713 + }, + { + "epoch": 0.5421926910299003, + "grad_norm": 0.318359375, + "learning_rate": 9.844993412513533e-06, + "loss": 1.1287, + "step": 714 + }, + { + "epoch": 0.5429520645467489, + "grad_norm": 0.466796875, + "learning_rate": 9.819161596597112e-06, + "loss": 1.3019, + "step": 715 + }, + { + "epoch": 0.5437114380635976, + "grad_norm": 0.330078125, + "learning_rate": 9.79333098772446e-06, + "loss": 1.1456, + "step": 716 + }, + { + "epoch": 0.5444708115804461, + "grad_norm": 0.333984375, + "learning_rate": 9.767501758307376e-06, + "loss": 1.1532, + "step": 717 + }, + { + "epoch": 0.5452301850972947, + "grad_norm": 0.275390625, + "learning_rate": 9.741674080748465e-06, + "loss": 1.1244, + "step": 718 + }, + { + "epoch": 0.5459895586141433, + "grad_norm": 0.333984375, + "learning_rate": 9.715848127439958e-06, + "loss": 1.1617, + "step": 719 + }, + { + "epoch": 0.546748932130992, + "grad_norm": 0.369140625, + "learning_rate": 9.690024070762597e-06, + "loss": 1.2031, + "step": 720 + }, + { + "epoch": 0.5475083056478405, + "grad_norm": 0.35546875, + "learning_rate": 9.664202083084437e-06, + "loss": 1.1701, + "step": 721 + }, + { + "epoch": 0.5482676791646891, + "grad_norm": 0.341796875, + "learning_rate": 9.638382336759749e-06, + "loss": 1.1756, + "step": 722 + }, + { + "epoch": 0.5490270526815377, + "grad_norm": 0.34375, + "learning_rate": 9.612565004127828e-06, + "loss": 1.192, + "step": 723 + }, + { + "epoch": 0.5497864261983864, + "grad_norm": 0.341796875, + "learning_rate": 9.586750257511868e-06, + "loss": 1.1673, + "step": 724 + }, + { + "epoch": 0.550545799715235, + "grad_norm": 0.3359375, + "learning_rate": 9.560938269217798e-06, + "loss": 1.1835, + "step": 725 + }, + { + "epoch": 0.5513051732320835, + "grad_norm": 0.34375, + "learning_rate": 9.53512921153313e-06, + "loss": 1.2177, + "step": 726 + }, + { + "epoch": 0.5520645467489321, + "grad_norm": 0.33984375, + "learning_rate": 9.50932325672582e-06, + "loss": 1.1675, + "step": 727 + }, + { + "epoch": 0.5528239202657808, + "grad_norm": 0.275390625, + "learning_rate": 9.483520577043121e-06, + "loss": 1.104, + "step": 728 + }, + { + "epoch": 0.5535832937826294, + "grad_norm": 0.31640625, + "learning_rate": 9.457721344710412e-06, + "loss": 1.126, + "step": 729 + }, + { + "epoch": 0.5543426672994779, + "grad_norm": 0.392578125, + "learning_rate": 9.431925731930079e-06, + "loss": 1.1852, + "step": 730 + }, + { + "epoch": 0.5551020408163265, + "grad_norm": 
0.330078125, + "learning_rate": 9.406133910880319e-06, + "loss": 1.1576, + "step": 731 + }, + { + "epoch": 0.5558614143331752, + "grad_norm": 0.291015625, + "learning_rate": 9.380346053714055e-06, + "loss": 1.0863, + "step": 732 + }, + { + "epoch": 0.5566207878500238, + "grad_norm": 0.318359375, + "learning_rate": 9.354562332557728e-06, + "loss": 1.1338, + "step": 733 + }, + { + "epoch": 0.5573801613668723, + "grad_norm": 0.37890625, + "learning_rate": 9.328782919510186e-06, + "loss": 1.2238, + "step": 734 + }, + { + "epoch": 0.5581395348837209, + "grad_norm": 0.330078125, + "learning_rate": 9.303007986641515e-06, + "loss": 1.1432, + "step": 735 + }, + { + "epoch": 0.5588989084005695, + "grad_norm": 0.302734375, + "learning_rate": 9.277237705991895e-06, + "loss": 1.15, + "step": 736 + }, + { + "epoch": 0.5596582819174182, + "grad_norm": 0.283203125, + "learning_rate": 9.251472249570458e-06, + "loss": 1.1075, + "step": 737 + }, + { + "epoch": 0.5604176554342667, + "grad_norm": 0.33203125, + "learning_rate": 9.225711789354138e-06, + "loss": 1.1256, + "step": 738 + }, + { + "epoch": 0.5611770289511153, + "grad_norm": 0.357421875, + "learning_rate": 9.199956497286517e-06, + "loss": 1.1923, + "step": 739 + }, + { + "epoch": 0.561936402467964, + "grad_norm": 0.287109375, + "learning_rate": 9.174206545276678e-06, + "loss": 1.1069, + "step": 740 + }, + { + "epoch": 0.5626957759848126, + "grad_norm": 0.375, + "learning_rate": 9.148462105198068e-06, + "loss": 1.2118, + "step": 741 + }, + { + "epoch": 0.5634551495016611, + "grad_norm": 0.392578125, + "learning_rate": 9.12272334888734e-06, + "loss": 1.2203, + "step": 742 + }, + { + "epoch": 0.5642145230185097, + "grad_norm": 0.375, + "learning_rate": 9.096990448143203e-06, + "loss": 1.1714, + "step": 743 + }, + { + "epoch": 0.5649738965353583, + "grad_norm": 0.32421875, + "learning_rate": 9.0712635747253e-06, + "loss": 1.1562, + "step": 744 + }, + { + "epoch": 0.565733270052207, + "grad_norm": 0.3125, + "learning_rate": 9.045542900353022e-06, + "loss": 1.138, + "step": 745 + }, + { + "epoch": 0.5664926435690555, + "grad_norm": 0.376953125, + "learning_rate": 9.019828596704394e-06, + "loss": 1.2036, + "step": 746 + }, + { + "epoch": 0.5672520170859041, + "grad_norm": 0.3984375, + "learning_rate": 8.99412083541492e-06, + "loss": 1.2011, + "step": 747 + }, + { + "epoch": 0.5680113906027527, + "grad_norm": 0.3828125, + "learning_rate": 8.968419788076431e-06, + "loss": 1.2146, + "step": 748 + }, + { + "epoch": 0.5687707641196014, + "grad_norm": 0.3125, + "learning_rate": 8.942725626235949e-06, + "loss": 1.1499, + "step": 749 + }, + { + "epoch": 0.5695301376364499, + "grad_norm": 0.357421875, + "learning_rate": 8.917038521394526e-06, + "loss": 1.1884, + "step": 750 + }, + { + "epoch": 0.5702895111532985, + "grad_norm": 0.3359375, + "learning_rate": 8.891358645006126e-06, + "loss": 1.1455, + "step": 751 + }, + { + "epoch": 0.5710488846701471, + "grad_norm": 0.2578125, + "learning_rate": 8.865686168476458e-06, + "loss": 1.1044, + "step": 752 + }, + { + "epoch": 0.5718082581869958, + "grad_norm": 0.3671875, + "learning_rate": 8.840021263161831e-06, + "loss": 1.1989, + "step": 753 + }, + { + "epoch": 0.5725676317038443, + "grad_norm": 0.328125, + "learning_rate": 8.81436410036804e-06, + "loss": 1.1432, + "step": 754 + }, + { + "epoch": 0.5733270052206929, + "grad_norm": 0.30078125, + "learning_rate": 8.788714851349177e-06, + "loss": 1.1265, + "step": 755 + }, + { + "epoch": 0.5740863787375415, + "grad_norm": 0.326171875, + "learning_rate": 
8.763073687306523e-06, + "loss": 1.1427, + "step": 756 + }, + { + "epoch": 0.5748457522543902, + "grad_norm": 0.345703125, + "learning_rate": 8.737440779387398e-06, + "loss": 1.1363, + "step": 757 + }, + { + "epoch": 0.5756051257712387, + "grad_norm": 0.326171875, + "learning_rate": 8.711816298684011e-06, + "loss": 1.1628, + "step": 758 + }, + { + "epoch": 0.5763644992880873, + "grad_norm": 0.4140625, + "learning_rate": 8.686200416232314e-06, + "loss": 1.2075, + "step": 759 + }, + { + "epoch": 0.5771238728049359, + "grad_norm": 0.3203125, + "learning_rate": 8.660593303010876e-06, + "loss": 1.1384, + "step": 760 + }, + { + "epoch": 0.5778832463217846, + "grad_norm": 0.3046875, + "learning_rate": 8.634995129939737e-06, + "loss": 1.1354, + "step": 761 + }, + { + "epoch": 0.5786426198386331, + "grad_norm": 0.390625, + "learning_rate": 8.609406067879258e-06, + "loss": 1.1626, + "step": 762 + }, + { + "epoch": 0.5794019933554817, + "grad_norm": 0.34765625, + "learning_rate": 8.583826287628996e-06, + "loss": 1.2072, + "step": 763 + }, + { + "epoch": 0.5801613668723303, + "grad_norm": 0.328125, + "learning_rate": 8.558255959926533e-06, + "loss": 1.1492, + "step": 764 + }, + { + "epoch": 0.580920740389179, + "grad_norm": 0.40234375, + "learning_rate": 8.532695255446384e-06, + "loss": 1.1948, + "step": 765 + }, + { + "epoch": 0.5816801139060275, + "grad_norm": 0.345703125, + "learning_rate": 8.507144344798814e-06, + "loss": 1.1786, + "step": 766 + }, + { + "epoch": 0.5824394874228761, + "grad_norm": 0.34765625, + "learning_rate": 8.481603398528727e-06, + "loss": 1.172, + "step": 767 + }, + { + "epoch": 0.5831988609397247, + "grad_norm": 0.322265625, + "learning_rate": 8.456072587114516e-06, + "loss": 1.1431, + "step": 768 + }, + { + "epoch": 0.5839582344565734, + "grad_norm": 0.3671875, + "learning_rate": 8.430552080966918e-06, + "loss": 1.2079, + "step": 769 + }, + { + "epoch": 0.584717607973422, + "grad_norm": 0.349609375, + "learning_rate": 8.405042050427891e-06, + "loss": 1.1885, + "step": 770 + }, + { + "epoch": 0.5854769814902705, + "grad_norm": 0.390625, + "learning_rate": 8.37954266576948e-06, + "loss": 1.1858, + "step": 771 + }, + { + "epoch": 0.5862363550071191, + "grad_norm": 0.380859375, + "learning_rate": 8.35405409719266e-06, + "loss": 1.2242, + "step": 772 + }, + { + "epoch": 0.5869957285239678, + "grad_norm": 0.369140625, + "learning_rate": 8.328576514826222e-06, + "loss": 1.1984, + "step": 773 + }, + { + "epoch": 0.5877551020408164, + "grad_norm": 0.29296875, + "learning_rate": 8.30311008872561e-06, + "loss": 1.1178, + "step": 774 + }, + { + "epoch": 0.5885144755576649, + "grad_norm": 0.2890625, + "learning_rate": 8.277654988871819e-06, + "loss": 1.1126, + "step": 775 + }, + { + "epoch": 0.5892738490745135, + "grad_norm": 0.337890625, + "learning_rate": 8.252211385170242e-06, + "loss": 1.1394, + "step": 776 + }, + { + "epoch": 0.5900332225913621, + "grad_norm": 0.341796875, + "learning_rate": 8.226779447449538e-06, + "loss": 1.1999, + "step": 777 + }, + { + "epoch": 0.5907925961082108, + "grad_norm": 0.328125, + "learning_rate": 8.201359345460496e-06, + "loss": 1.1602, + "step": 778 + }, + { + "epoch": 0.5915519696250593, + "grad_norm": 0.38671875, + "learning_rate": 8.175951248874902e-06, + "loss": 1.1864, + "step": 779 + }, + { + "epoch": 0.5923113431419079, + "grad_norm": 0.302734375, + "learning_rate": 8.150555327284417e-06, + "loss": 1.1053, + "step": 780 + }, + { + "epoch": 0.5930707166587565, + "grad_norm": 0.275390625, + "learning_rate": 8.125171750199436e-06, + "loss": 
1.1004, + "step": 781 + }, + { + "epoch": 0.5938300901756052, + "grad_norm": 0.36328125, + "learning_rate": 8.099800687047958e-06, + "loss": 1.1189, + "step": 782 + }, + { + "epoch": 0.5945894636924537, + "grad_norm": 0.400390625, + "learning_rate": 8.074442307174445e-06, + "loss": 1.2653, + "step": 783 + }, + { + "epoch": 0.5953488372093023, + "grad_norm": 0.298828125, + "learning_rate": 8.04909677983872e-06, + "loss": 1.1253, + "step": 784 + }, + { + "epoch": 0.5961082107261509, + "grad_norm": 0.373046875, + "learning_rate": 8.023764274214802e-06, + "loss": 1.1351, + "step": 785 + }, + { + "epoch": 0.5968675842429996, + "grad_norm": 0.33984375, + "learning_rate": 7.998444959389803e-06, + "loss": 1.145, + "step": 786 + }, + { + "epoch": 0.5976269577598481, + "grad_norm": 0.302734375, + "learning_rate": 7.973139004362794e-06, + "loss": 1.1679, + "step": 787 + }, + { + "epoch": 0.5983863312766967, + "grad_norm": 0.3203125, + "learning_rate": 7.947846578043658e-06, + "loss": 1.1475, + "step": 788 + }, + { + "epoch": 0.5991457047935453, + "grad_norm": 0.34765625, + "learning_rate": 7.922567849251995e-06, + "loss": 1.1941, + "step": 789 + }, + { + "epoch": 0.599905078310394, + "grad_norm": 0.365234375, + "learning_rate": 7.897302986715967e-06, + "loss": 1.1754, + "step": 790 + }, + { + "epoch": 0.6006644518272425, + "grad_norm": 0.37890625, + "learning_rate": 7.872052159071186e-06, + "loss": 1.1762, + "step": 791 + }, + { + "epoch": 0.6014238253440911, + "grad_norm": 0.302734375, + "learning_rate": 7.846815534859592e-06, + "loss": 1.1361, + "step": 792 + }, + { + "epoch": 0.6021831988609397, + "grad_norm": 0.41015625, + "learning_rate": 7.821593282528301e-06, + "loss": 1.2727, + "step": 793 + }, + { + "epoch": 0.6029425723777884, + "grad_norm": 0.291015625, + "learning_rate": 7.796385570428527e-06, + "loss": 1.1568, + "step": 794 + }, + { + "epoch": 0.6037019458946369, + "grad_norm": 0.396484375, + "learning_rate": 7.771192566814412e-06, + "loss": 1.2494, + "step": 795 + }, + { + "epoch": 0.6044613194114855, + "grad_norm": 0.3828125, + "learning_rate": 7.746014439841941e-06, + "loss": 1.223, + "step": 796 + }, + { + "epoch": 0.6052206929283341, + "grad_norm": 0.337890625, + "learning_rate": 7.720851357567778e-06, + "loss": 1.1366, + "step": 797 + }, + { + "epoch": 0.6059800664451828, + "grad_norm": 0.3203125, + "learning_rate": 7.69570348794819e-06, + "loss": 1.1451, + "step": 798 + }, + { + "epoch": 0.6067394399620313, + "grad_norm": 0.29296875, + "learning_rate": 7.670570998837889e-06, + "loss": 1.1189, + "step": 799 + }, + { + "epoch": 0.6074988134788799, + "grad_norm": 0.25390625, + "learning_rate": 7.645454057988942e-06, + "loss": 1.1005, + "step": 800 + }, + { + "epoch": 0.6082581869957285, + "grad_norm": 0.31640625, + "learning_rate": 7.6203528330496245e-06, + "loss": 1.1741, + "step": 801 + }, + { + "epoch": 0.6090175605125772, + "grad_norm": 0.2734375, + "learning_rate": 7.595267491563311e-06, + "loss": 1.1124, + "step": 802 + }, + { + "epoch": 0.6097769340294257, + "grad_norm": 0.33984375, + "learning_rate": 7.570198200967363e-06, + "loss": 1.1459, + "step": 803 + }, + { + "epoch": 0.6105363075462743, + "grad_norm": 0.36328125, + "learning_rate": 7.545145128592009e-06, + "loss": 1.1668, + "step": 804 + }, + { + "epoch": 0.6112956810631229, + "grad_norm": 0.29296875, + "learning_rate": 7.520108441659223e-06, + "loss": 1.1384, + "step": 805 + }, + { + "epoch": 0.6120550545799716, + "grad_norm": 0.314453125, + "learning_rate": 7.495088307281619e-06, + "loss": 1.1462, + "step": 806 + }, 
+ { + "epoch": 0.6128144280968201, + "grad_norm": 0.330078125, + "learning_rate": 7.470084892461305e-06, + "loss": 1.1645, + "step": 807 + }, + { + "epoch": 0.6135738016136687, + "grad_norm": 0.3359375, + "learning_rate": 7.445098364088815e-06, + "loss": 1.1709, + "step": 808 + }, + { + "epoch": 0.6143331751305173, + "grad_norm": 0.298828125, + "learning_rate": 7.420128888941958e-06, + "loss": 1.1914, + "step": 809 + }, + { + "epoch": 0.615092548647366, + "grad_norm": 0.4140625, + "learning_rate": 7.395176633684726e-06, + "loss": 1.2529, + "step": 810 + }, + { + "epoch": 0.6158519221642145, + "grad_norm": 0.30859375, + "learning_rate": 7.370241764866169e-06, + "loss": 1.1245, + "step": 811 + }, + { + "epoch": 0.6166112956810631, + "grad_norm": 0.359375, + "learning_rate": 7.34532444891928e-06, + "loss": 1.1952, + "step": 812 + }, + { + "epoch": 0.6173706691979117, + "grad_norm": 0.296875, + "learning_rate": 7.3204248521599e-06, + "loss": 1.1247, + "step": 813 + }, + { + "epoch": 0.6181300427147604, + "grad_norm": 0.33203125, + "learning_rate": 7.295543140785604e-06, + "loss": 1.1417, + "step": 814 + }, + { + "epoch": 0.6188894162316089, + "grad_norm": 0.27734375, + "learning_rate": 7.27067948087458e-06, + "loss": 1.1264, + "step": 815 + }, + { + "epoch": 0.6196487897484575, + "grad_norm": 0.322265625, + "learning_rate": 7.245834038384523e-06, + "loss": 1.176, + "step": 816 + }, + { + "epoch": 0.6204081632653061, + "grad_norm": 0.314453125, + "learning_rate": 7.221006979151546e-06, + "loss": 1.1171, + "step": 817 + }, + { + "epoch": 0.6211675367821547, + "grad_norm": 0.3828125, + "learning_rate": 7.196198468889047e-06, + "loss": 1.1906, + "step": 818 + }, + { + "epoch": 0.6219269102990034, + "grad_norm": 0.3046875, + "learning_rate": 7.171408673186619e-06, + "loss": 1.1394, + "step": 819 + }, + { + "epoch": 0.6226862838158519, + "grad_norm": 0.3125, + "learning_rate": 7.14663775750895e-06, + "loss": 1.1334, + "step": 820 + }, + { + "epoch": 0.6234456573327005, + "grad_norm": 0.3359375, + "learning_rate": 7.1218858871946885e-06, + "loss": 1.149, + "step": 821 + }, + { + "epoch": 0.6242050308495491, + "grad_norm": 0.3359375, + "learning_rate": 7.097153227455379e-06, + "loss": 1.1593, + "step": 822 + }, + { + "epoch": 0.6249644043663978, + "grad_norm": 0.3984375, + "learning_rate": 7.072439943374331e-06, + "loss": 1.1399, + "step": 823 + }, + { + "epoch": 0.6257237778832463, + "grad_norm": 0.376953125, + "learning_rate": 7.0477461999055365e-06, + "loss": 1.2022, + "step": 824 + }, + { + "epoch": 0.6264831514000949, + "grad_norm": 0.337890625, + "learning_rate": 7.023072161872551e-06, + "loss": 1.1374, + "step": 825 + }, + { + "epoch": 0.6272425249169435, + "grad_norm": 0.2734375, + "learning_rate": 6.998417993967403e-06, + "loss": 1.1267, + "step": 826 + }, + { + "epoch": 0.6280018984337922, + "grad_norm": 0.330078125, + "learning_rate": 6.973783860749499e-06, + "loss": 1.179, + "step": 827 + }, + { + "epoch": 0.6287612719506407, + "grad_norm": 0.349609375, + "learning_rate": 6.949169926644513e-06, + "loss": 1.1685, + "step": 828 + }, + { + "epoch": 0.6295206454674893, + "grad_norm": 0.365234375, + "learning_rate": 6.9245763559432996e-06, + "loss": 1.2012, + "step": 829 + }, + { + "epoch": 0.6302800189843379, + "grad_norm": 0.353515625, + "learning_rate": 6.9000033128008e-06, + "loss": 1.187, + "step": 830 + }, + { + "epoch": 0.6310393925011866, + "grad_norm": 0.373046875, + "learning_rate": 6.875450961234924e-06, + "loss": 1.1949, + "step": 831 + }, + { + "epoch": 0.6317987660180351, + 
"grad_norm": 0.3515625, + "learning_rate": 6.8509194651254825e-06, + "loss": 1.1995, + "step": 832 + }, + { + "epoch": 0.6325581395348837, + "grad_norm": 0.376953125, + "learning_rate": 6.826408988213083e-06, + "loss": 1.1705, + "step": 833 + }, + { + "epoch": 0.6333175130517323, + "grad_norm": 0.326171875, + "learning_rate": 6.801919694098034e-06, + "loss": 1.1469, + "step": 834 + }, + { + "epoch": 0.634076886568581, + "grad_norm": 0.357421875, + "learning_rate": 6.777451746239249e-06, + "loss": 1.1363, + "step": 835 + }, + { + "epoch": 0.6348362600854295, + "grad_norm": 0.33984375, + "learning_rate": 6.7530053079531664e-06, + "loss": 1.1968, + "step": 836 + }, + { + "epoch": 0.6355956336022781, + "grad_norm": 0.376953125, + "learning_rate": 6.7285805424126585e-06, + "loss": 1.2189, + "step": 837 + }, + { + "epoch": 0.6363550071191267, + "grad_norm": 0.298828125, + "learning_rate": 6.70417761264593e-06, + "loss": 1.1232, + "step": 838 + }, + { + "epoch": 0.6371143806359754, + "grad_norm": 0.33984375, + "learning_rate": 6.679796681535451e-06, + "loss": 1.1898, + "step": 839 + }, + { + "epoch": 0.6378737541528239, + "grad_norm": 0.296875, + "learning_rate": 6.655437911816838e-06, + "loss": 1.1666, + "step": 840 + }, + { + "epoch": 0.6386331276696725, + "grad_norm": 0.296875, + "learning_rate": 6.631101466077801e-06, + "loss": 1.146, + "step": 841 + }, + { + "epoch": 0.6393925011865211, + "grad_norm": 0.419921875, + "learning_rate": 6.60678750675704e-06, + "loss": 1.1723, + "step": 842 + }, + { + "epoch": 0.6401518747033698, + "grad_norm": 0.34375, + "learning_rate": 6.582496196143167e-06, + "loss": 1.1488, + "step": 843 + }, + { + "epoch": 0.6409112482202183, + "grad_norm": 0.3203125, + "learning_rate": 6.558227696373617e-06, + "loss": 1.1899, + "step": 844 + }, + { + "epoch": 0.6416706217370669, + "grad_norm": 0.3515625, + "learning_rate": 6.533982169433568e-06, + "loss": 1.1478, + "step": 845 + }, + { + "epoch": 0.6424299952539155, + "grad_norm": 0.333984375, + "learning_rate": 6.509759777154864e-06, + "loss": 1.1353, + "step": 846 + }, + { + "epoch": 0.6431893687707642, + "grad_norm": 0.28515625, + "learning_rate": 6.485560681214933e-06, + "loss": 1.1481, + "step": 847 + }, + { + "epoch": 0.6439487422876127, + "grad_norm": 0.298828125, + "learning_rate": 6.461385043135704e-06, + "loss": 1.1222, + "step": 848 + }, + { + "epoch": 0.6447081158044613, + "grad_norm": 0.328125, + "learning_rate": 6.437233024282538e-06, + "loss": 1.1029, + "step": 849 + }, + { + "epoch": 0.6454674893213099, + "grad_norm": 0.376953125, + "learning_rate": 6.413104785863128e-06, + "loss": 1.192, + "step": 850 + }, + { + "epoch": 0.6462268628381586, + "grad_norm": 0.36328125, + "learning_rate": 6.389000488926459e-06, + "loss": 1.2227, + "step": 851 + }, + { + "epoch": 0.6469862363550071, + "grad_norm": 0.279296875, + "learning_rate": 6.364920294361701e-06, + "loss": 1.0898, + "step": 852 + }, + { + "epoch": 0.6477456098718557, + "grad_norm": 0.375, + "learning_rate": 6.3408643628971585e-06, + "loss": 1.1882, + "step": 853 + }, + { + "epoch": 0.6485049833887043, + "grad_norm": 0.33984375, + "learning_rate": 6.316832855099173e-06, + "loss": 1.1572, + "step": 854 + }, + { + "epoch": 0.649264356905553, + "grad_norm": 0.296875, + "learning_rate": 6.292825931371075e-06, + "loss": 1.1056, + "step": 855 + }, + { + "epoch": 0.6500237304224015, + "grad_norm": 0.298828125, + "learning_rate": 6.2688437519521e-06, + "loss": 1.1232, + "step": 856 + }, + { + "epoch": 0.6507831039392501, + "grad_norm": 0.373046875, + 
"learning_rate": 6.244886476916325e-06, + "loss": 1.1479, + "step": 857 + }, + { + "epoch": 0.6515424774560987, + "grad_norm": 0.294921875, + "learning_rate": 6.220954266171597e-06, + "loss": 1.1355, + "step": 858 + }, + { + "epoch": 0.6523018509729474, + "grad_norm": 0.357421875, + "learning_rate": 6.197047279458459e-06, + "loss": 1.185, + "step": 859 + }, + { + "epoch": 0.6530612244897959, + "grad_norm": 0.341796875, + "learning_rate": 6.173165676349103e-06, + "loss": 1.141, + "step": 860 + }, + { + "epoch": 0.6538205980066445, + "grad_norm": 0.314453125, + "learning_rate": 6.149309616246285e-06, + "loss": 1.129, + "step": 861 + }, + { + "epoch": 0.6545799715234931, + "grad_norm": 0.34375, + "learning_rate": 6.125479258382268e-06, + "loss": 1.1517, + "step": 862 + }, + { + "epoch": 0.6553393450403417, + "grad_norm": 0.326171875, + "learning_rate": 6.101674761817769e-06, + "loss": 1.0984, + "step": 863 + }, + { + "epoch": 0.6560987185571903, + "grad_norm": 0.341796875, + "learning_rate": 6.077896285440874e-06, + "loss": 1.175, + "step": 864 + }, + { + "epoch": 0.6568580920740389, + "grad_norm": 0.34375, + "learning_rate": 6.054143987966001e-06, + "loss": 1.1625, + "step": 865 + }, + { + "epoch": 0.6576174655908875, + "grad_norm": 0.357421875, + "learning_rate": 6.030418027932835e-06, + "loss": 1.2025, + "step": 866 + }, + { + "epoch": 0.6583768391077361, + "grad_norm": 0.3671875, + "learning_rate": 6.006718563705258e-06, + "loss": 1.1843, + "step": 867 + }, + { + "epoch": 0.6591362126245848, + "grad_norm": 0.3671875, + "learning_rate": 5.983045753470308e-06, + "loss": 1.1775, + "step": 868 + }, + { + "epoch": 0.6598955861414333, + "grad_norm": 0.3984375, + "learning_rate": 5.959399755237103e-06, + "loss": 1.1727, + "step": 869 + }, + { + "epoch": 0.6606549596582819, + "grad_norm": 0.353515625, + "learning_rate": 5.935780726835811e-06, + "loss": 1.1502, + "step": 870 + }, + { + "epoch": 0.6614143331751305, + "grad_norm": 0.3515625, + "learning_rate": 5.91218882591658e-06, + "loss": 1.1346, + "step": 871 + }, + { + "epoch": 0.6621737066919792, + "grad_norm": 0.41796875, + "learning_rate": 5.888624209948495e-06, + "loss": 1.1899, + "step": 872 + }, + { + "epoch": 0.6629330802088277, + "grad_norm": 0.345703125, + "learning_rate": 5.865087036218504e-06, + "loss": 1.1826, + "step": 873 + }, + { + "epoch": 0.6636924537256763, + "grad_norm": 0.337890625, + "learning_rate": 5.841577461830408e-06, + "loss": 1.1627, + "step": 874 + }, + { + "epoch": 0.6644518272425249, + "grad_norm": 0.33984375, + "learning_rate": 5.818095643703779e-06, + "loss": 1.1732, + "step": 875 + }, + { + "epoch": 0.6652112007593736, + "grad_norm": 0.294921875, + "learning_rate": 5.794641738572925e-06, + "loss": 1.1294, + "step": 876 + }, + { + "epoch": 0.6659705742762221, + "grad_norm": 0.271484375, + "learning_rate": 5.771215902985848e-06, + "loss": 1.1594, + "step": 877 + }, + { + "epoch": 0.6667299477930707, + "grad_norm": 0.279296875, + "learning_rate": 5.747818293303185e-06, + "loss": 1.1273, + "step": 878 + }, + { + "epoch": 0.6674893213099193, + "grad_norm": 0.3359375, + "learning_rate": 5.724449065697182e-06, + "loss": 1.1463, + "step": 879 + }, + { + "epoch": 0.668248694826768, + "grad_norm": 0.333984375, + "learning_rate": 5.701108376150635e-06, + "loss": 1.1557, + "step": 880 + }, + { + "epoch": 0.6690080683436165, + "grad_norm": 0.35546875, + "learning_rate": 5.677796380455862e-06, + "loss": 1.1537, + "step": 881 + }, + { + "epoch": 0.6697674418604651, + "grad_norm": 0.30859375, + "learning_rate": 
5.654513234213663e-06, + "loss": 1.1203, + "step": 882 + }, + { + "epoch": 0.6705268153773137, + "grad_norm": 0.33203125, + "learning_rate": 5.631259092832265e-06, + "loss": 1.1744, + "step": 883 + }, + { + "epoch": 0.6712861888941624, + "grad_norm": 0.35546875, + "learning_rate": 5.608034111526298e-06, + "loss": 1.1531, + "step": 884 + }, + { + "epoch": 0.6720455624110109, + "grad_norm": 0.37109375, + "learning_rate": 5.584838445315764e-06, + "loss": 1.1989, + "step": 885 + }, + { + "epoch": 0.6728049359278595, + "grad_norm": 0.39453125, + "learning_rate": 5.561672249024988e-06, + "loss": 1.2282, + "step": 886 + }, + { + "epoch": 0.6735643094447081, + "grad_norm": 0.36328125, + "learning_rate": 5.538535677281608e-06, + "loss": 1.186, + "step": 887 + }, + { + "epoch": 0.6743236829615568, + "grad_norm": 0.357421875, + "learning_rate": 5.515428884515495e-06, + "loss": 1.1552, + "step": 888 + }, + { + "epoch": 0.6750830564784053, + "grad_norm": 0.349609375, + "learning_rate": 5.492352024957781e-06, + "loss": 1.1389, + "step": 889 + }, + { + "epoch": 0.6758424299952539, + "grad_norm": 0.33984375, + "learning_rate": 5.4693052526397965e-06, + "loss": 1.133, + "step": 890 + }, + { + "epoch": 0.6766018035121025, + "grad_norm": 0.365234375, + "learning_rate": 5.446288721392048e-06, + "loss": 1.2011, + "step": 891 + }, + { + "epoch": 0.6773611770289512, + "grad_norm": 0.30859375, + "learning_rate": 5.423302584843186e-06, + "loss": 1.1344, + "step": 892 + }, + { + "epoch": 0.6781205505457997, + "grad_norm": 0.328125, + "learning_rate": 5.400346996418988e-06, + "loss": 1.161, + "step": 893 + }, + { + "epoch": 0.6788799240626483, + "grad_norm": 0.2734375, + "learning_rate": 5.377422109341332e-06, + "loss": 1.1067, + "step": 894 + }, + { + "epoch": 0.6796392975794969, + "grad_norm": 0.306640625, + "learning_rate": 5.354528076627185e-06, + "loss": 1.1321, + "step": 895 + }, + { + "epoch": 0.6803986710963456, + "grad_norm": 0.37109375, + "learning_rate": 5.331665051087549e-06, + "loss": 1.1952, + "step": 896 + }, + { + "epoch": 0.6811580446131941, + "grad_norm": 0.267578125, + "learning_rate": 5.308833185326472e-06, + "loss": 1.1063, + "step": 897 + }, + { + "epoch": 0.6819174181300427, + "grad_norm": 0.357421875, + "learning_rate": 5.286032631740023e-06, + "loss": 1.19, + "step": 898 + }, + { + "epoch": 0.6826767916468913, + "grad_norm": 0.365234375, + "learning_rate": 5.263263542515273e-06, + "loss": 1.1727, + "step": 899 + }, + { + "epoch": 0.68343616516374, + "grad_norm": 0.3046875, + "learning_rate": 5.240526069629265e-06, + "loss": 1.172, + "step": 900 + }, + { + "epoch": 0.6841955386805885, + "grad_norm": 0.357421875, + "learning_rate": 5.217820364848027e-06, + "loss": 1.1787, + "step": 901 + }, + { + "epoch": 0.6849549121974371, + "grad_norm": 0.390625, + "learning_rate": 5.19514657972553e-06, + "loss": 1.2442, + "step": 902 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 0.337890625, + "learning_rate": 5.172504865602701e-06, + "loss": 1.1876, + "step": 903 + }, + { + "epoch": 0.6864736592311343, + "grad_norm": 0.37109375, + "learning_rate": 5.149895373606405e-06, + "loss": 1.2092, + "step": 904 + }, + { + "epoch": 0.6872330327479829, + "grad_norm": 0.265625, + "learning_rate": 5.127318254648418e-06, + "loss": 1.1086, + "step": 905 + }, + { + "epoch": 0.6879924062648315, + "grad_norm": 0.328125, + "learning_rate": 5.104773659424453e-06, + "loss": 1.1276, + "step": 906 + }, + { + "epoch": 0.6887517797816801, + "grad_norm": 0.369140625, + "learning_rate": 5.082261738413124e-06, + "loss": 
1.2118, + "step": 907 + }, + { + "epoch": 0.6895111532985287, + "grad_norm": 0.33203125, + "learning_rate": 5.059782641874962e-06, + "loss": 1.1634, + "step": 908 + }, + { + "epoch": 0.6902705268153773, + "grad_norm": 0.33203125, + "learning_rate": 5.037336519851397e-06, + "loss": 1.1525, + "step": 909 + }, + { + "epoch": 0.6910299003322259, + "grad_norm": 0.306640625, + "learning_rate": 5.014923522163773e-06, + "loss": 1.1586, + "step": 910 + }, + { + "epoch": 0.6917892738490745, + "grad_norm": 0.318359375, + "learning_rate": 4.992543798412327e-06, + "loss": 1.185, + "step": 911 + }, + { + "epoch": 0.6925486473659231, + "grad_norm": 0.328125, + "learning_rate": 4.970197497975216e-06, + "loss": 1.1233, + "step": 912 + }, + { + "epoch": 0.6933080208827717, + "grad_norm": 0.337890625, + "learning_rate": 4.947884770007491e-06, + "loss": 1.1646, + "step": 913 + }, + { + "epoch": 0.6940673943996203, + "grad_norm": 0.373046875, + "learning_rate": 4.92560576344013e-06, + "loss": 1.1766, + "step": 914 + }, + { + "epoch": 0.6948267679164689, + "grad_norm": 0.337890625, + "learning_rate": 4.903360626979026e-06, + "loss": 1.1797, + "step": 915 + }, + { + "epoch": 0.6955861414333175, + "grad_norm": 0.291015625, + "learning_rate": 4.881149509103993e-06, + "loss": 1.1327, + "step": 916 + }, + { + "epoch": 0.6963455149501662, + "grad_norm": 0.3125, + "learning_rate": 4.858972558067784e-06, + "loss": 1.1353, + "step": 917 + }, + { + "epoch": 0.6971048884670147, + "grad_norm": 0.33984375, + "learning_rate": 4.836829921895103e-06, + "loss": 1.1603, + "step": 918 + }, + { + "epoch": 0.6978642619838633, + "grad_norm": 0.3359375, + "learning_rate": 4.814721748381608e-06, + "loss": 1.1768, + "step": 919 + }, + { + "epoch": 0.6986236355007119, + "grad_norm": 0.349609375, + "learning_rate": 4.7926481850929376e-06, + "loss": 1.1515, + "step": 920 + }, + { + "epoch": 0.6993830090175606, + "grad_norm": 0.380859375, + "learning_rate": 4.770609379363694e-06, + "loss": 1.2258, + "step": 921 + }, + { + "epoch": 0.7001423825344091, + "grad_norm": 0.3515625, + "learning_rate": 4.748605478296508e-06, + "loss": 1.1553, + "step": 922 + }, + { + "epoch": 0.7009017560512577, + "grad_norm": 0.380859375, + "learning_rate": 4.726636628761018e-06, + "loss": 1.1856, + "step": 923 + }, + { + "epoch": 0.7016611295681063, + "grad_norm": 0.33203125, + "learning_rate": 4.704702977392914e-06, + "loss": 1.172, + "step": 924 + }, + { + "epoch": 0.702420503084955, + "grad_norm": 0.318359375, + "learning_rate": 4.682804670592937e-06, + "loss": 1.145, + "step": 925 + }, + { + "epoch": 0.7031798766018035, + "grad_norm": 0.341796875, + "learning_rate": 4.660941854525917e-06, + "loss": 1.1645, + "step": 926 + }, + { + "epoch": 0.7039392501186521, + "grad_norm": 0.314453125, + "learning_rate": 4.639114675119797e-06, + "loss": 1.1369, + "step": 927 + }, + { + "epoch": 0.7046986236355007, + "grad_norm": 0.291015625, + "learning_rate": 4.617323278064657e-06, + "loss": 1.1206, + "step": 928 + }, + { + "epoch": 0.7054579971523494, + "grad_norm": 0.267578125, + "learning_rate": 4.595567808811735e-06, + "loss": 1.1056, + "step": 929 + }, + { + "epoch": 0.7062173706691979, + "grad_norm": 0.40234375, + "learning_rate": 4.573848412572458e-06, + "loss": 1.1796, + "step": 930 + }, + { + "epoch": 0.7069767441860465, + "grad_norm": 0.341796875, + "learning_rate": 4.552165234317486e-06, + "loss": 1.1623, + "step": 931 + }, + { + "epoch": 0.7077361177028951, + "grad_norm": 0.345703125, + "learning_rate": 4.530518418775734e-06, + "loss": 1.1729, + "step": 932 + 
}, + { + "epoch": 0.7084954912197438, + "grad_norm": 0.33984375, + "learning_rate": 4.508908110433393e-06, + "loss": 1.1316, + "step": 933 + }, + { + "epoch": 0.7092548647365923, + "grad_norm": 0.3515625, + "learning_rate": 4.487334453532998e-06, + "loss": 1.198, + "step": 934 + }, + { + "epoch": 0.7100142382534409, + "grad_norm": 0.369140625, + "learning_rate": 4.465797592072428e-06, + "loss": 1.2132, + "step": 935 + }, + { + "epoch": 0.7107736117702895, + "grad_norm": 0.341796875, + "learning_rate": 4.444297669803981e-06, + "loss": 1.1731, + "step": 936 + }, + { + "epoch": 0.7115329852871382, + "grad_norm": 0.298828125, + "learning_rate": 4.422834830233378e-06, + "loss": 1.119, + "step": 937 + }, + { + "epoch": 0.7122923588039867, + "grad_norm": 0.29296875, + "learning_rate": 4.4014092166188375e-06, + "loss": 1.1435, + "step": 938 + }, + { + "epoch": 0.7130517323208353, + "grad_norm": 0.3671875, + "learning_rate": 4.3800209719701055e-06, + "loss": 1.1884, + "step": 939 + }, + { + "epoch": 0.7138111058376839, + "grad_norm": 0.369140625, + "learning_rate": 4.35867023904749e-06, + "loss": 1.1715, + "step": 940 + }, + { + "epoch": 0.7145704793545326, + "grad_norm": 0.33203125, + "learning_rate": 4.337357160360931e-06, + "loss": 1.1819, + "step": 941 + }, + { + "epoch": 0.7153298528713811, + "grad_norm": 0.326171875, + "learning_rate": 4.3160818781690286e-06, + "loss": 1.165, + "step": 942 + }, + { + "epoch": 0.7160892263882297, + "grad_norm": 0.302734375, + "learning_rate": 4.294844534478107e-06, + "loss": 1.0917, + "step": 943 + }, + { + "epoch": 0.7168485999050783, + "grad_norm": 0.322265625, + "learning_rate": 4.2736452710412645e-06, + "loss": 1.1302, + "step": 944 + }, + { + "epoch": 0.717607973421927, + "grad_norm": 0.365234375, + "learning_rate": 4.25248422935742e-06, + "loss": 1.1528, + "step": 945 + }, + { + "epoch": 0.7183673469387755, + "grad_norm": 0.341796875, + "learning_rate": 4.2313615506703685e-06, + "loss": 1.1557, + "step": 946 + }, + { + "epoch": 0.7191267204556241, + "grad_norm": 0.37109375, + "learning_rate": 4.210277375967855e-06, + "loss": 1.2004, + "step": 947 + }, + { + "epoch": 0.7198860939724727, + "grad_norm": 0.326171875, + "learning_rate": 4.189231845980618e-06, + "loss": 1.1886, + "step": 948 + }, + { + "epoch": 0.7206454674893213, + "grad_norm": 0.33984375, + "learning_rate": 4.168225101181449e-06, + "loss": 1.1163, + "step": 949 + }, + { + "epoch": 0.7214048410061699, + "grad_norm": 0.287109375, + "learning_rate": 4.147257281784257e-06, + "loss": 1.1078, + "step": 950 + }, + { + "epoch": 0.7221642145230185, + "grad_norm": 0.306640625, + "learning_rate": 4.1263285277431465e-06, + "loss": 1.1385, + "step": 951 + }, + { + "epoch": 0.7229235880398671, + "grad_norm": 0.3515625, + "learning_rate": 4.105438978751465e-06, + "loss": 1.1829, + "step": 952 + }, + { + "epoch": 0.7236829615567157, + "grad_norm": 0.31640625, + "learning_rate": 4.084588774240884e-06, + "loss": 1.1458, + "step": 953 + }, + { + "epoch": 0.7244423350735643, + "grad_norm": 0.31640625, + "learning_rate": 4.063778053380446e-06, + "loss": 1.1388, + "step": 954 + }, + { + "epoch": 0.7252017085904129, + "grad_norm": 0.3125, + "learning_rate": 4.043006955075667e-06, + "loss": 1.1234, + "step": 955 + }, + { + "epoch": 0.7259610821072615, + "grad_norm": 0.3359375, + "learning_rate": 4.0222756179675915e-06, + "loss": 1.171, + "step": 956 + }, + { + "epoch": 0.7267204556241101, + "grad_norm": 0.30078125, + "learning_rate": 4.001584180431869e-06, + "loss": 1.1435, + "step": 957 + }, + { + "epoch": 
0.7274798291409587, + "grad_norm": 0.2578125, + "learning_rate": 3.980932780577827e-06, + "loss": 1.1021, + "step": 958 + }, + { + "epoch": 0.7282392026578073, + "grad_norm": 0.357421875, + "learning_rate": 3.960321556247552e-06, + "loss": 1.1885, + "step": 959 + }, + { + "epoch": 0.7289985761746559, + "grad_norm": 0.29296875, + "learning_rate": 3.939750645014977e-06, + "loss": 1.1244, + "step": 960 + }, + { + "epoch": 0.7297579496915045, + "grad_norm": 0.3125, + "learning_rate": 3.919220184184959e-06, + "loss": 1.1245, + "step": 961 + }, + { + "epoch": 0.730517323208353, + "grad_norm": 0.314453125, + "learning_rate": 3.898730310792346e-06, + "loss": 1.1353, + "step": 962 + }, + { + "epoch": 0.7312766967252017, + "grad_norm": 0.29296875, + "learning_rate": 3.878281161601094e-06, + "loss": 1.1653, + "step": 963 + }, + { + "epoch": 0.7320360702420503, + "grad_norm": 0.294921875, + "learning_rate": 3.857872873103322e-06, + "loss": 1.1238, + "step": 964 + }, + { + "epoch": 0.7327954437588989, + "grad_norm": 0.380859375, + "learning_rate": 3.837505581518429e-06, + "loss": 1.1952, + "step": 965 + }, + { + "epoch": 0.7335548172757476, + "grad_norm": 0.380859375, + "learning_rate": 3.8171794227921585e-06, + "loss": 1.2425, + "step": 966 + }, + { + "epoch": 0.7343141907925961, + "grad_norm": 0.33203125, + "learning_rate": 3.7968945325957175e-06, + "loss": 1.099, + "step": 967 + }, + { + "epoch": 0.7350735643094447, + "grad_norm": 0.35546875, + "learning_rate": 3.776651046324843e-06, + "loss": 1.151, + "step": 968 + }, + { + "epoch": 0.7358329378262933, + "grad_norm": 0.287109375, + "learning_rate": 3.7564490990989276e-06, + "loss": 1.1206, + "step": 969 + }, + { + "epoch": 0.736592311343142, + "grad_norm": 0.302734375, + "learning_rate": 3.7362888257600894e-06, + "loss": 1.1203, + "step": 970 + }, + { + "epoch": 0.7373516848599905, + "grad_norm": 0.3671875, + "learning_rate": 3.716170360872294e-06, + "loss": 1.19, + "step": 971 + }, + { + "epoch": 0.7381110583768391, + "grad_norm": 0.365234375, + "learning_rate": 3.69609383872045e-06, + "loss": 1.1872, + "step": 972 + }, + { + "epoch": 0.7388704318936877, + "grad_norm": 0.32421875, + "learning_rate": 3.676059393309499e-06, + "loss": 1.1264, + "step": 973 + }, + { + "epoch": 0.7396298054105364, + "grad_norm": 0.392578125, + "learning_rate": 3.6560671583635467e-06, + "loss": 1.1832, + "step": 974 + }, + { + "epoch": 0.7403891789273849, + "grad_norm": 0.30859375, + "learning_rate": 3.636117267324941e-06, + "loss": 1.1855, + "step": 975 + }, + { + "epoch": 0.7411485524442335, + "grad_norm": 0.373046875, + "learning_rate": 3.6162098533534095e-06, + "loss": 1.2236, + "step": 976 + }, + { + "epoch": 0.7419079259610821, + "grad_norm": 0.30078125, + "learning_rate": 3.5963450493251552e-06, + "loss": 1.1248, + "step": 977 + }, + { + "epoch": 0.7426672994779308, + "grad_norm": 0.283203125, + "learning_rate": 3.576522987831965e-06, + "loss": 1.0895, + "step": 978 + }, + { + "epoch": 0.7434266729947793, + "grad_norm": 0.322265625, + "learning_rate": 3.5567438011803356e-06, + "loss": 1.1789, + "step": 979 + }, + { + "epoch": 0.7441860465116279, + "grad_norm": 0.283203125, + "learning_rate": 3.5370076213905904e-06, + "loss": 1.1332, + "step": 980 + }, + { + "epoch": 0.7449454200284765, + "grad_norm": 0.33203125, + "learning_rate": 3.5173145801959942e-06, + "loss": 1.1575, + "step": 981 + }, + { + "epoch": 0.7457047935453252, + "grad_norm": 0.29296875, + "learning_rate": 3.4976648090418685e-06, + "loss": 1.1542, + "step": 982 + }, + { + "epoch": 
0.7464641670621737, + "grad_norm": 0.376953125, + "learning_rate": 3.4780584390847193e-06, + "loss": 1.2163, + "step": 983 + }, + { + "epoch": 0.7472235405790223, + "grad_norm": 0.341796875, + "learning_rate": 3.4584956011913693e-06, + "loss": 1.1658, + "step": 984 + }, + { + "epoch": 0.7479829140958709, + "grad_norm": 0.3125, + "learning_rate": 3.4389764259380754e-06, + "loss": 1.1344, + "step": 985 + }, + { + "epoch": 0.7487422876127195, + "grad_norm": 0.3203125, + "learning_rate": 3.4195010436096622e-06, + "loss": 1.1608, + "step": 986 + }, + { + "epoch": 0.7495016611295681, + "grad_norm": 0.38671875, + "learning_rate": 3.400069584198633e-06, + "loss": 1.2214, + "step": 987 + }, + { + "epoch": 0.7502610346464167, + "grad_norm": 0.353515625, + "learning_rate": 3.380682177404335e-06, + "loss": 1.1724, + "step": 988 + }, + { + "epoch": 0.7510204081632653, + "grad_norm": 0.333984375, + "learning_rate": 3.361338952632074e-06, + "loss": 1.1665, + "step": 989 + }, + { + "epoch": 0.751779781680114, + "grad_norm": 0.375, + "learning_rate": 3.3420400389922535e-06, + "loss": 1.2119, + "step": 990 + }, + { + "epoch": 0.7525391551969625, + "grad_norm": 0.296875, + "learning_rate": 3.32278556529951e-06, + "loss": 1.1508, + "step": 991 + }, + { + "epoch": 0.7532985287138111, + "grad_norm": 0.328125, + "learning_rate": 3.3035756600718515e-06, + "loss": 1.1584, + "step": 992 + }, + { + "epoch": 0.7540579022306597, + "grad_norm": 0.322265625, + "learning_rate": 3.284410451529816e-06, + "loss": 1.1329, + "step": 993 + }, + { + "epoch": 0.7548172757475083, + "grad_norm": 0.3515625, + "learning_rate": 3.2652900675956e-06, + "loss": 1.1675, + "step": 994 + }, + { + "epoch": 0.7555766492643569, + "grad_norm": 0.32421875, + "learning_rate": 3.2462146358922033e-06, + "loss": 1.1203, + "step": 995 + }, + { + "epoch": 0.7563360227812055, + "grad_norm": 0.2890625, + "learning_rate": 3.2271842837425917e-06, + "loss": 1.1085, + "step": 996 + }, + { + "epoch": 0.7570953962980541, + "grad_norm": 0.29296875, + "learning_rate": 3.208199138168826e-06, + "loss": 1.1281, + "step": 997 + }, + { + "epoch": 0.7578547698149027, + "grad_norm": 0.375, + "learning_rate": 3.1892593258912407e-06, + "loss": 1.1927, + "step": 998 + }, + { + "epoch": 0.7586141433317513, + "grad_norm": 0.34375, + "learning_rate": 3.1703649733275697e-06, + "loss": 1.1877, + "step": 999 + }, + { + "epoch": 0.7593735168485999, + "grad_norm": 0.326171875, + "learning_rate": 3.151516206592128e-06, + "loss": 1.1486, + "step": 1000 + }, + { + "epoch": 0.7601328903654485, + "grad_norm": 0.373046875, + "learning_rate": 3.132713151494955e-06, + "loss": 1.1856, + "step": 1001 + }, + { + "epoch": 0.7608922638822971, + "grad_norm": 0.30859375, + "learning_rate": 3.113955933540973e-06, + "loss": 1.1627, + "step": 1002 + }, + { + "epoch": 0.7616516373991457, + "grad_norm": 0.33203125, + "learning_rate": 3.0952446779291577e-06, + "loss": 1.1441, + "step": 1003 + }, + { + "epoch": 0.7624110109159943, + "grad_norm": 0.33203125, + "learning_rate": 3.0765795095517026e-06, + "loss": 1.1066, + "step": 1004 + }, + { + "epoch": 0.7631703844328429, + "grad_norm": 0.341796875, + "learning_rate": 3.0579605529931832e-06, + "loss": 1.1927, + "step": 1005 + }, + { + "epoch": 0.7639297579496915, + "grad_norm": 0.34375, + "learning_rate": 3.0393879325297136e-06, + "loss": 1.1468, + "step": 1006 + }, + { + "epoch": 0.76468913146654, + "grad_norm": 0.333984375, + "learning_rate": 3.020861772128145e-06, + "loss": 1.1106, + "step": 1007 + }, + { + "epoch": 0.7654485049833887, + 
"grad_norm": 0.326171875, + "learning_rate": 3.0023821954452036e-06, + "loss": 1.1217, + "step": 1008 + }, + { + "epoch": 0.7662078785002373, + "grad_norm": 0.318359375, + "learning_rate": 2.983949325826696e-06, + "loss": 1.156, + "step": 1009 + }, + { + "epoch": 0.7669672520170859, + "grad_norm": 0.33984375, + "learning_rate": 2.9655632863066696e-06, + "loss": 1.1315, + "step": 1010 + }, + { + "epoch": 0.7677266255339346, + "grad_norm": 0.328125, + "learning_rate": 2.9472241996065897e-06, + "loss": 1.1651, + "step": 1011 + }, + { + "epoch": 0.7684859990507831, + "grad_norm": 0.291015625, + "learning_rate": 2.9289321881345257e-06, + "loss": 1.1209, + "step": 1012 + }, + { + "epoch": 0.7692453725676317, + "grad_norm": 0.2890625, + "learning_rate": 2.910687373984339e-06, + "loss": 1.1137, + "step": 1013 + }, + { + "epoch": 0.7700047460844803, + "grad_norm": 0.326171875, + "learning_rate": 2.8924898789348645e-06, + "loss": 1.1695, + "step": 1014 + }, + { + "epoch": 0.770764119601329, + "grad_norm": 0.33984375, + "learning_rate": 2.874339824449085e-06, + "loss": 1.1603, + "step": 1015 + }, + { + "epoch": 0.7715234931181775, + "grad_norm": 0.296875, + "learning_rate": 2.856237331673336e-06, + "loss": 1.1263, + "step": 1016 + }, + { + "epoch": 0.7722828666350261, + "grad_norm": 0.30859375, + "learning_rate": 2.838182521436498e-06, + "loss": 1.1512, + "step": 1017 + }, + { + "epoch": 0.7730422401518747, + "grad_norm": 0.40234375, + "learning_rate": 2.8201755142491814e-06, + "loss": 1.2103, + "step": 1018 + }, + { + "epoch": 0.7738016136687234, + "grad_norm": 0.330078125, + "learning_rate": 2.8022164303029186e-06, + "loss": 1.1234, + "step": 1019 + }, + { + "epoch": 0.7745609871855719, + "grad_norm": 0.296875, + "learning_rate": 2.7843053894693805e-06, + "loss": 1.1291, + "step": 1020 + }, + { + "epoch": 0.7753203607024205, + "grad_norm": 0.3046875, + "learning_rate": 2.76644251129955e-06, + "loss": 1.1616, + "step": 1021 + }, + { + "epoch": 0.7760797342192691, + "grad_norm": 0.31640625, + "learning_rate": 2.74862791502295e-06, + "loss": 1.1467, + "step": 1022 + }, + { + "epoch": 0.7768391077361178, + "grad_norm": 0.314453125, + "learning_rate": 2.7308617195468336e-06, + "loss": 1.1435, + "step": 1023 + }, + { + "epoch": 0.7775984812529663, + "grad_norm": 0.353515625, + "learning_rate": 2.713144043455388e-06, + "loss": 1.1323, + "step": 1024 + }, + { + "epoch": 0.7783578547698149, + "grad_norm": 0.322265625, + "learning_rate": 2.695475005008946e-06, + "loss": 1.1765, + "step": 1025 + }, + { + "epoch": 0.7791172282866635, + "grad_norm": 0.30859375, + "learning_rate": 2.6778547221432063e-06, + "loss": 1.1441, + "step": 1026 + }, + { + "epoch": 0.7798766018035121, + "grad_norm": 0.31640625, + "learning_rate": 2.660283312468438e-06, + "loss": 1.1428, + "step": 1027 + }, + { + "epoch": 0.7806359753203607, + "grad_norm": 0.298828125, + "learning_rate": 2.642760893268684e-06, + "loss": 1.1243, + "step": 1028 + }, + { + "epoch": 0.7813953488372093, + "grad_norm": 0.349609375, + "learning_rate": 2.625287581501006e-06, + "loss": 1.1824, + "step": 1029 + }, + { + "epoch": 0.7821547223540579, + "grad_norm": 0.359375, + "learning_rate": 2.6078634937946724e-06, + "loss": 1.1663, + "step": 1030 + }, + { + "epoch": 0.7829140958709065, + "grad_norm": 0.365234375, + "learning_rate": 2.5904887464504115e-06, + "loss": 1.1911, + "step": 1031 + }, + { + "epoch": 0.7836734693877551, + "grad_norm": 0.359375, + "learning_rate": 2.573163455439601e-06, + "loss": 1.1811, + "step": 1032 + }, + { + "epoch": 0.7844328429046037, 
+ "grad_norm": 0.37109375, + "learning_rate": 2.5558877364035286e-06, + "loss": 1.2266, + "step": 1033 + }, + { + "epoch": 0.7851922164214523, + "grad_norm": 0.333984375, + "learning_rate": 2.538661704652595e-06, + "loss": 1.1456, + "step": 1034 + }, + { + "epoch": 0.7859515899383009, + "grad_norm": 0.33203125, + "learning_rate": 2.521485475165555e-06, + "loss": 1.177, + "step": 1035 + }, + { + "epoch": 0.7867109634551495, + "grad_norm": 0.341796875, + "learning_rate": 2.504359162588741e-06, + "loss": 1.18, + "step": 1036 + }, + { + "epoch": 0.7874703369719981, + "grad_norm": 0.345703125, + "learning_rate": 2.4872828812353146e-06, + "loss": 1.1414, + "step": 1037 + }, + { + "epoch": 0.7882297104888467, + "grad_norm": 0.384765625, + "learning_rate": 2.470256745084488e-06, + "loss": 1.1995, + "step": 1038 + }, + { + "epoch": 0.7889890840056953, + "grad_norm": 0.349609375, + "learning_rate": 2.4532808677807772e-06, + "loss": 1.1283, + "step": 1039 + }, + { + "epoch": 0.7897484575225439, + "grad_norm": 0.345703125, + "learning_rate": 2.4363553626332157e-06, + "loss": 1.1844, + "step": 1040 + }, + { + "epoch": 0.7905078310393925, + "grad_norm": 0.369140625, + "learning_rate": 2.419480342614635e-06, + "loss": 1.1947, + "step": 1041 + }, + { + "epoch": 0.7912672045562411, + "grad_norm": 0.35546875, + "learning_rate": 2.402655920360889e-06, + "loss": 1.1751, + "step": 1042 + }, + { + "epoch": 0.7920265780730897, + "grad_norm": 0.365234375, + "learning_rate": 2.385882208170106e-06, + "loss": 1.1976, + "step": 1043 + }, + { + "epoch": 0.7927859515899383, + "grad_norm": 0.36328125, + "learning_rate": 2.369159318001937e-06, + "loss": 1.1705, + "step": 1044 + }, + { + "epoch": 0.7935453251067869, + "grad_norm": 0.30078125, + "learning_rate": 2.3524873614768085e-06, + "loss": 1.1149, + "step": 1045 + }, + { + "epoch": 0.7943046986236355, + "grad_norm": 0.3203125, + "learning_rate": 2.335866449875185e-06, + "loss": 1.1556, + "step": 1046 + }, + { + "epoch": 0.7950640721404841, + "grad_norm": 0.322265625, + "learning_rate": 2.3192966941368247e-06, + "loss": 1.1266, + "step": 1047 + }, + { + "epoch": 0.7958234456573327, + "grad_norm": 0.28515625, + "learning_rate": 2.3027782048600247e-06, + "loss": 1.0954, + "step": 1048 + }, + { + "epoch": 0.7965828191741813, + "grad_norm": 0.310546875, + "learning_rate": 2.2863110923008958e-06, + "loss": 1.1715, + "step": 1049 + }, + { + "epoch": 0.7973421926910299, + "grad_norm": 0.40234375, + "learning_rate": 2.26989546637263e-06, + "loss": 1.2394, + "step": 1050 + }, + { + "epoch": 0.7981015662078785, + "grad_norm": 0.37109375, + "learning_rate": 2.2535314366447625e-06, + "loss": 1.1812, + "step": 1051 + }, + { + "epoch": 0.798860939724727, + "grad_norm": 0.330078125, + "learning_rate": 2.237219112342426e-06, + "loss": 1.146, + "step": 1052 + }, + { + "epoch": 0.7996203132415757, + "grad_norm": 0.3046875, + "learning_rate": 2.2209586023456495e-06, + "loss": 1.1245, + "step": 1053 + }, + { + "epoch": 0.8003796867584243, + "grad_norm": 0.3359375, + "learning_rate": 2.2047500151886047e-06, + "loss": 1.1608, + "step": 1054 + }, + { + "epoch": 0.8011390602752729, + "grad_norm": 0.341796875, + "learning_rate": 2.1885934590589008e-06, + "loss": 1.1919, + "step": 1055 + }, + { + "epoch": 0.8018984337921214, + "grad_norm": 0.314453125, + "learning_rate": 2.172489041796856e-06, + "loss": 1.1411, + "step": 1056 + }, + { + "epoch": 0.8026578073089701, + "grad_norm": 0.3203125, + "learning_rate": 2.156436870894767e-06, + "loss": 1.1685, + "step": 1057 + }, + { + "epoch": 
0.8034171808258187, + "grad_norm": 0.341796875, + "learning_rate": 2.140437053496214e-06, + "loss": 1.1709, + "step": 1058 + }, + { + "epoch": 0.8041765543426673, + "grad_norm": 0.353515625, + "learning_rate": 2.124489696395321e-06, + "loss": 1.1552, + "step": 1059 + }, + { + "epoch": 0.804935927859516, + "grad_norm": 0.328125, + "learning_rate": 2.1085949060360654e-06, + "loss": 1.1587, + "step": 1060 + }, + { + "epoch": 0.8056953013763645, + "grad_norm": 0.30859375, + "learning_rate": 2.092752788511546e-06, + "loss": 1.1752, + "step": 1061 + }, + { + "epoch": 0.8064546748932131, + "grad_norm": 0.3125, + "learning_rate": 2.0769634495632986e-06, + "loss": 1.1594, + "step": 1062 + }, + { + "epoch": 0.8072140484100617, + "grad_norm": 0.28515625, + "learning_rate": 2.061226994580563e-06, + "loss": 1.1164, + "step": 1063 + }, + { + "epoch": 0.8079734219269104, + "grad_norm": 0.294921875, + "learning_rate": 2.045543528599607e-06, + "loss": 1.0982, + "step": 1064 + }, + { + "epoch": 0.8087327954437589, + "grad_norm": 0.33984375, + "learning_rate": 2.0299131563030016e-06, + "loss": 1.1587, + "step": 1065 + }, + { + "epoch": 0.8094921689606075, + "grad_norm": 0.388671875, + "learning_rate": 2.0143359820189403e-06, + "loss": 1.1613, + "step": 1066 + }, + { + "epoch": 0.8102515424774561, + "grad_norm": 0.30078125, + "learning_rate": 1.998812109720535e-06, + "loss": 1.1486, + "step": 1067 + }, + { + "epoch": 0.8110109159943047, + "grad_norm": 0.349609375, + "learning_rate": 1.983341643025117e-06, + "loss": 1.1652, + "step": 1068 + }, + { + "epoch": 0.8117702895111533, + "grad_norm": 0.31640625, + "learning_rate": 1.967924685193552e-06, + "loss": 1.1593, + "step": 1069 + }, + { + "epoch": 0.8125296630280019, + "grad_norm": 0.34375, + "learning_rate": 1.952561339129554e-06, + "loss": 1.1904, + "step": 1070 + }, + { + "epoch": 0.8132890365448505, + "grad_norm": 0.32421875, + "learning_rate": 1.93725170737899e-06, + "loss": 1.151, + "step": 1071 + }, + { + "epoch": 0.8140484100616991, + "grad_norm": 0.29296875, + "learning_rate": 1.921995892129208e-06, + "loss": 1.1097, + "step": 1072 + }, + { + "epoch": 0.8148077835785477, + "grad_norm": 0.375, + "learning_rate": 1.906793995208328e-06, + "loss": 1.1875, + "step": 1073 + }, + { + "epoch": 0.8155671570953963, + "grad_norm": 0.400390625, + "learning_rate": 1.8916461180845968e-06, + "loss": 1.2437, + "step": 1074 + }, + { + "epoch": 0.8163265306122449, + "grad_norm": 0.375, + "learning_rate": 1.8765523618656923e-06, + "loss": 1.1949, + "step": 1075 + }, + { + "epoch": 0.8170859041290935, + "grad_norm": 0.33203125, + "learning_rate": 1.861512827298051e-06, + "loss": 1.1321, + "step": 1076 + }, + { + "epoch": 0.8178452776459421, + "grad_norm": 0.328125, + "learning_rate": 1.8465276147661905e-06, + "loss": 1.1811, + "step": 1077 + }, + { + "epoch": 0.8186046511627907, + "grad_norm": 0.35546875, + "learning_rate": 1.8315968242920446e-06, + "loss": 1.2074, + "step": 1078 + }, + { + "epoch": 0.8193640246796393, + "grad_norm": 0.345703125, + "learning_rate": 1.8167205555343027e-06, + "loss": 1.1378, + "step": 1079 + }, + { + "epoch": 0.8201233981964879, + "grad_norm": 0.314453125, + "learning_rate": 1.8018989077877368e-06, + "loss": 1.1401, + "step": 1080 + }, + { + "epoch": 0.8208827717133365, + "grad_norm": 0.3203125, + "learning_rate": 1.7871319799825316e-06, + "loss": 1.1455, + "step": 1081 + }, + { + "epoch": 0.8216421452301851, + "grad_norm": 0.365234375, + "learning_rate": 1.7724198706836372e-06, + "loss": 1.1678, + "step": 1082 + }, + { + "epoch": 
0.8224015187470337, + "grad_norm": 0.447265625, + "learning_rate": 1.757762678090107e-06, + "loss": 1.1541, + "step": 1083 + }, + { + "epoch": 0.8231608922638823, + "grad_norm": 0.365234375, + "learning_rate": 1.743160500034443e-06, + "loss": 1.1924, + "step": 1084 + }, + { + "epoch": 0.8239202657807309, + "grad_norm": 0.30859375, + "learning_rate": 1.7286134339819337e-06, + "loss": 1.1414, + "step": 1085 + }, + { + "epoch": 0.8246796392975795, + "grad_norm": 0.322265625, + "learning_rate": 1.7141215770300202e-06, + "loss": 1.1341, + "step": 1086 + }, + { + "epoch": 0.8254390128144281, + "grad_norm": 0.359375, + "learning_rate": 1.6996850259076303e-06, + "loss": 1.1874, + "step": 1087 + }, + { + "epoch": 0.8261983863312767, + "grad_norm": 0.3359375, + "learning_rate": 1.6853038769745466e-06, + "loss": 1.1982, + "step": 1088 + }, + { + "epoch": 0.8269577598481253, + "grad_norm": 0.369140625, + "learning_rate": 1.670978226220762e-06, + "loss": 1.2065, + "step": 1089 + }, + { + "epoch": 0.8277171333649739, + "grad_norm": 0.322265625, + "learning_rate": 1.6567081692658238e-06, + "loss": 1.148, + "step": 1090 + }, + { + "epoch": 0.8284765068818225, + "grad_norm": 0.3046875, + "learning_rate": 1.642493801358218e-06, + "loss": 1.1179, + "step": 1091 + }, + { + "epoch": 0.8292358803986711, + "grad_norm": 0.3359375, + "learning_rate": 1.6283352173747148e-06, + "loss": 1.1411, + "step": 1092 + }, + { + "epoch": 0.8299952539155196, + "grad_norm": 0.369140625, + "learning_rate": 1.6142325118197488e-06, + "loss": 1.1431, + "step": 1093 + }, + { + "epoch": 0.8307546274323683, + "grad_norm": 0.32421875, + "learning_rate": 1.6001857788247755e-06, + "loss": 1.1494, + "step": 1094 + }, + { + "epoch": 0.8315140009492169, + "grad_norm": 0.365234375, + "learning_rate": 1.5861951121476571e-06, + "loss": 1.1864, + "step": 1095 + }, + { + "epoch": 0.8322733744660655, + "grad_norm": 0.26171875, + "learning_rate": 1.5722606051720268e-06, + "loss": 1.1363, + "step": 1096 + }, + { + "epoch": 0.833032747982914, + "grad_norm": 0.322265625, + "learning_rate": 1.5583823509066665e-06, + "loss": 1.1366, + "step": 1097 + }, + { + "epoch": 0.8337921214997627, + "grad_norm": 0.275390625, + "learning_rate": 1.5445604419848858e-06, + "loss": 1.1422, + "step": 1098 + }, + { + "epoch": 0.8345514950166113, + "grad_norm": 0.380859375, + "learning_rate": 1.5307949706639114e-06, + "loss": 1.1861, + "step": 1099 + }, + { + "epoch": 0.8353108685334599, + "grad_norm": 0.408203125, + "learning_rate": 1.5170860288242638e-06, + "loss": 1.1732, + "step": 1100 + }, + { + "epoch": 0.8360702420503084, + "grad_norm": 0.349609375, + "learning_rate": 1.503433707969142e-06, + "loss": 1.1638, + "step": 1101 + }, + { + "epoch": 0.8368296155671571, + "grad_norm": 0.310546875, + "learning_rate": 1.489838099223816e-06, + "loss": 1.1235, + "step": 1102 + }, + { + "epoch": 0.8375889890840057, + "grad_norm": 0.318359375, + "learning_rate": 1.476299293335024e-06, + "loss": 1.1356, + "step": 1103 + }, + { + "epoch": 0.8383483626008543, + "grad_norm": 0.27734375, + "learning_rate": 1.4628173806703594e-06, + "loss": 1.1142, + "step": 1104 + }, + { + "epoch": 0.8391077361177028, + "grad_norm": 0.30859375, + "learning_rate": 1.4493924512176748e-06, + "loss": 1.1373, + "step": 1105 + }, + { + "epoch": 0.8398671096345515, + "grad_norm": 0.40625, + "learning_rate": 1.436024594584461e-06, + "loss": 1.2117, + "step": 1106 + }, + { + "epoch": 0.8406264831514001, + "grad_norm": 0.248046875, + "learning_rate": 1.4227138999972801e-06, + "loss": 1.077, + "step": 1107 + 
}, + { + "epoch": 0.8413858566682487, + "grad_norm": 0.353515625, + "learning_rate": 1.409460456301147e-06, + "loss": 1.1294, + "step": 1108 + }, + { + "epoch": 0.8421452301850973, + "grad_norm": 0.31640625, + "learning_rate": 1.3962643519589502e-06, + "loss": 1.1354, + "step": 1109 + }, + { + "epoch": 0.8429046037019459, + "grad_norm": 0.412109375, + "learning_rate": 1.3831256750508449e-06, + "loss": 1.1973, + "step": 1110 + }, + { + "epoch": 0.8436639772187945, + "grad_norm": 0.30078125, + "learning_rate": 1.3700445132736795e-06, + "loss": 1.1396, + "step": 1111 + }, + { + "epoch": 0.8444233507356431, + "grad_norm": 0.302734375, + "learning_rate": 1.3570209539404067e-06, + "loss": 1.1354, + "step": 1112 + }, + { + "epoch": 0.8451827242524917, + "grad_norm": 0.322265625, + "learning_rate": 1.3440550839795008e-06, + "loss": 1.1847, + "step": 1113 + }, + { + "epoch": 0.8459420977693403, + "grad_norm": 0.306640625, + "learning_rate": 1.3311469899343698e-06, + "loss": 1.1425, + "step": 1114 + }, + { + "epoch": 0.8467014712861889, + "grad_norm": 0.298828125, + "learning_rate": 1.3182967579627948e-06, + "loss": 1.1266, + "step": 1115 + }, + { + "epoch": 0.8474608448030375, + "grad_norm": 0.318359375, + "learning_rate": 1.305504473836331e-06, + "loss": 1.1409, + "step": 1116 + }, + { + "epoch": 0.8482202183198861, + "grad_norm": 0.341796875, + "learning_rate": 1.2927702229397633e-06, + "loss": 1.1686, + "step": 1117 + }, + { + "epoch": 0.8489795918367347, + "grad_norm": 0.36328125, + "learning_rate": 1.2800940902705072e-06, + "loss": 1.1655, + "step": 1118 + }, + { + "epoch": 0.8497389653535833, + "grad_norm": 0.322265625, + "learning_rate": 1.2674761604380692e-06, + "loss": 1.1476, + "step": 1119 + }, + { + "epoch": 0.8504983388704319, + "grad_norm": 0.388671875, + "learning_rate": 1.2549165176634582e-06, + "loss": 1.2241, + "step": 1120 + }, + { + "epoch": 0.8512577123872805, + "grad_norm": 0.3203125, + "learning_rate": 1.2424152457786408e-06, + "loss": 1.1283, + "step": 1121 + }, + { + "epoch": 0.8520170859041291, + "grad_norm": 0.330078125, + "learning_rate": 1.2299724282259685e-06, + "loss": 1.1519, + "step": 1122 + }, + { + "epoch": 0.8527764594209777, + "grad_norm": 0.31640625, + "learning_rate": 1.2175881480576347e-06, + "loss": 1.1268, + "step": 1123 + }, + { + "epoch": 0.8535358329378263, + "grad_norm": 0.30859375, + "learning_rate": 1.2052624879351105e-06, + "loss": 1.0941, + "step": 1124 + }, + { + "epoch": 0.8542952064546749, + "grad_norm": 0.3359375, + "learning_rate": 1.1929955301285889e-06, + "loss": 1.1533, + "step": 1125 + }, + { + "epoch": 0.8550545799715235, + "grad_norm": 0.365234375, + "learning_rate": 1.1807873565164507e-06, + "loss": 1.1927, + "step": 1126 + }, + { + "epoch": 0.8558139534883721, + "grad_norm": 0.361328125, + "learning_rate": 1.1686380485847027e-06, + "loss": 1.1902, + "step": 1127 + }, + { + "epoch": 0.8565733270052207, + "grad_norm": 0.287109375, + "learning_rate": 1.1565476874264448e-06, + "loss": 1.1152, + "step": 1128 + }, + { + "epoch": 0.8573327005220693, + "grad_norm": 0.330078125, + "learning_rate": 1.144516353741324e-06, + "loss": 1.1328, + "step": 1129 + }, + { + "epoch": 0.8580920740389179, + "grad_norm": 0.333984375, + "learning_rate": 1.1325441278349935e-06, + "loss": 1.1626, + "step": 1130 + }, + { + "epoch": 0.8588514475557665, + "grad_norm": 0.3671875, + "learning_rate": 1.120631089618579e-06, + "loss": 1.1927, + "step": 1131 + }, + { + "epoch": 0.8596108210726151, + "grad_norm": 0.365234375, + "learning_rate": 1.1087773186081474e-06, + 
"loss": 1.2139, + "step": 1132 + }, + { + "epoch": 0.8603701945894637, + "grad_norm": 0.33984375, + "learning_rate": 1.0969828939241779e-06, + "loss": 1.1491, + "step": 1133 + }, + { + "epoch": 0.8611295681063122, + "grad_norm": 0.341796875, + "learning_rate": 1.0852478942910228e-06, + "loss": 1.156, + "step": 1134 + }, + { + "epoch": 0.8618889416231609, + "grad_norm": 0.33984375, + "learning_rate": 1.0735723980363921e-06, + "loss": 1.1736, + "step": 1135 + }, + { + "epoch": 0.8626483151400095, + "grad_norm": 0.365234375, + "learning_rate": 1.0619564830908303e-06, + "loss": 1.1818, + "step": 1136 + }, + { + "epoch": 0.8634076886568581, + "grad_norm": 0.3515625, + "learning_rate": 1.0504002269871927e-06, + "loss": 1.1886, + "step": 1137 + }, + { + "epoch": 0.8641670621737066, + "grad_norm": 0.357421875, + "learning_rate": 1.0389037068601325e-06, + "loss": 1.2172, + "step": 1138 + }, + { + "epoch": 0.8649264356905553, + "grad_norm": 0.302734375, + "learning_rate": 1.027466999445572e-06, + "loss": 1.1286, + "step": 1139 + }, + { + "epoch": 0.8656858092074039, + "grad_norm": 0.32421875, + "learning_rate": 1.0160901810802114e-06, + "loss": 1.1688, + "step": 1140 + }, + { + "epoch": 0.8664451827242525, + "grad_norm": 0.36328125, + "learning_rate": 1.0047733277010064e-06, + "loss": 1.2127, + "step": 1141 + }, + { + "epoch": 0.867204556241101, + "grad_norm": 0.35546875, + "learning_rate": 9.935165148446658e-07, + "loss": 1.1628, + "step": 1142 + }, + { + "epoch": 0.8679639297579497, + "grad_norm": 0.30859375, + "learning_rate": 9.823198176471381e-07, + "loss": 1.1454, + "step": 1143 + }, + { + "epoch": 0.8687233032747983, + "grad_norm": 0.306640625, + "learning_rate": 9.711833108431234e-07, + "loss": 1.1546, + "step": 1144 + }, + { + "epoch": 0.8694826767916469, + "grad_norm": 0.34765625, + "learning_rate": 9.601070687655667e-07, + "loss": 1.1958, + "step": 1145 + }, + { + "epoch": 0.8702420503084954, + "grad_norm": 0.30859375, + "learning_rate": 9.490911653451651e-07, + "loss": 1.1511, + "step": 1146 + }, + { + "epoch": 0.8710014238253441, + "grad_norm": 0.3125, + "learning_rate": 9.381356741098702e-07, + "loss": 1.148, + "step": 1147 + }, + { + "epoch": 0.8717607973421927, + "grad_norm": 0.328125, + "learning_rate": 9.272406681844015e-07, + "loss": 1.1383, + "step": 1148 + }, + { + "epoch": 0.8725201708590413, + "grad_norm": 0.345703125, + "learning_rate": 9.164062202897539e-07, + "loss": 1.137, + "step": 1149 + }, + { + "epoch": 0.8732795443758898, + "grad_norm": 0.33984375, + "learning_rate": 9.05632402742721e-07, + "loss": 1.1381, + "step": 1150 + }, + { + "epoch": 0.8740389178927385, + "grad_norm": 0.365234375, + "learning_rate": 8.949192874553991e-07, + "loss": 1.1854, + "step": 1151 + }, + { + "epoch": 0.8747982914095871, + "grad_norm": 0.42578125, + "learning_rate": 8.842669459347186e-07, + "loss": 1.199, + "step": 1152 + }, + { + "epoch": 0.8755576649264357, + "grad_norm": 0.35546875, + "learning_rate": 8.736754492819655e-07, + "loss": 1.1787, + "step": 1153 + }, + { + "epoch": 0.8763170384432842, + "grad_norm": 0.36328125, + "learning_rate": 8.631448681922994e-07, + "loss": 1.1742, + "step": 1154 + }, + { + "epoch": 0.8770764119601329, + "grad_norm": 0.3359375, + "learning_rate": 8.526752729542831e-07, + "loss": 1.1326, + "step": 1155 + }, + { + "epoch": 0.8778357854769815, + "grad_norm": 0.365234375, + "learning_rate": 8.42266733449425e-07, + "loss": 1.1984, + "step": 1156 + }, + { + "epoch": 0.8785951589938301, + "grad_norm": 0.296875, + "learning_rate": 8.319193191517016e-07, + 
"loss": 1.1403, + "step": 1157 + }, + { + "epoch": 0.8793545325106787, + "grad_norm": 0.333984375, + "learning_rate": 8.216330991270916e-07, + "loss": 1.1532, + "step": 1158 + }, + { + "epoch": 0.8801139060275273, + "grad_norm": 0.283203125, + "learning_rate": 8.114081420331266e-07, + "loss": 1.1398, + "step": 1159 + }, + { + "epoch": 0.8808732795443759, + "grad_norm": 0.283203125, + "learning_rate": 8.012445161184179e-07, + "loss": 1.1201, + "step": 1160 + }, + { + "epoch": 0.8816326530612245, + "grad_norm": 0.306640625, + "learning_rate": 7.911422892222165e-07, + "loss": 1.1367, + "step": 1161 + }, + { + "epoch": 0.8823920265780731, + "grad_norm": 0.36328125, + "learning_rate": 7.81101528773951e-07, + "loss": 1.1888, + "step": 1162 + }, + { + "epoch": 0.8831514000949217, + "grad_norm": 0.373046875, + "learning_rate": 7.711223017927783e-07, + "loss": 1.1283, + "step": 1163 + }, + { + "epoch": 0.8839107736117703, + "grad_norm": 0.298828125, + "learning_rate": 7.612046748871327e-07, + "loss": 1.114, + "step": 1164 + }, + { + "epoch": 0.8846701471286189, + "grad_norm": 0.330078125, + "learning_rate": 7.513487142542941e-07, + "loss": 1.1995, + "step": 1165 + }, + { + "epoch": 0.8854295206454675, + "grad_norm": 0.302734375, + "learning_rate": 7.415544856799362e-07, + "loss": 1.1137, + "step": 1166 + }, + { + "epoch": 0.886188894162316, + "grad_norm": 0.33984375, + "learning_rate": 7.318220545376842e-07, + "loss": 1.1919, + "step": 1167 + }, + { + "epoch": 0.8869482676791647, + "grad_norm": 0.298828125, + "learning_rate": 7.221514857886857e-07, + "loss": 1.1217, + "step": 1168 + }, + { + "epoch": 0.8877076411960133, + "grad_norm": 0.3046875, + "learning_rate": 7.125428439811765e-07, + "loss": 1.1266, + "step": 1169 + }, + { + "epoch": 0.8884670147128619, + "grad_norm": 0.318359375, + "learning_rate": 7.029961932500506e-07, + "loss": 1.159, + "step": 1170 + }, + { + "epoch": 0.8892263882297105, + "grad_norm": 0.33984375, + "learning_rate": 6.935115973164208e-07, + "loss": 1.1782, + "step": 1171 + }, + { + "epoch": 0.8899857617465591, + "grad_norm": 0.3046875, + "learning_rate": 6.840891194872112e-07, + "loss": 1.109, + "step": 1172 + }, + { + "epoch": 0.8907451352634077, + "grad_norm": 0.341796875, + "learning_rate": 6.7472882265472e-07, + "loss": 1.2068, + "step": 1173 + }, + { + "epoch": 0.8915045087802563, + "grad_norm": 0.296875, + "learning_rate": 6.65430769296207e-07, + "loss": 1.1619, + "step": 1174 + }, + { + "epoch": 0.8922638822971048, + "grad_norm": 0.306640625, + "learning_rate": 6.56195021473478e-07, + "loss": 1.1534, + "step": 1175 + }, + { + "epoch": 0.8930232558139535, + "grad_norm": 0.384765625, + "learning_rate": 6.470216408324626e-07, + "loss": 1.1999, + "step": 1176 + }, + { + "epoch": 0.8937826293308021, + "grad_norm": 0.3046875, + "learning_rate": 6.379106886028086e-07, + "loss": 1.1417, + "step": 1177 + }, + { + "epoch": 0.8945420028476507, + "grad_norm": 0.328125, + "learning_rate": 6.288622255974741e-07, + "loss": 1.1552, + "step": 1178 + }, + { + "epoch": 0.8953013763644992, + "grad_norm": 0.341796875, + "learning_rate": 6.198763122123208e-07, + "loss": 1.1639, + "step": 1179 + }, + { + "epoch": 0.8960607498813479, + "grad_norm": 0.2890625, + "learning_rate": 6.109530084257043e-07, + "loss": 1.1234, + "step": 1180 + }, + { + "epoch": 0.8968201233981965, + "grad_norm": 0.353515625, + "learning_rate": 6.020923737980877e-07, + "loss": 1.1633, + "step": 1181 + }, + { + "epoch": 0.8975794969150451, + "grad_norm": 0.318359375, + "learning_rate": 5.932944674716279e-07, + 
"loss": 1.1606, + "step": 1182 + }, + { + "epoch": 0.8983388704318936, + "grad_norm": 0.322265625, + "learning_rate": 5.845593481697931e-07, + "loss": 1.1113, + "step": 1183 + }, + { + "epoch": 0.8990982439487423, + "grad_norm": 0.341796875, + "learning_rate": 5.758870741969635e-07, + "loss": 1.1429, + "step": 1184 + }, + { + "epoch": 0.8998576174655909, + "grad_norm": 0.306640625, + "learning_rate": 5.672777034380483e-07, + "loss": 1.1521, + "step": 1185 + }, + { + "epoch": 0.9006169909824395, + "grad_norm": 0.30078125, + "learning_rate": 5.587312933580946e-07, + "loss": 1.1341, + "step": 1186 + }, + { + "epoch": 0.901376364499288, + "grad_norm": 0.318359375, + "learning_rate": 5.502479010019046e-07, + "loss": 1.143, + "step": 1187 + }, + { + "epoch": 0.9021357380161367, + "grad_norm": 0.337890625, + "learning_rate": 5.418275829936537e-07, + "loss": 1.1586, + "step": 1188 + }, + { + "epoch": 0.9028951115329853, + "grad_norm": 0.33203125, + "learning_rate": 5.334703955365183e-07, + "loss": 1.1349, + "step": 1189 + }, + { + "epoch": 0.9036544850498339, + "grad_norm": 0.3671875, + "learning_rate": 5.251763944122956e-07, + "loss": 1.2187, + "step": 1190 + }, + { + "epoch": 0.9044138585666824, + "grad_norm": 0.349609375, + "learning_rate": 5.169456349810342e-07, + "loss": 1.2073, + "step": 1191 + }, + { + "epoch": 0.9051732320835311, + "grad_norm": 0.369140625, + "learning_rate": 5.087781721806539e-07, + "loss": 1.162, + "step": 1192 + }, + { + "epoch": 0.9059326056003797, + "grad_norm": 0.36328125, + "learning_rate": 5.00674060526598e-07, + "loss": 1.1938, + "step": 1193 + }, + { + "epoch": 0.9066919791172283, + "grad_norm": 0.345703125, + "learning_rate": 4.926333541114558e-07, + "loss": 1.1564, + "step": 1194 + }, + { + "epoch": 0.9074513526340768, + "grad_norm": 0.412109375, + "learning_rate": 4.846561066046063e-07, + "loss": 1.2107, + "step": 1195 + }, + { + "epoch": 0.9082107261509255, + "grad_norm": 0.380859375, + "learning_rate": 4.7674237125185597e-07, + "loss": 1.2019, + "step": 1196 + }, + { + "epoch": 0.9089700996677741, + "grad_norm": 0.37109375, + "learning_rate": 4.6889220087508514e-07, + "loss": 1.1731, + "step": 1197 + }, + { + "epoch": 0.9097294731846227, + "grad_norm": 0.341796875, + "learning_rate": 4.611056478719023e-07, + "loss": 1.1591, + "step": 1198 + }, + { + "epoch": 0.9104888467014712, + "grad_norm": 0.349609375, + "learning_rate": 4.5338276421528435e-07, + "loss": 1.1698, + "step": 1199 + }, + { + "epoch": 0.9112482202183199, + "grad_norm": 0.3359375, + "learning_rate": 4.45723601453234e-07, + "loss": 1.179, + "step": 1200 + }, + { + "epoch": 0.9120075937351685, + "grad_norm": 0.35546875, + "learning_rate": 4.3812821070843394e-07, + "loss": 1.1383, + "step": 1201 + }, + { + "epoch": 0.9127669672520171, + "grad_norm": 0.326171875, + "learning_rate": 4.305966426779118e-07, + "loss": 1.118, + "step": 1202 + }, + { + "epoch": 0.9135263407688657, + "grad_norm": 0.267578125, + "learning_rate": 4.2312894763269385e-07, + "loss": 1.1147, + "step": 1203 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 0.39453125, + "learning_rate": 4.1572517541747294e-07, + "loss": 1.2228, + "step": 1204 + }, + { + "epoch": 0.9150450878025629, + "grad_norm": 0.326171875, + "learning_rate": 4.0838537545027755e-07, + "loss": 1.144, + "step": 1205 + }, + { + "epoch": 0.9158044613194115, + "grad_norm": 0.32421875, + "learning_rate": 4.0110959672213676e-07, + "loss": 1.1403, + "step": 1206 + }, + { + "epoch": 0.9165638348362601, + "grad_norm": 0.341796875, + "learning_rate": 
3.9389788779675806e-07, + "loss": 1.1552, + "step": 1207 + }, + { + "epoch": 0.9173232083531087, + "grad_norm": 0.349609375, + "learning_rate": 3.867502968102055e-07, + "loss": 1.1785, + "step": 1208 + }, + { + "epoch": 0.9180825818699573, + "grad_norm": 0.357421875, + "learning_rate": 3.7966687147056533e-07, + "loss": 1.1487, + "step": 1209 + }, + { + "epoch": 0.9188419553868059, + "grad_norm": 0.3125, + "learning_rate": 3.7264765905764776e-07, + "loss": 1.1304, + "step": 1210 + }, + { + "epoch": 0.9196013289036545, + "grad_norm": 0.28515625, + "learning_rate": 3.656927064226512e-07, + "loss": 1.1109, + "step": 1211 + }, + { + "epoch": 0.920360702420503, + "grad_norm": 0.298828125, + "learning_rate": 3.588020599878639e-07, + "loss": 1.148, + "step": 1212 + }, + { + "epoch": 0.9211200759373517, + "grad_norm": 0.33984375, + "learning_rate": 3.519757657463474e-07, + "loss": 1.1745, + "step": 1213 + }, + { + "epoch": 0.9218794494542003, + "grad_norm": 0.34765625, + "learning_rate": 3.4521386926163134e-07, + "loss": 1.1452, + "step": 1214 + }, + { + "epoch": 0.9226388229710489, + "grad_norm": 0.330078125, + "learning_rate": 3.3851641566740813e-07, + "loss": 1.1598, + "step": 1215 + }, + { + "epoch": 0.9233981964878974, + "grad_norm": 0.365234375, + "learning_rate": 3.3188344966723516e-07, + "loss": 1.1889, + "step": 1216 + }, + { + "epoch": 0.9241575700047461, + "grad_norm": 0.345703125, + "learning_rate": 3.2531501553422884e-07, + "loss": 1.1822, + "step": 1217 + }, + { + "epoch": 0.9249169435215947, + "grad_norm": 0.318359375, + "learning_rate": 3.1881115711077994e-07, + "loss": 1.1675, + "step": 1218 + }, + { + "epoch": 0.9256763170384433, + "grad_norm": 0.294921875, + "learning_rate": 3.123719178082529e-07, + "loss": 1.1539, + "step": 1219 + }, + { + "epoch": 0.9264356905552918, + "grad_norm": 0.3671875, + "learning_rate": 3.059973406066963e-07, + "loss": 1.1554, + "step": 1220 + }, + { + "epoch": 0.9271950640721405, + "grad_norm": 0.361328125, + "learning_rate": 2.996874680545603e-07, + "loss": 1.1506, + "step": 1221 + }, + { + "epoch": 0.9279544375889891, + "grad_norm": 0.34765625, + "learning_rate": 2.9344234226840964e-07, + "loss": 1.167, + "step": 1222 + }, + { + "epoch": 0.9287138111058377, + "grad_norm": 0.302734375, + "learning_rate": 2.872620049326436e-07, + "loss": 1.1533, + "step": 1223 + }, + { + "epoch": 0.9294731846226862, + "grad_norm": 0.345703125, + "learning_rate": 2.811464972992195e-07, + "loss": 1.1686, + "step": 1224 + }, + { + "epoch": 0.9302325581395349, + "grad_norm": 0.361328125, + "learning_rate": 2.7509586018736764e-07, + "loss": 1.1638, + "step": 1225 + }, + { + "epoch": 0.9309919316563835, + "grad_norm": 0.361328125, + "learning_rate": 2.6911013398333464e-07, + "loss": 1.1969, + "step": 1226 + }, + { + "epoch": 0.9317513051732321, + "grad_norm": 0.333984375, + "learning_rate": 2.6318935864010133e-07, + "loss": 1.1527, + "step": 1227 + }, + { + "epoch": 0.9325106786900806, + "grad_norm": 0.357421875, + "learning_rate": 2.573335736771254e-07, + "loss": 1.1725, + "step": 1228 + }, + { + "epoch": 0.9332700522069293, + "grad_norm": 0.259765625, + "learning_rate": 2.51542818180065e-07, + "loss": 1.0826, + "step": 1229 + }, + { + "epoch": 0.9340294257237779, + "grad_norm": 0.314453125, + "learning_rate": 2.458171308005308e-07, + "loss": 1.1372, + "step": 1230 + }, + { + "epoch": 0.9347887992406265, + "grad_norm": 0.29296875, + "learning_rate": 2.4015654975582225e-07, + "loss": 1.1359, + "step": 1231 + }, + { + "epoch": 0.935548172757475, + "grad_norm": 0.294921875, + 
"learning_rate": 2.3456111282867178e-07, + "loss": 1.1214, + "step": 1232 + }, + { + "epoch": 0.9363075462743237, + "grad_norm": 0.28125, + "learning_rate": 2.2903085736699414e-07, + "loss": 1.0865, + "step": 1233 + }, + { + "epoch": 0.9370669197911723, + "grad_norm": 0.3828125, + "learning_rate": 2.2356582028363548e-07, + "loss": 1.1849, + "step": 1234 + }, + { + "epoch": 0.9378262933080209, + "grad_norm": 0.28125, + "learning_rate": 2.1816603805613012e-07, + "loss": 1.137, + "step": 1235 + }, + { + "epoch": 0.9385856668248694, + "grad_norm": 0.30859375, + "learning_rate": 2.1283154672645522e-07, + "loss": 1.1179, + "step": 1236 + }, + { + "epoch": 0.9393450403417181, + "grad_norm": 0.333984375, + "learning_rate": 2.0756238190078991e-07, + "loss": 1.1576, + "step": 1237 + }, + { + "epoch": 0.9401044138585667, + "grad_norm": 0.3359375, + "learning_rate": 2.0235857874927655e-07, + "loss": 1.1685, + "step": 1238 + }, + { + "epoch": 0.9408637873754153, + "grad_norm": 0.359375, + "learning_rate": 1.9722017200578757e-07, + "loss": 1.167, + "step": 1239 + }, + { + "epoch": 0.9416231608922638, + "grad_norm": 0.302734375, + "learning_rate": 1.921471959676957e-07, + "loss": 1.0967, + "step": 1240 + }, + { + "epoch": 0.9423825344091125, + "grad_norm": 0.35546875, + "learning_rate": 1.8713968449564079e-07, + "loss": 1.185, + "step": 1241 + }, + { + "epoch": 0.9431419079259611, + "grad_norm": 0.265625, + "learning_rate": 1.8219767101330442e-07, + "loss": 1.1248, + "step": 1242 + }, + { + "epoch": 0.9439012814428097, + "grad_norm": 0.353515625, + "learning_rate": 1.7732118850719237e-07, + "loss": 1.1056, + "step": 1243 + }, + { + "epoch": 0.9446606549596582, + "grad_norm": 0.322265625, + "learning_rate": 1.7251026952640583e-07, + "loss": 1.1053, + "step": 1244 + }, + { + "epoch": 0.9454200284765069, + "grad_norm": 0.328125, + "learning_rate": 1.6776494618243156e-07, + "loss": 1.1511, + "step": 1245 + }, + { + "epoch": 0.9461794019933555, + "grad_norm": 0.341796875, + "learning_rate": 1.6308525014892217e-07, + "loss": 1.1568, + "step": 1246 + }, + { + "epoch": 0.9469387755102041, + "grad_norm": 0.357421875, + "learning_rate": 1.5847121266148847e-07, + "loss": 1.1354, + "step": 1247 + }, + { + "epoch": 0.9476981490270526, + "grad_norm": 0.345703125, + "learning_rate": 1.539228645174895e-07, + "loss": 1.2015, + "step": 1248 + }, + { + "epoch": 0.9484575225439013, + "grad_norm": 0.29296875, + "learning_rate": 1.4944023607582737e-07, + "loss": 1.1045, + "step": 1249 + }, + { + "epoch": 0.9492168960607499, + "grad_norm": 0.318359375, + "learning_rate": 1.4502335725674165e-07, + "loss": 1.1576, + "step": 1250 + }, + { + "epoch": 0.9499762695775985, + "grad_norm": 0.32421875, + "learning_rate": 1.406722575416164e-07, + "loss": 1.1525, + "step": 1251 + }, + { + "epoch": 0.9507356430944471, + "grad_norm": 0.384765625, + "learning_rate": 1.3638696597277678e-07, + "loss": 1.1828, + "step": 1252 + }, + { + "epoch": 0.9514950166112957, + "grad_norm": 0.322265625, + "learning_rate": 1.3216751115329718e-07, + "loss": 1.1428, + "step": 1253 + }, + { + "epoch": 0.9522543901281443, + "grad_norm": 0.294921875, + "learning_rate": 1.2801392124681233e-07, + "loss": 1.1528, + "step": 1254 + }, + { + "epoch": 0.9530137636449929, + "grad_norm": 0.318359375, + "learning_rate": 1.2392622397732756e-07, + "loss": 1.1491, + "step": 1255 + }, + { + "epoch": 0.9537731371618415, + "grad_norm": 0.326171875, + "learning_rate": 1.1990444662903445e-07, + "loss": 1.2012, + "step": 1256 + }, + { + "epoch": 0.95453251067869, + "grad_norm": 
0.275390625, + "learning_rate": 1.159486160461265e-07, + "loss": 1.1128, + "step": 1257 + }, + { + "epoch": 0.9552918841955387, + "grad_norm": 0.33203125, + "learning_rate": 1.1205875863262272e-07, + "loss": 1.1725, + "step": 1258 + }, + { + "epoch": 0.9560512577123873, + "grad_norm": 0.359375, + "learning_rate": 1.0823490035218986e-07, + "loss": 1.1942, + "step": 1259 + }, + { + "epoch": 0.9568106312292359, + "grad_norm": 0.33984375, + "learning_rate": 1.0447706672797264e-07, + "loss": 1.1906, + "step": 1260 + }, + { + "epoch": 0.9575700047460844, + "grad_norm": 0.3671875, + "learning_rate": 1.0078528284241606e-07, + "loss": 1.1831, + "step": 1261 + }, + { + "epoch": 0.9583293782629331, + "grad_norm": 0.388671875, + "learning_rate": 9.715957333710447e-08, + "loss": 1.1504, + "step": 1262 + }, + { + "epoch": 0.9590887517797817, + "grad_norm": 0.322265625, + "learning_rate": 9.359996241259384e-08, + "loss": 1.1406, + "step": 1263 + }, + { + "epoch": 0.9598481252966303, + "grad_norm": 0.330078125, + "learning_rate": 9.010647382825421e-08, + "loss": 1.1464, + "step": 1264 + }, + { + "epoch": 0.9606074988134788, + "grad_norm": 0.341796875, + "learning_rate": 8.667913090210534e-08, + "loss": 1.1418, + "step": 1265 + }, + { + "epoch": 0.9613668723303275, + "grad_norm": 0.40234375, + "learning_rate": 8.331795651066455e-08, + "loss": 1.1785, + "step": 1266 + }, + { + "epoch": 0.9621262458471761, + "grad_norm": 0.333984375, + "learning_rate": 8.002297308879359e-08, + "loss": 1.1703, + "step": 1267 + }, + { + "epoch": 0.9628856193640247, + "grad_norm": 0.349609375, + "learning_rate": 7.679420262954984e-08, + "loss": 1.1569, + "step": 1268 + }, + { + "epoch": 0.9636449928808732, + "grad_norm": 0.3125, + "learning_rate": 7.363166668403643e-08, + "loss": 1.1488, + "step": 1269 + }, + { + "epoch": 0.9644043663977219, + "grad_norm": 0.37890625, + "learning_rate": 7.053538636126123e-08, + "loss": 1.1948, + "step": 1270 + }, + { + "epoch": 0.9651637399145705, + "grad_norm": 0.341796875, + "learning_rate": 6.750538232799586e-08, + "loss": 1.1496, + "step": 1271 + }, + { + "epoch": 0.9659231134314191, + "grad_norm": 0.330078125, + "learning_rate": 6.454167480863694e-08, + "loss": 1.1463, + "step": 1272 + }, + { + "epoch": 0.9666824869482676, + "grad_norm": 0.302734375, + "learning_rate": 6.164428358506947e-08, + "loss": 1.1507, + "step": 1273 + }, + { + "epoch": 0.9674418604651163, + "grad_norm": 0.369140625, + "learning_rate": 5.881322799653699e-08, + "loss": 1.1549, + "step": 1274 + }, + { + "epoch": 0.9682012339819649, + "grad_norm": 0.3203125, + "learning_rate": 5.6048526939512794e-08, + "loss": 1.1406, + "step": 1275 + }, + { + "epoch": 0.9689606074988135, + "grad_norm": 0.30859375, + "learning_rate": 5.3350198867574424e-08, + "loss": 1.1267, + "step": 1276 + }, + { + "epoch": 0.969719981015662, + "grad_norm": 0.31640625, + "learning_rate": 5.0718261791274924e-08, + "loss": 1.147, + "step": 1277 + }, + { + "epoch": 0.9704793545325107, + "grad_norm": 0.30078125, + "learning_rate": 4.815273327803183e-08, + "loss": 1.1504, + "step": 1278 + }, + { + "epoch": 0.9712387280493593, + "grad_norm": 0.29296875, + "learning_rate": 4.5653630451998335e-08, + "loss": 1.1471, + "step": 1279 + }, + { + "epoch": 0.9719981015662079, + "grad_norm": 0.3203125, + "learning_rate": 4.32209699939623e-08, + "loss": 1.1204, + "step": 1280 + }, + { + "epoch": 0.9727574750830564, + "grad_norm": 0.353515625, + "learning_rate": 4.085476814122413e-08, + "loss": 1.1692, + "step": 1281 + }, + { + "epoch": 0.9735168485999051, + 
"grad_norm": 0.310546875, + "learning_rate": 3.8555040687493494e-08, + "loss": 1.1089, + "step": 1282 + }, + { + "epoch": 0.9742762221167537, + "grad_norm": 0.279296875, + "learning_rate": 3.632180298278165e-08, + "loss": 1.0833, + "step": 1283 + }, + { + "epoch": 0.9750355956336023, + "grad_norm": 0.322265625, + "learning_rate": 3.4155069933301535e-08, + "loss": 1.1362, + "step": 1284 + }, + { + "epoch": 0.9757949691504508, + "grad_norm": 0.365234375, + "learning_rate": 3.2054856001366706e-08, + "loss": 1.2, + "step": 1285 + }, + { + "epoch": 0.9765543426672995, + "grad_norm": 0.439453125, + "learning_rate": 3.0021175205294794e-08, + "loss": 1.2642, + "step": 1286 + }, + { + "epoch": 0.9773137161841481, + "grad_norm": 0.365234375, + "learning_rate": 2.805404111931198e-08, + "loss": 1.1712, + "step": 1287 + }, + { + "epoch": 0.9780730897009967, + "grad_norm": 0.373046875, + "learning_rate": 2.6153466873468646e-08, + "loss": 1.1773, + "step": 1288 + }, + { + "epoch": 0.9788324632178452, + "grad_norm": 0.314453125, + "learning_rate": 2.4319465153543886e-08, + "loss": 1.1556, + "step": 1289 + }, + { + "epoch": 0.9795918367346939, + "grad_norm": 0.326171875, + "learning_rate": 2.255204820096668e-08, + "loss": 1.1467, + "step": 1290 + }, + { + "epoch": 0.9803512102515425, + "grad_norm": 0.34375, + "learning_rate": 2.0851227812731523e-08, + "loss": 1.1793, + "step": 1291 + }, + { + "epoch": 0.9811105837683911, + "grad_norm": 0.326171875, + "learning_rate": 1.9217015341318478e-08, + "loss": 1.1366, + "step": 1292 + }, + { + "epoch": 0.9818699572852396, + "grad_norm": 0.33984375, + "learning_rate": 1.764942169462325e-08, + "loss": 1.1893, + "step": 1293 + }, + { + "epoch": 0.9826293308020883, + "grad_norm": 0.291015625, + "learning_rate": 1.6148457335876112e-08, + "loss": 1.1308, + "step": 1294 + }, + { + "epoch": 0.9833887043189369, + "grad_norm": 0.4453125, + "learning_rate": 1.4714132283577543e-08, + "loss": 1.2597, + "step": 1295 + }, + { + "epoch": 0.9841480778357855, + "grad_norm": 0.294921875, + "learning_rate": 1.3346456111430484e-08, + "loss": 1.1048, + "step": 1296 + }, + { + "epoch": 0.984907451352634, + "grad_norm": 0.30078125, + "learning_rate": 1.2045437948275952e-08, + "loss": 1.1165, + "step": 1297 + }, + { + "epoch": 0.9856668248694826, + "grad_norm": 0.365234375, + "learning_rate": 1.0811086478031973e-08, + "loss": 1.1419, + "step": 1298 + }, + { + "epoch": 0.9864261983863313, + "grad_norm": 0.330078125, + "learning_rate": 9.643409939636972e-09, + "loss": 1.1656, + "step": 1299 + }, + { + "epoch": 0.9871855719031799, + "grad_norm": 0.318359375, + "learning_rate": 8.542416126989805e-09, + "loss": 1.1344, + "step": 1300 + }, + { + "epoch": 0.9879449454200285, + "grad_norm": 0.34375, + "learning_rate": 7.508112388905363e-09, + "loss": 1.1509, + "step": 1301 + }, + { + "epoch": 0.988704318936877, + "grad_norm": 0.365234375, + "learning_rate": 6.540505629061278e-09, + "loss": 1.1836, + "step": 1302 + }, + { + "epoch": 0.9894636924537257, + "grad_norm": 0.3359375, + "learning_rate": 5.639602305950176e-09, + "loss": 1.1659, + "step": 1303 + }, + { + "epoch": 0.9902230659705743, + "grad_norm": 0.32421875, + "learning_rate": 4.80540843283972e-09, + "loss": 1.1539, + "step": 1304 + }, + { + "epoch": 0.9909824394874229, + "grad_norm": 0.298828125, + "learning_rate": 4.037929577732636e-09, + "loss": 1.1051, + "step": 1305 + }, + { + "epoch": 0.9917418130042714, + "grad_norm": 0.3203125, + "learning_rate": 3.3371708633267443e-09, + "loss": 1.153, + "step": 1306 + }, + { + "epoch": 
0.9925011865211201, + "grad_norm": 0.3828125, + "learning_rate": 2.7031369669816566e-09, + "loss": 1.1997, + "step": 1307 + }, + { + "epoch": 0.9932605600379687, + "grad_norm": 0.283203125, + "learning_rate": 2.1358321206899067e-09, + "loss": 1.1305, + "step": 1308 + }, + { + "epoch": 0.9940199335548173, + "grad_norm": 0.265625, + "learning_rate": 1.6352601110469768e-09, + "loss": 1.0931, + "step": 1309 + }, + { + "epoch": 0.9947793070716658, + "grad_norm": 0.333984375, + "learning_rate": 1.20142427922465e-09, + "loss": 1.1754, + "step": 1310 + }, + { + "epoch": 0.9955386805885145, + "grad_norm": 0.39453125, + "learning_rate": 8.343275209521384e-10, + "loss": 1.2122, + "step": 1311 + }, + { + "epoch": 0.9962980541053631, + "grad_norm": 0.345703125, + "learning_rate": 5.339722864927677e-10, + "loss": 1.1428, + "step": 1312 + }, + { + "epoch": 0.9970574276222117, + "grad_norm": 0.326171875, + "learning_rate": 3.003605806306542e-10, + "loss": 1.1282, + "step": 1313 + }, + { + "epoch": 0.9978168011390602, + "grad_norm": 0.322265625, + "learning_rate": 1.3349396265516235e-10, + "loss": 1.1608, + "step": 1314 + }, + { + "epoch": 0.9985761746559089, + "grad_norm": 0.3203125, + "learning_rate": 3.3373546353132614e-11, + "loss": 1.1562, + "step": 1315 + }, + { + "epoch": 0.9993355481727575, + "grad_norm": 0.28515625, + "learning_rate": 0.0, + "loss": 1.1401, + "step": 1316 + }, + { + "epoch": 0.9993355481727575, + "eval_loss": 1.151589274406433, + "eval_runtime": 640.5297, + "eval_samples_per_second": 92.364, + "eval_steps_per_second": 7.698, + "step": 1316 + } + ], + "logging_steps": 1, + "max_steps": 1316, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.953779885289767e+18, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}
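
A minimal sketch (not part of the original diff) of how the `checkpoint-1316/trainer_state.json` file added above could be inspected after training: it loads the JSON, splits `log_history` into training and evaluation records, and prints the final metrics. The file path comes from this diff; the script itself is illustrative and assumes only the standard library.

```python
import json

# Path of the file added in this diff; adjust if the checkpoint lives elsewhere.
with open("checkpoint-1316/trainer_state.json") as f:
    state = json.load(f)

# log_history holds one dict per logged training step, plus a final eval record.
train_records = [r for r in state["log_history"] if "loss" in r]
eval_records = [r for r in state["log_history"] if "eval_loss" in r]

print(f"global_step:  {state['global_step']}")
print(f"final loss:   {train_records[-1]['loss']:.4f}")
if eval_records:
    print(f"eval loss:    {eval_records[-1]['eval_loss']:.4f}")
    print(f"eval runtime: {eval_records[-1]['eval_runtime']:.1f}s")
```

For the state recorded here this would report global_step 1316, a final training loss of 1.1401, and an eval loss of about 1.1516.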