{ "best_metric": 1.1857857704162598, "best_model_checkpoint": "runs/deepseek-full-hard-low-lr/checkpoint-92500", "epoch": 2.5, "eval_steps": 2500, "global_step": 100000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 24.515625, "learning_rate": 6e-06, "loss": 1.9991, "step": 100 }, { "epoch": 0.01, "grad_norm": 22.859375, "learning_rate": 1.2e-05, "loss": 1.8155, "step": 200 }, { "epoch": 0.01, "grad_norm": 8.25, "learning_rate": 1.8e-05, "loss": 1.8039, "step": 300 }, { "epoch": 0.01, "grad_norm": 8.0234375, "learning_rate": 2.4e-05, "loss": 1.8399, "step": 400 }, { "epoch": 0.01, "grad_norm": 17.375, "learning_rate": 3e-05, "loss": 1.8311, "step": 500 }, { "epoch": 0.01, "grad_norm": 11.421875, "learning_rate": 2.9969849246231156e-05, "loss": 1.8306, "step": 600 }, { "epoch": 0.02, "grad_norm": 12.40625, "learning_rate": 2.993969849246231e-05, "loss": 1.8191, "step": 700 }, { "epoch": 0.02, "grad_norm": 4.71484375, "learning_rate": 2.990954773869347e-05, "loss": 1.8153, "step": 800 }, { "epoch": 0.02, "grad_norm": 8.1796875, "learning_rate": 2.9879396984924625e-05, "loss": 1.8115, "step": 900 }, { "epoch": 0.03, "grad_norm": 12.7734375, "learning_rate": 2.984924623115578e-05, "loss": 1.8093, "step": 1000 }, { "epoch": 0.03, "grad_norm": 7.9140625, "learning_rate": 2.9819095477386932e-05, "loss": 1.8475, "step": 1100 }, { "epoch": 0.03, "grad_norm": 4.83203125, "learning_rate": 2.978894472361809e-05, "loss": 1.7814, "step": 1200 }, { "epoch": 0.03, "grad_norm": 10.2421875, "learning_rate": 2.9758793969849246e-05, "loss": 1.7792, "step": 1300 }, { "epoch": 0.04, "grad_norm": 5.38671875, "learning_rate": 2.97286432160804e-05, "loss": 1.7633, "step": 1400 }, { "epoch": 0.04, "grad_norm": 7.8515625, "learning_rate": 2.9698492462311557e-05, "loss": 1.8018, "step": 1500 }, { "epoch": 0.04, "grad_norm": 7.8671875, "learning_rate": 2.9668341708542715e-05, "loss": 1.7645, "step": 1600 }, { "epoch": 0.04, "grad_norm": 6.91015625, "learning_rate": 2.963819095477387e-05, "loss": 1.7771, "step": 1700 }, { "epoch": 0.04, "grad_norm": 4.61328125, "learning_rate": 2.9608040201005026e-05, "loss": 1.7965, "step": 1800 }, { "epoch": 0.05, "grad_norm": 8.6796875, "learning_rate": 2.957788944723618e-05, "loss": 1.767, "step": 1900 }, { "epoch": 0.05, "grad_norm": 6.11328125, "learning_rate": 2.954773869346734e-05, "loss": 1.7448, "step": 2000 }, { "epoch": 0.05, "grad_norm": 4.79296875, "learning_rate": 2.9517587939698495e-05, "loss": 1.7822, "step": 2100 }, { "epoch": 0.06, "grad_norm": 6.8046875, "learning_rate": 2.9487437185929647e-05, "loss": 1.7997, "step": 2200 }, { "epoch": 0.06, "grad_norm": 5.39453125, "learning_rate": 2.9457286432160802e-05, "loss": 1.7795, "step": 2300 }, { "epoch": 0.06, "grad_norm": 9.2578125, "learning_rate": 2.942713567839196e-05, "loss": 1.7529, "step": 2400 }, { "epoch": 0.06, "grad_norm": 8.8828125, "learning_rate": 2.9396984924623116e-05, "loss": 1.7433, "step": 2500 }, { "epoch": 0.06, "eval_loss": 1.7661924362182617, "eval_runtime": 58.5928, "eval_samples_per_second": 17.067, "eval_steps_per_second": 4.267, "step": 2500 }, { "epoch": 0.07, "grad_norm": 7.8984375, "learning_rate": 2.936683417085427e-05, "loss": 1.7891, "step": 2600 }, { "epoch": 0.07, "grad_norm": 9.3828125, "learning_rate": 2.9336683417085427e-05, "loss": 1.7309, "step": 2700 }, { "epoch": 0.07, "grad_norm": 6.703125, "learning_rate": 2.9306532663316585e-05, "loss": 1.6948, "step": 2800 }, { "epoch": 0.07, "grad_norm": 9.546875, "learning_rate": 2.927638190954774e-05, "loss": 1.7142, "step": 2900 }, { "epoch": 0.07, "grad_norm": 8.59375, "learning_rate": 2.9246231155778896e-05, "loss": 1.7304, "step": 3000 }, { "epoch": 0.08, "grad_norm": 5.9375, "learning_rate": 2.921608040201005e-05, "loss": 1.7201, "step": 3100 }, { "epoch": 0.08, "grad_norm": 11.7734375, "learning_rate": 2.9185929648241207e-05, "loss": 1.7399, "step": 3200 }, { "epoch": 0.08, "grad_norm": 10.2578125, "learning_rate": 2.9155778894472365e-05, "loss": 1.7489, "step": 3300 }, { "epoch": 0.09, "grad_norm": 5.3671875, "learning_rate": 2.9125628140703517e-05, "loss": 1.7593, "step": 3400 }, { "epoch": 0.09, "grad_norm": 4.7890625, "learning_rate": 2.9095477386934672e-05, "loss": 1.7298, "step": 3500 }, { "epoch": 0.09, "grad_norm": 5.6328125, "learning_rate": 2.9065326633165828e-05, "loss": 1.727, "step": 3600 }, { "epoch": 0.09, "grad_norm": 9.0546875, "learning_rate": 2.9035175879396986e-05, "loss": 1.7101, "step": 3700 }, { "epoch": 0.1, "grad_norm": 6.26171875, "learning_rate": 2.900502512562814e-05, "loss": 1.728, "step": 3800 }, { "epoch": 0.1, "grad_norm": 8.0078125, "learning_rate": 2.8974874371859297e-05, "loss": 1.7085, "step": 3900 }, { "epoch": 0.1, "grad_norm": 5.2265625, "learning_rate": 2.8944723618090452e-05, "loss": 1.7097, "step": 4000 }, { "epoch": 0.1, "grad_norm": 5.74609375, "learning_rate": 2.891457286432161e-05, "loss": 1.7237, "step": 4100 }, { "epoch": 0.1, "grad_norm": 8.3203125, "learning_rate": 2.8884422110552766e-05, "loss": 1.7338, "step": 4200 }, { "epoch": 0.11, "grad_norm": 7.015625, "learning_rate": 2.885427135678392e-05, "loss": 1.731, "step": 4300 }, { "epoch": 0.11, "grad_norm": 7.00390625, "learning_rate": 2.8824120603015077e-05, "loss": 1.7338, "step": 4400 }, { "epoch": 0.11, "grad_norm": 4.61328125, "learning_rate": 2.8793969849246232e-05, "loss": 1.7275, "step": 4500 }, { "epoch": 0.12, "grad_norm": 7.4375, "learning_rate": 2.8763819095477387e-05, "loss": 1.737, "step": 4600 }, { "epoch": 0.12, "grad_norm": 9.625, "learning_rate": 2.8733668341708542e-05, "loss": 1.6846, "step": 4700 }, { "epoch": 0.12, "grad_norm": 6.83984375, "learning_rate": 2.8703517587939698e-05, "loss": 1.7758, "step": 4800 }, { "epoch": 0.12, "grad_norm": 5.9609375, "learning_rate": 2.8673366834170856e-05, "loss": 1.6865, "step": 4900 }, { "epoch": 0.12, "grad_norm": 7.47265625, "learning_rate": 2.864321608040201e-05, "loss": 1.7066, "step": 5000 }, { "epoch": 0.12, "eval_loss": 1.6524672508239746, "eval_runtime": 58.7554, "eval_samples_per_second": 17.02, "eval_steps_per_second": 4.255, "step": 5000 }, { "epoch": 0.13, "grad_norm": 7.13671875, "learning_rate": 2.8613065326633167e-05, "loss": 1.6966, "step": 5100 }, { "epoch": 0.13, "grad_norm": 8.0, "learning_rate": 2.8582914572864322e-05, "loss": 1.7145, "step": 5200 }, { "epoch": 0.13, "grad_norm": 7.125, "learning_rate": 2.8552763819095477e-05, "loss": 1.7035, "step": 5300 }, { "epoch": 0.14, "grad_norm": 9.4453125, "learning_rate": 2.8522613065326636e-05, "loss": 1.7273, "step": 5400 }, { "epoch": 0.14, "grad_norm": 6.609375, "learning_rate": 2.849246231155779e-05, "loss": 1.685, "step": 5500 }, { "epoch": 0.14, "grad_norm": 4.6171875, "learning_rate": 2.8462311557788943e-05, "loss": 1.6505, "step": 5600 }, { "epoch": 0.14, "grad_norm": 7.92578125, "learning_rate": 2.84321608040201e-05, "loss": 1.7042, "step": 5700 }, { "epoch": 0.14, "grad_norm": 6.94140625, "learning_rate": 2.8402010050251257e-05, "loss": 1.6963, "step": 5800 }, { "epoch": 0.15, "grad_norm": 7.96484375, "learning_rate": 2.8371859296482412e-05, "loss": 1.7243, "step": 5900 }, { "epoch": 0.15, "grad_norm": 7.5078125, "learning_rate": 2.8341708542713568e-05, "loss": 1.6677, "step": 6000 }, { "epoch": 0.15, "grad_norm": 7.20703125, "learning_rate": 2.8311557788944723e-05, "loss": 1.679, "step": 6100 }, { "epoch": 0.15, "grad_norm": 4.51953125, "learning_rate": 2.828140703517588e-05, "loss": 1.6874, "step": 6200 }, { "epoch": 0.16, "grad_norm": 9.5546875, "learning_rate": 2.8251256281407037e-05, "loss": 1.6843, "step": 6300 }, { "epoch": 0.16, "grad_norm": 6.48046875, "learning_rate": 2.8221105527638192e-05, "loss": 1.7014, "step": 6400 }, { "epoch": 0.16, "grad_norm": 7.5546875, "learning_rate": 2.8190954773869347e-05, "loss": 1.6564, "step": 6500 }, { "epoch": 0.17, "grad_norm": 6.37890625, "learning_rate": 2.8160804020100506e-05, "loss": 1.7107, "step": 6600 }, { "epoch": 0.17, "grad_norm": 9.5, "learning_rate": 2.813065326633166e-05, "loss": 1.7179, "step": 6700 }, { "epoch": 0.17, "grad_norm": 5.59765625, "learning_rate": 2.8100502512562813e-05, "loss": 1.6851, "step": 6800 }, { "epoch": 0.17, "grad_norm": 7.33984375, "learning_rate": 2.807035175879397e-05, "loss": 1.6718, "step": 6900 }, { "epoch": 0.17, "grad_norm": 7.5, "learning_rate": 2.8040201005025127e-05, "loss": 1.6659, "step": 7000 }, { "epoch": 0.18, "grad_norm": 9.90625, "learning_rate": 2.8010050251256282e-05, "loss": 1.6703, "step": 7100 }, { "epoch": 0.18, "grad_norm": 7.30078125, "learning_rate": 2.7979899497487438e-05, "loss": 1.6585, "step": 7200 }, { "epoch": 0.18, "grad_norm": 5.578125, "learning_rate": 2.7949748743718593e-05, "loss": 1.6567, "step": 7300 }, { "epoch": 0.18, "grad_norm": 6.83203125, "learning_rate": 2.791959798994975e-05, "loss": 1.633, "step": 7400 }, { "epoch": 0.19, "grad_norm": 6.90234375, "learning_rate": 2.7889447236180907e-05, "loss": 1.6951, "step": 7500 }, { "epoch": 0.19, "eval_loss": 1.6509525775909424, "eval_runtime": 58.6377, "eval_samples_per_second": 17.054, "eval_steps_per_second": 4.263, "step": 7500 }, { "epoch": 0.19, "grad_norm": 11.390625, "learning_rate": 2.7859296482412062e-05, "loss": 1.6665, "step": 7600 }, { "epoch": 0.19, "grad_norm": 6.58203125, "learning_rate": 2.7829145728643217e-05, "loss": 1.6476, "step": 7700 }, { "epoch": 0.2, "grad_norm": 5.8671875, "learning_rate": 2.7798994974874373e-05, "loss": 1.6488, "step": 7800 }, { "epoch": 0.2, "grad_norm": 4.69140625, "learning_rate": 2.7768844221105528e-05, "loss": 1.618, "step": 7900 }, { "epoch": 0.2, "grad_norm": 13.1796875, "learning_rate": 2.7738693467336683e-05, "loss": 1.6614, "step": 8000 }, { "epoch": 0.2, "grad_norm": 9.1171875, "learning_rate": 2.770854271356784e-05, "loss": 1.6136, "step": 8100 }, { "epoch": 0.2, "grad_norm": 6.9140625, "learning_rate": 2.7678391959798994e-05, "loss": 1.6854, "step": 8200 }, { "epoch": 0.21, "grad_norm": 7.15625, "learning_rate": 2.7648241206030152e-05, "loss": 1.6302, "step": 8300 }, { "epoch": 0.21, "grad_norm": 5.05078125, "learning_rate": 2.7618090452261308e-05, "loss": 1.5978, "step": 8400 }, { "epoch": 0.21, "grad_norm": 9.4765625, "learning_rate": 2.7587939698492463e-05, "loss": 1.6275, "step": 8500 }, { "epoch": 0.21, "grad_norm": 4.4765625, "learning_rate": 2.7557788944723618e-05, "loss": 1.6216, "step": 8600 }, { "epoch": 0.22, "grad_norm": 6.62109375, "learning_rate": 2.7527638190954777e-05, "loss": 1.6664, "step": 8700 }, { "epoch": 0.22, "grad_norm": 5.609375, "learning_rate": 2.7497487437185932e-05, "loss": 1.6589, "step": 8800 }, { "epoch": 0.22, "grad_norm": 7.2109375, "learning_rate": 2.7467336683417087e-05, "loss": 1.654, "step": 8900 }, { "epoch": 0.23, "grad_norm": 5.9921875, "learning_rate": 2.743718592964824e-05, "loss": 1.639, "step": 9000 }, { "epoch": 0.23, "grad_norm": 9.21875, "learning_rate": 2.7407035175879398e-05, "loss": 1.6147, "step": 9100 }, { "epoch": 0.23, "grad_norm": 6.84765625, "learning_rate": 2.7376884422110553e-05, "loss": 1.6392, "step": 9200 }, { "epoch": 0.23, "grad_norm": 4.75390625, "learning_rate": 2.734673366834171e-05, "loss": 1.6202, "step": 9300 }, { "epoch": 0.23, "grad_norm": 6.0, "learning_rate": 2.7316582914572864e-05, "loss": 1.6408, "step": 9400 }, { "epoch": 0.24, "grad_norm": 4.44921875, "learning_rate": 2.7286432160804022e-05, "loss": 1.6437, "step": 9500 }, { "epoch": 0.24, "grad_norm": 3.9921875, "learning_rate": 2.7256281407035178e-05, "loss": 1.6165, "step": 9600 }, { "epoch": 0.24, "grad_norm": 7.15234375, "learning_rate": 2.7226130653266333e-05, "loss": 1.6217, "step": 9700 }, { "epoch": 0.24, "grad_norm": 5.38671875, "learning_rate": 2.7195979899497488e-05, "loss": 1.6584, "step": 9800 }, { "epoch": 0.25, "grad_norm": 6.62109375, "learning_rate": 2.7165829145728643e-05, "loss": 1.6204, "step": 9900 }, { "epoch": 0.25, "grad_norm": 8.6796875, "learning_rate": 2.7135678391959802e-05, "loss": 1.6176, "step": 10000 }, { "epoch": 0.25, "eval_loss": 1.5965248346328735, "eval_runtime": 58.635, "eval_samples_per_second": 17.055, "eval_steps_per_second": 4.264, "step": 10000 }, { "epoch": 0.25, "grad_norm": 4.96484375, "learning_rate": 2.7105527638190957e-05, "loss": 1.6494, "step": 10100 }, { "epoch": 0.26, "grad_norm": 5.84375, "learning_rate": 2.707537688442211e-05, "loss": 1.5987, "step": 10200 }, { "epoch": 0.26, "grad_norm": 8.953125, "learning_rate": 2.7045226130653264e-05, "loss": 1.6262, "step": 10300 }, { "epoch": 0.26, "grad_norm": 6.92578125, "learning_rate": 2.7015075376884423e-05, "loss": 1.6362, "step": 10400 }, { "epoch": 0.26, "grad_norm": 5.35546875, "learning_rate": 2.698492462311558e-05, "loss": 1.6, "step": 10500 }, { "epoch": 0.27, "grad_norm": 7.41015625, "learning_rate": 2.6954773869346734e-05, "loss": 1.693, "step": 10600 }, { "epoch": 0.27, "grad_norm": 5.64453125, "learning_rate": 2.692462311557789e-05, "loss": 1.6496, "step": 10700 }, { "epoch": 0.27, "grad_norm": 4.953125, "learning_rate": 2.6894472361809048e-05, "loss": 1.6043, "step": 10800 }, { "epoch": 0.27, "grad_norm": 5.82421875, "learning_rate": 2.6864321608040203e-05, "loss": 1.6291, "step": 10900 }, { "epoch": 0.28, "grad_norm": 6.1484375, "learning_rate": 2.6834170854271358e-05, "loss": 1.5974, "step": 11000 }, { "epoch": 0.28, "grad_norm": 5.90234375, "learning_rate": 2.6804020100502513e-05, "loss": 1.6811, "step": 11100 }, { "epoch": 0.28, "grad_norm": 8.2890625, "learning_rate": 2.6773869346733672e-05, "loss": 1.6045, "step": 11200 }, { "epoch": 0.28, "grad_norm": 5.09765625, "learning_rate": 2.6743718592964824e-05, "loss": 1.6116, "step": 11300 }, { "epoch": 0.28, "grad_norm": 4.29296875, "learning_rate": 2.671356783919598e-05, "loss": 1.5674, "step": 11400 }, { "epoch": 0.29, "grad_norm": 5.8359375, "learning_rate": 2.6683417085427134e-05, "loss": 1.6231, "step": 11500 }, { "epoch": 0.29, "grad_norm": 5.5234375, "learning_rate": 2.6653266331658293e-05, "loss": 1.618, "step": 11600 }, { "epoch": 0.29, "grad_norm": 6.93359375, "learning_rate": 2.6623115577889448e-05, "loss": 1.5943, "step": 11700 }, { "epoch": 0.29, "grad_norm": 6.33203125, "learning_rate": 2.6592964824120604e-05, "loss": 1.6126, "step": 11800 }, { "epoch": 0.3, "grad_norm": 9.625, "learning_rate": 2.656281407035176e-05, "loss": 1.5968, "step": 11900 }, { "epoch": 0.3, "grad_norm": 6.55859375, "learning_rate": 2.6532663316582917e-05, "loss": 1.6228, "step": 12000 }, { "epoch": 0.3, "grad_norm": 5.8359375, "learning_rate": 2.6502512562814073e-05, "loss": 1.6006, "step": 12100 }, { "epoch": 0.3, "grad_norm": 10.2578125, "learning_rate": 2.6472361809045228e-05, "loss": 1.5515, "step": 12200 }, { "epoch": 0.31, "grad_norm": 7.23046875, "learning_rate": 2.6442211055276383e-05, "loss": 1.6433, "step": 12300 }, { "epoch": 0.31, "grad_norm": 7.5625, "learning_rate": 2.6412060301507535e-05, "loss": 1.6273, "step": 12400 }, { "epoch": 0.31, "grad_norm": 4.7421875, "learning_rate": 2.6381909547738694e-05, "loss": 1.5738, "step": 12500 }, { "epoch": 0.31, "eval_loss": 1.6012969017028809, "eval_runtime": 58.7203, "eval_samples_per_second": 17.03, "eval_steps_per_second": 4.257, "step": 12500 }, { "epoch": 0.32, "grad_norm": 8.2421875, "learning_rate": 2.635175879396985e-05, "loss": 1.5637, "step": 12600 }, { "epoch": 0.32, "grad_norm": 7.13671875, "learning_rate": 2.6321608040201004e-05, "loss": 1.6395, "step": 12700 }, { "epoch": 0.32, "grad_norm": 9.0390625, "learning_rate": 2.629145728643216e-05, "loss": 1.6293, "step": 12800 }, { "epoch": 0.32, "grad_norm": 6.546875, "learning_rate": 2.6261306532663318e-05, "loss": 1.5729, "step": 12900 }, { "epoch": 0.33, "grad_norm": 5.2734375, "learning_rate": 2.6231155778894474e-05, "loss": 1.5743, "step": 13000 }, { "epoch": 0.33, "grad_norm": 5.83984375, "learning_rate": 2.620100502512563e-05, "loss": 1.5871, "step": 13100 }, { "epoch": 0.33, "grad_norm": 4.55078125, "learning_rate": 2.6170854271356784e-05, "loss": 1.6303, "step": 13200 }, { "epoch": 0.33, "grad_norm": 7.61328125, "learning_rate": 2.6140703517587943e-05, "loss": 1.5893, "step": 13300 }, { "epoch": 0.34, "grad_norm": 9.7890625, "learning_rate": 2.6110552763819098e-05, "loss": 1.6201, "step": 13400 }, { "epoch": 0.34, "grad_norm": 7.26953125, "learning_rate": 2.6080402010050253e-05, "loss": 1.5352, "step": 13500 }, { "epoch": 0.34, "grad_norm": 5.3515625, "learning_rate": 2.6050251256281405e-05, "loss": 1.5675, "step": 13600 }, { "epoch": 0.34, "grad_norm": 6.16796875, "learning_rate": 2.6020100502512564e-05, "loss": 1.5841, "step": 13700 }, { "epoch": 0.34, "grad_norm": 4.64453125, "learning_rate": 2.598994974874372e-05, "loss": 1.5995, "step": 13800 }, { "epoch": 0.35, "grad_norm": 7.046875, "learning_rate": 2.5959798994974874e-05, "loss": 1.5882, "step": 13900 }, { "epoch": 0.35, "grad_norm": 9.5703125, "learning_rate": 2.592964824120603e-05, "loss": 1.5834, "step": 14000 }, { "epoch": 0.35, "grad_norm": 6.1640625, "learning_rate": 2.5899497487437188e-05, "loss": 1.5723, "step": 14100 }, { "epoch": 0.35, "grad_norm": 6.83984375, "learning_rate": 2.5869346733668344e-05, "loss": 1.5971, "step": 14200 }, { "epoch": 0.36, "grad_norm": 6.5625, "learning_rate": 2.58391959798995e-05, "loss": 1.5571, "step": 14300 }, { "epoch": 0.36, "grad_norm": 4.80859375, "learning_rate": 2.5809045226130654e-05, "loss": 1.6284, "step": 14400 }, { "epoch": 0.36, "grad_norm": 7.25390625, "learning_rate": 2.577889447236181e-05, "loss": 1.6099, "step": 14500 }, { "epoch": 0.36, "grad_norm": 7.44921875, "learning_rate": 2.5748743718592968e-05, "loss": 1.6083, "step": 14600 }, { "epoch": 0.37, "grad_norm": 5.109375, "learning_rate": 2.571859296482412e-05, "loss": 1.5841, "step": 14700 }, { "epoch": 0.37, "grad_norm": 6.93359375, "learning_rate": 2.5688442211055275e-05, "loss": 1.5752, "step": 14800 }, { "epoch": 0.37, "grad_norm": 7.38671875, "learning_rate": 2.565829145728643e-05, "loss": 1.5535, "step": 14900 }, { "epoch": 0.38, "grad_norm": 5.40234375, "learning_rate": 2.562814070351759e-05, "loss": 1.5804, "step": 15000 }, { "epoch": 0.38, "eval_loss": 1.5745244026184082, "eval_runtime": 58.6931, "eval_samples_per_second": 17.038, "eval_steps_per_second": 4.259, "step": 15000 }, { "epoch": 0.38, "grad_norm": 5.33203125, "learning_rate": 2.5597989949748744e-05, "loss": 1.527, "step": 15100 }, { "epoch": 0.38, "grad_norm": 6.8828125, "learning_rate": 2.55678391959799e-05, "loss": 1.5769, "step": 15200 }, { "epoch": 0.38, "grad_norm": 7.4140625, "learning_rate": 2.5537688442211055e-05, "loss": 1.5983, "step": 15300 }, { "epoch": 0.39, "grad_norm": 5.0390625, "learning_rate": 2.5507537688442214e-05, "loss": 1.5873, "step": 15400 }, { "epoch": 0.39, "grad_norm": 6.39453125, "learning_rate": 2.547738693467337e-05, "loss": 1.5888, "step": 15500 }, { "epoch": 0.39, "grad_norm": 7.703125, "learning_rate": 2.5447236180904524e-05, "loss": 1.559, "step": 15600 }, { "epoch": 0.39, "grad_norm": 10.5859375, "learning_rate": 2.541708542713568e-05, "loss": 1.5862, "step": 15700 }, { "epoch": 0.4, "grad_norm": 6.8984375, "learning_rate": 2.5386934673366835e-05, "loss": 1.5644, "step": 15800 }, { "epoch": 0.4, "grad_norm": 5.9453125, "learning_rate": 2.535678391959799e-05, "loss": 1.5663, "step": 15900 }, { "epoch": 0.4, "grad_norm": 6.1171875, "learning_rate": 2.5326633165829145e-05, "loss": 1.6077, "step": 16000 }, { "epoch": 0.4, "grad_norm": 5.06640625, "learning_rate": 2.52964824120603e-05, "loss": 1.52, "step": 16100 }, { "epoch": 0.41, "grad_norm": 5.140625, "learning_rate": 2.526633165829146e-05, "loss": 1.5387, "step": 16200 }, { "epoch": 0.41, "grad_norm": 5.19140625, "learning_rate": 2.5236180904522614e-05, "loss": 1.5523, "step": 16300 }, { "epoch": 0.41, "grad_norm": 6.859375, "learning_rate": 2.520603015075377e-05, "loss": 1.5684, "step": 16400 }, { "epoch": 0.41, "grad_norm": 8.9765625, "learning_rate": 2.5175879396984925e-05, "loss": 1.591, "step": 16500 }, { "epoch": 0.41, "grad_norm": 6.6640625, "learning_rate": 2.5145728643216084e-05, "loss": 1.6167, "step": 16600 }, { "epoch": 0.42, "grad_norm": 7.8203125, "learning_rate": 2.511557788944724e-05, "loss": 1.5475, "step": 16700 }, { "epoch": 0.42, "grad_norm": 6.125, "learning_rate": 2.5085427135678394e-05, "loss": 1.5604, "step": 16800 }, { "epoch": 0.42, "grad_norm": 6.7421875, "learning_rate": 2.5055276381909546e-05, "loss": 1.6347, "step": 16900 }, { "epoch": 0.42, "grad_norm": 8.8828125, "learning_rate": 2.50251256281407e-05, "loss": 1.5626, "step": 17000 }, { "epoch": 0.43, "grad_norm": 8.0078125, "learning_rate": 2.499497487437186e-05, "loss": 1.5119, "step": 17100 }, { "epoch": 0.43, "grad_norm": 6.69921875, "learning_rate": 2.4964824120603015e-05, "loss": 1.5075, "step": 17200 }, { "epoch": 0.43, "grad_norm": 7.83203125, "learning_rate": 2.493467336683417e-05, "loss": 1.5604, "step": 17300 }, { "epoch": 0.43, "grad_norm": 6.86328125, "learning_rate": 2.4904522613065326e-05, "loss": 1.5727, "step": 17400 }, { "epoch": 0.44, "grad_norm": 7.91015625, "learning_rate": 2.4874371859296484e-05, "loss": 1.538, "step": 17500 }, { "epoch": 0.44, "eval_loss": 1.5860785245895386, "eval_runtime": 58.5282, "eval_samples_per_second": 17.086, "eval_steps_per_second": 4.271, "step": 17500 }, { "epoch": 0.44, "grad_norm": 8.484375, "learning_rate": 2.484422110552764e-05, "loss": 1.5551, "step": 17600 }, { "epoch": 0.44, "grad_norm": 7.64453125, "learning_rate": 2.4814070351758795e-05, "loss": 1.5169, "step": 17700 }, { "epoch": 0.45, "grad_norm": 8.4375, "learning_rate": 2.478391959798995e-05, "loss": 1.5385, "step": 17800 }, { "epoch": 0.45, "grad_norm": 6.08203125, "learning_rate": 2.475376884422111e-05, "loss": 1.5492, "step": 17900 }, { "epoch": 0.45, "grad_norm": 11.84375, "learning_rate": 2.4723618090452264e-05, "loss": 1.5695, "step": 18000 }, { "epoch": 0.45, "grad_norm": 5.98046875, "learning_rate": 2.4693467336683416e-05, "loss": 1.5804, "step": 18100 }, { "epoch": 0.46, "grad_norm": 7.75, "learning_rate": 2.466331658291457e-05, "loss": 1.5567, "step": 18200 }, { "epoch": 0.46, "grad_norm": 5.51953125, "learning_rate": 2.463316582914573e-05, "loss": 1.5492, "step": 18300 }, { "epoch": 0.46, "grad_norm": 6.86328125, "learning_rate": 2.4603015075376885e-05, "loss": 1.5733, "step": 18400 }, { "epoch": 0.46, "grad_norm": 6.56640625, "learning_rate": 2.457286432160804e-05, "loss": 1.586, "step": 18500 }, { "epoch": 0.47, "grad_norm": 4.23828125, "learning_rate": 2.4542713567839196e-05, "loss": 1.5428, "step": 18600 }, { "epoch": 0.47, "grad_norm": 7.76171875, "learning_rate": 2.4512562814070354e-05, "loss": 1.5085, "step": 18700 }, { "epoch": 0.47, "grad_norm": 6.984375, "learning_rate": 2.448241206030151e-05, "loss": 1.5726, "step": 18800 }, { "epoch": 0.47, "grad_norm": 7.92578125, "learning_rate": 2.4452261306532665e-05, "loss": 1.5666, "step": 18900 }, { "epoch": 0.47, "grad_norm": 7.66796875, "learning_rate": 2.442211055276382e-05, "loss": 1.4978, "step": 19000 }, { "epoch": 0.48, "grad_norm": 6.89453125, "learning_rate": 2.4391959798994975e-05, "loss": 1.5385, "step": 19100 }, { "epoch": 0.48, "grad_norm": 10.1484375, "learning_rate": 2.436180904522613e-05, "loss": 1.5172, "step": 19200 }, { "epoch": 0.48, "grad_norm": 6.75, "learning_rate": 2.4331658291457286e-05, "loss": 1.5754, "step": 19300 }, { "epoch": 0.48, "grad_norm": 4.90625, "learning_rate": 2.430150753768844e-05, "loss": 1.4749, "step": 19400 }, { "epoch": 0.49, "grad_norm": 8.3515625, "learning_rate": 2.4271356783919596e-05, "loss": 1.5676, "step": 19500 }, { "epoch": 0.49, "grad_norm": 9.671875, "learning_rate": 2.4241206030150755e-05, "loss": 1.5169, "step": 19600 }, { "epoch": 0.49, "grad_norm": 7.02734375, "learning_rate": 2.421105527638191e-05, "loss": 1.5026, "step": 19700 }, { "epoch": 0.49, "grad_norm": 8.0390625, "learning_rate": 2.4180904522613066e-05, "loss": 1.5726, "step": 19800 }, { "epoch": 0.5, "grad_norm": 6.265625, "learning_rate": 2.415075376884422e-05, "loss": 1.4948, "step": 19900 }, { "epoch": 0.5, "grad_norm": 6.828125, "learning_rate": 2.412060301507538e-05, "loss": 1.553, "step": 20000 }, { "epoch": 0.5, "eval_loss": 1.5626012086868286, "eval_runtime": 58.5266, "eval_samples_per_second": 17.086, "eval_steps_per_second": 4.272, "step": 20000 }, { "epoch": 0.5, "grad_norm": 6.4453125, "learning_rate": 2.4090452261306535e-05, "loss": 1.5808, "step": 20100 }, { "epoch": 0.51, "grad_norm": 5.28515625, "learning_rate": 2.406030150753769e-05, "loss": 1.5917, "step": 20200 }, { "epoch": 0.51, "grad_norm": 12.0390625, "learning_rate": 2.4030150753768842e-05, "loss": 1.5786, "step": 20300 }, { "epoch": 0.51, "grad_norm": 8.3125, "learning_rate": 2.4e-05, "loss": 1.5467, "step": 20400 }, { "epoch": 0.51, "grad_norm": 5.46484375, "learning_rate": 2.3969849246231156e-05, "loss": 1.5572, "step": 20500 }, { "epoch": 0.52, "grad_norm": 7.2109375, "learning_rate": 2.393969849246231e-05, "loss": 1.4939, "step": 20600 }, { "epoch": 0.52, "grad_norm": 8.703125, "learning_rate": 2.3909547738693466e-05, "loss": 1.5634, "step": 20700 }, { "epoch": 0.52, "grad_norm": 5.453125, "learning_rate": 2.3879396984924625e-05, "loss": 1.5502, "step": 20800 }, { "epoch": 0.52, "grad_norm": 3.52734375, "learning_rate": 2.384924623115578e-05, "loss": 1.5421, "step": 20900 }, { "epoch": 0.53, "grad_norm": 5.87890625, "learning_rate": 2.3819095477386936e-05, "loss": 1.569, "step": 21000 }, { "epoch": 0.53, "grad_norm": 6.60546875, "learning_rate": 2.378894472361809e-05, "loss": 1.5416, "step": 21100 }, { "epoch": 0.53, "grad_norm": 7.68359375, "learning_rate": 2.375879396984925e-05, "loss": 1.5318, "step": 21200 }, { "epoch": 0.53, "grad_norm": 7.07421875, "learning_rate": 2.3728643216080405e-05, "loss": 1.5376, "step": 21300 }, { "epoch": 0.54, "grad_norm": 6.71875, "learning_rate": 2.369849246231156e-05, "loss": 1.5728, "step": 21400 }, { "epoch": 0.54, "grad_norm": 5.34375, "learning_rate": 2.3668341708542712e-05, "loss": 1.544, "step": 21500 }, { "epoch": 0.54, "grad_norm": 5.00390625, "learning_rate": 2.3638190954773867e-05, "loss": 1.4737, "step": 21600 }, { "epoch": 0.54, "grad_norm": 11.5, "learning_rate": 2.3608040201005026e-05, "loss": 1.5458, "step": 21700 }, { "epoch": 0.55, "grad_norm": 8.75, "learning_rate": 2.357788944723618e-05, "loss": 1.5746, "step": 21800 }, { "epoch": 0.55, "grad_norm": 7.98828125, "learning_rate": 2.3547738693467336e-05, "loss": 1.4738, "step": 21900 }, { "epoch": 0.55, "grad_norm": 11.9609375, "learning_rate": 2.351758793969849e-05, "loss": 1.5951, "step": 22000 }, { "epoch": 0.55, "grad_norm": 7.09375, "learning_rate": 2.348743718592965e-05, "loss": 1.4439, "step": 22100 }, { "epoch": 0.56, "grad_norm": 9.9296875, "learning_rate": 2.3457286432160806e-05, "loss": 1.5166, "step": 22200 }, { "epoch": 0.56, "grad_norm": 4.30078125, "learning_rate": 2.342713567839196e-05, "loss": 1.496, "step": 22300 }, { "epoch": 0.56, "grad_norm": 8.171875, "learning_rate": 2.3396984924623116e-05, "loss": 1.5075, "step": 22400 }, { "epoch": 0.56, "grad_norm": 7.54296875, "learning_rate": 2.3366834170854275e-05, "loss": 1.4837, "step": 22500 }, { "epoch": 0.56, "eval_loss": 1.512613296508789, "eval_runtime": 58.4743, "eval_samples_per_second": 17.102, "eval_steps_per_second": 4.275, "step": 22500 }, { "epoch": 0.56, "grad_norm": 10.2265625, "learning_rate": 2.3336683417085427e-05, "loss": 1.5465, "step": 22600 }, { "epoch": 0.57, "grad_norm": 10.234375, "learning_rate": 2.3306532663316582e-05, "loss": 1.533, "step": 22700 }, { "epoch": 0.57, "grad_norm": 6.2734375, "learning_rate": 2.3276381909547737e-05, "loss": 1.5222, "step": 22800 }, { "epoch": 0.57, "grad_norm": 8.125, "learning_rate": 2.3246231155778896e-05, "loss": 1.5249, "step": 22900 }, { "epoch": 0.57, "grad_norm": 9.375, "learning_rate": 2.321608040201005e-05, "loss": 1.4978, "step": 23000 }, { "epoch": 0.58, "grad_norm": 7.8515625, "learning_rate": 2.3185929648241206e-05, "loss": 1.5416, "step": 23100 }, { "epoch": 0.58, "grad_norm": 8.796875, "learning_rate": 2.315577889447236e-05, "loss": 1.5141, "step": 23200 }, { "epoch": 0.58, "grad_norm": 6.9140625, "learning_rate": 2.312562814070352e-05, "loss": 1.4859, "step": 23300 }, { "epoch": 0.58, "grad_norm": 5.29296875, "learning_rate": 2.3095477386934676e-05, "loss": 1.4955, "step": 23400 }, { "epoch": 0.59, "grad_norm": 5.82421875, "learning_rate": 2.306532663316583e-05, "loss": 1.4968, "step": 23500 }, { "epoch": 0.59, "grad_norm": 7.01953125, "learning_rate": 2.3035175879396986e-05, "loss": 1.4652, "step": 23600 }, { "epoch": 0.59, "grad_norm": 6.625, "learning_rate": 2.3005025125628138e-05, "loss": 1.5003, "step": 23700 }, { "epoch": 0.59, "grad_norm": 6.21875, "learning_rate": 2.2974874371859297e-05, "loss": 1.5142, "step": 23800 }, { "epoch": 0.6, "grad_norm": 4.73828125, "learning_rate": 2.2944723618090452e-05, "loss": 1.5327, "step": 23900 }, { "epoch": 0.6, "grad_norm": 6.79296875, "learning_rate": 2.2914572864321607e-05, "loss": 1.4745, "step": 24000 }, { "epoch": 0.6, "grad_norm": 9.515625, "learning_rate": 2.2884422110552762e-05, "loss": 1.4769, "step": 24100 }, { "epoch": 0.6, "grad_norm": 5.5859375, "learning_rate": 2.285427135678392e-05, "loss": 1.5434, "step": 24200 }, { "epoch": 0.61, "grad_norm": 8.1796875, "learning_rate": 2.2824120603015076e-05, "loss": 1.5288, "step": 24300 }, { "epoch": 0.61, "grad_norm": 4.390625, "learning_rate": 2.279396984924623e-05, "loss": 1.4798, "step": 24400 }, { "epoch": 0.61, "grad_norm": 4.91796875, "learning_rate": 2.2763819095477387e-05, "loss": 1.499, "step": 24500 }, { "epoch": 0.61, "grad_norm": 8.9921875, "learning_rate": 2.2733668341708546e-05, "loss": 1.4819, "step": 24600 }, { "epoch": 0.62, "grad_norm": 5.5078125, "learning_rate": 2.27035175879397e-05, "loss": 1.5138, "step": 24700 }, { "epoch": 0.62, "grad_norm": 8.453125, "learning_rate": 2.2673366834170856e-05, "loss": 1.4908, "step": 24800 }, { "epoch": 0.62, "grad_norm": 7.09375, "learning_rate": 2.2643216080402008e-05, "loss": 1.5043, "step": 24900 }, { "epoch": 0.62, "grad_norm": 7.32421875, "learning_rate": 2.2613065326633167e-05, "loss": 1.5215, "step": 25000 }, { "epoch": 0.62, "eval_loss": 1.5021257400512695, "eval_runtime": 58.4197, "eval_samples_per_second": 17.118, "eval_steps_per_second": 4.279, "step": 25000 }, { "epoch": 0.63, "grad_norm": 4.16796875, "learning_rate": 2.2582914572864322e-05, "loss": 1.5173, "step": 25100 }, { "epoch": 0.63, "grad_norm": 5.7265625, "learning_rate": 2.2552763819095477e-05, "loss": 1.4825, "step": 25200 }, { "epoch": 0.63, "grad_norm": 6.3984375, "learning_rate": 2.2522613065326632e-05, "loss": 1.496, "step": 25300 }, { "epoch": 0.64, "grad_norm": 5.75, "learning_rate": 2.249246231155779e-05, "loss": 1.4803, "step": 25400 }, { "epoch": 0.64, "grad_norm": 6.71484375, "learning_rate": 2.2462311557788946e-05, "loss": 1.4516, "step": 25500 }, { "epoch": 0.64, "grad_norm": 7.2265625, "learning_rate": 2.24321608040201e-05, "loss": 1.4969, "step": 25600 }, { "epoch": 0.64, "grad_norm": 5.28125, "learning_rate": 2.2402010050251257e-05, "loss": 1.5185, "step": 25700 }, { "epoch": 0.65, "grad_norm": 6.40234375, "learning_rate": 2.2371859296482416e-05, "loss": 1.4593, "step": 25800 }, { "epoch": 0.65, "grad_norm": 6.3671875, "learning_rate": 2.234170854271357e-05, "loss": 1.5272, "step": 25900 }, { "epoch": 0.65, "grad_norm": 4.5625, "learning_rate": 2.2311557788944723e-05, "loss": 1.496, "step": 26000 }, { "epoch": 0.65, "grad_norm": 5.69140625, "learning_rate": 2.2281407035175878e-05, "loss": 1.4677, "step": 26100 }, { "epoch": 0.66, "grad_norm": 5.9609375, "learning_rate": 2.2251256281407033e-05, "loss": 1.5153, "step": 26200 }, { "epoch": 0.66, "grad_norm": 4.890625, "learning_rate": 2.2221105527638192e-05, "loss": 1.5064, "step": 26300 }, { "epoch": 0.66, "grad_norm": 7.5859375, "learning_rate": 2.2190954773869347e-05, "loss": 1.4875, "step": 26400 }, { "epoch": 0.66, "grad_norm": 9.3984375, "learning_rate": 2.2160804020100502e-05, "loss": 1.5198, "step": 26500 }, { "epoch": 0.67, "grad_norm": 5.72265625, "learning_rate": 2.2130653266331658e-05, "loss": 1.5047, "step": 26600 }, { "epoch": 0.67, "grad_norm": 4.1484375, "learning_rate": 2.2100502512562816e-05, "loss": 1.4954, "step": 26700 }, { "epoch": 0.67, "grad_norm": 4.5859375, "learning_rate": 2.207035175879397e-05, "loss": 1.5074, "step": 26800 }, { "epoch": 0.67, "grad_norm": 7.98828125, "learning_rate": 2.2040201005025127e-05, "loss": 1.5158, "step": 26900 }, { "epoch": 0.68, "grad_norm": 4.3125, "learning_rate": 2.2010050251256282e-05, "loss": 1.4735, "step": 27000 }, { "epoch": 0.68, "grad_norm": 11.2421875, "learning_rate": 2.1979899497487437e-05, "loss": 1.4868, "step": 27100 }, { "epoch": 0.68, "grad_norm": 5.69921875, "learning_rate": 2.1949748743718593e-05, "loss": 1.501, "step": 27200 }, { "epoch": 0.68, "grad_norm": 8.078125, "learning_rate": 2.1919597989949748e-05, "loss": 1.4897, "step": 27300 }, { "epoch": 0.69, "grad_norm": 8.8828125, "learning_rate": 2.1889447236180903e-05, "loss": 1.5144, "step": 27400 }, { "epoch": 0.69, "grad_norm": 9.109375, "learning_rate": 2.1859296482412062e-05, "loss": 1.4581, "step": 27500 }, { "epoch": 0.69, "eval_loss": 1.4510895013809204, "eval_runtime": 58.4398, "eval_samples_per_second": 17.112, "eval_steps_per_second": 4.278, "step": 27500 }, { "epoch": 0.69, "grad_norm": 4.703125, "learning_rate": 2.1829145728643217e-05, "loss": 1.5377, "step": 27600 }, { "epoch": 0.69, "grad_norm": 8.09375, "learning_rate": 2.1798994974874372e-05, "loss": 1.4735, "step": 27700 }, { "epoch": 0.69, "grad_norm": 8.328125, "learning_rate": 2.1768844221105528e-05, "loss": 1.4867, "step": 27800 }, { "epoch": 0.7, "grad_norm": 6.74609375, "learning_rate": 2.1738693467336686e-05, "loss": 1.4836, "step": 27900 }, { "epoch": 0.7, "grad_norm": 11.1875, "learning_rate": 2.170854271356784e-05, "loss": 1.4511, "step": 28000 }, { "epoch": 0.7, "grad_norm": 6.3984375, "learning_rate": 2.1678391959798997e-05, "loss": 1.4891, "step": 28100 }, { "epoch": 0.7, "grad_norm": 9.015625, "learning_rate": 2.1648241206030152e-05, "loss": 1.4966, "step": 28200 }, { "epoch": 0.71, "grad_norm": 6.51171875, "learning_rate": 2.1618090452261304e-05, "loss": 1.4795, "step": 28300 }, { "epoch": 0.71, "grad_norm": 4.52734375, "learning_rate": 2.1587939698492463e-05, "loss": 1.4697, "step": 28400 }, { "epoch": 0.71, "grad_norm": 6.171875, "learning_rate": 2.1557788944723618e-05, "loss": 1.4834, "step": 28500 }, { "epoch": 0.71, "grad_norm": 7.125, "learning_rate": 2.1527638190954773e-05, "loss": 1.4917, "step": 28600 }, { "epoch": 0.72, "grad_norm": 4.58984375, "learning_rate": 2.149748743718593e-05, "loss": 1.5206, "step": 28700 }, { "epoch": 0.72, "grad_norm": 5.5, "learning_rate": 2.1467336683417087e-05, "loss": 1.4826, "step": 28800 }, { "epoch": 0.72, "grad_norm": 5.23828125, "learning_rate": 2.1437185929648242e-05, "loss": 1.4568, "step": 28900 }, { "epoch": 0.72, "grad_norm": 3.708984375, "learning_rate": 2.1407035175879398e-05, "loss": 1.5095, "step": 29000 }, { "epoch": 0.73, "grad_norm": 8.46875, "learning_rate": 2.1376884422110553e-05, "loss": 1.4935, "step": 29100 }, { "epoch": 0.73, "grad_norm": 5.75, "learning_rate": 2.134673366834171e-05, "loss": 1.4553, "step": 29200 }, { "epoch": 0.73, "grad_norm": 5.58203125, "learning_rate": 2.1316582914572867e-05, "loss": 1.4365, "step": 29300 }, { "epoch": 0.73, "grad_norm": 8.6015625, "learning_rate": 2.128643216080402e-05, "loss": 1.4688, "step": 29400 }, { "epoch": 0.74, "grad_norm": 6.953125, "learning_rate": 2.1256281407035174e-05, "loss": 1.4744, "step": 29500 }, { "epoch": 0.74, "grad_norm": 4.53515625, "learning_rate": 2.1226130653266333e-05, "loss": 1.4777, "step": 29600 }, { "epoch": 0.74, "grad_norm": 5.68359375, "learning_rate": 2.1195979899497488e-05, "loss": 1.4753, "step": 29700 }, { "epoch": 0.74, "grad_norm": 7.76953125, "learning_rate": 2.1165829145728643e-05, "loss": 1.4355, "step": 29800 }, { "epoch": 0.75, "grad_norm": 9.9453125, "learning_rate": 2.11356783919598e-05, "loss": 1.5261, "step": 29900 }, { "epoch": 0.75, "grad_norm": 5.51171875, "learning_rate": 2.1105527638190957e-05, "loss": 1.4413, "step": 30000 }, { "epoch": 0.75, "eval_loss": 1.4673473834991455, "eval_runtime": 58.3856, "eval_samples_per_second": 17.127, "eval_steps_per_second": 4.282, "step": 30000 }, { "epoch": 0.75, "grad_norm": 7.42578125, "learning_rate": 2.1075376884422112e-05, "loss": 1.4513, "step": 30100 }, { "epoch": 0.76, "grad_norm": 5.3046875, "learning_rate": 2.1045226130653268e-05, "loss": 1.4775, "step": 30200 }, { "epoch": 0.76, "grad_norm": 7.609375, "learning_rate": 2.1015075376884423e-05, "loss": 1.4477, "step": 30300 }, { "epoch": 0.76, "grad_norm": 10.3203125, "learning_rate": 2.098492462311558e-05, "loss": 1.4584, "step": 30400 }, { "epoch": 0.76, "grad_norm": 6.4453125, "learning_rate": 2.0954773869346733e-05, "loss": 1.4934, "step": 30500 }, { "epoch": 0.77, "grad_norm": 3.701171875, "learning_rate": 2.092462311557789e-05, "loss": 1.4539, "step": 30600 }, { "epoch": 0.77, "grad_norm": 4.0390625, "learning_rate": 2.0894472361809044e-05, "loss": 1.4547, "step": 30700 }, { "epoch": 0.77, "grad_norm": 6.56640625, "learning_rate": 2.08643216080402e-05, "loss": 1.4646, "step": 30800 }, { "epoch": 0.77, "grad_norm": 8.1875, "learning_rate": 2.0834170854271358e-05, "loss": 1.4742, "step": 30900 }, { "epoch": 0.78, "grad_norm": 6.68359375, "learning_rate": 2.0804020100502513e-05, "loss": 1.4942, "step": 31000 }, { "epoch": 0.78, "grad_norm": 6.046875, "learning_rate": 2.077386934673367e-05, "loss": 1.4995, "step": 31100 }, { "epoch": 0.78, "grad_norm": 5.5390625, "learning_rate": 2.0743718592964824e-05, "loss": 1.4689, "step": 31200 }, { "epoch": 0.78, "grad_norm": 9.515625, "learning_rate": 2.0713567839195982e-05, "loss": 1.4649, "step": 31300 }, { "epoch": 0.79, "grad_norm": 12.640625, "learning_rate": 2.0683417085427138e-05, "loss": 1.4445, "step": 31400 }, { "epoch": 0.79, "grad_norm": 8.53125, "learning_rate": 2.0653266331658293e-05, "loss": 1.4136, "step": 31500 }, { "epoch": 0.79, "grad_norm": 9.1640625, "learning_rate": 2.0623115577889448e-05, "loss": 1.4387, "step": 31600 }, { "epoch": 0.79, "grad_norm": 6.32421875, "learning_rate": 2.0592964824120603e-05, "loss": 1.4485, "step": 31700 }, { "epoch": 0.8, "grad_norm": 7.34765625, "learning_rate": 2.056281407035176e-05, "loss": 1.4799, "step": 31800 }, { "epoch": 0.8, "grad_norm": 5.546875, "learning_rate": 2.0532663316582914e-05, "loss": 1.4794, "step": 31900 }, { "epoch": 0.8, "grad_norm": 7.671875, "learning_rate": 2.050251256281407e-05, "loss": 1.4681, "step": 32000 }, { "epoch": 0.8, "grad_norm": 4.66796875, "learning_rate": 2.0472361809045228e-05, "loss": 1.4127, "step": 32100 }, { "epoch": 0.81, "grad_norm": 9.65625, "learning_rate": 2.0442211055276383e-05, "loss": 1.4809, "step": 32200 }, { "epoch": 0.81, "grad_norm": 7.37890625, "learning_rate": 2.041206030150754e-05, "loss": 1.4713, "step": 32300 }, { "epoch": 0.81, "grad_norm": 8.15625, "learning_rate": 2.0381909547738694e-05, "loss": 1.4872, "step": 32400 }, { "epoch": 0.81, "grad_norm": 8.5234375, "learning_rate": 2.0351758793969852e-05, "loss": 1.465, "step": 32500 }, { "epoch": 0.81, "eval_loss": 1.4116058349609375, "eval_runtime": 58.4781, "eval_samples_per_second": 17.1, "eval_steps_per_second": 4.275, "step": 32500 }, { "epoch": 0.81, "grad_norm": 8.0859375, "learning_rate": 2.0321608040201008e-05, "loss": 1.4717, "step": 32600 }, { "epoch": 0.82, "grad_norm": 8.34375, "learning_rate": 2.0291457286432163e-05, "loss": 1.4254, "step": 32700 }, { "epoch": 0.82, "grad_norm": 5.984375, "learning_rate": 2.0261306532663315e-05, "loss": 1.5016, "step": 32800 }, { "epoch": 0.82, "grad_norm": 5.50390625, "learning_rate": 2.023115577889447e-05, "loss": 1.4433, "step": 32900 }, { "epoch": 0.82, "grad_norm": 3.642578125, "learning_rate": 2.020100502512563e-05, "loss": 1.3997, "step": 33000 }, { "epoch": 0.83, "grad_norm": 7.01953125, "learning_rate": 2.0170854271356784e-05, "loss": 1.4126, "step": 33100 }, { "epoch": 0.83, "grad_norm": 6.78515625, "learning_rate": 2.014070351758794e-05, "loss": 1.4777, "step": 33200 }, { "epoch": 0.83, "grad_norm": 4.07421875, "learning_rate": 2.0110552763819094e-05, "loss": 1.4435, "step": 33300 }, { "epoch": 0.83, "grad_norm": 6.2265625, "learning_rate": 2.0080402010050253e-05, "loss": 1.4336, "step": 33400 }, { "epoch": 0.84, "grad_norm": 5.74609375, "learning_rate": 2.005025125628141e-05, "loss": 1.4649, "step": 33500 }, { "epoch": 0.84, "grad_norm": 3.865234375, "learning_rate": 2.0020100502512564e-05, "loss": 1.4996, "step": 33600 }, { "epoch": 0.84, "grad_norm": 6.6484375, "learning_rate": 1.998994974874372e-05, "loss": 1.4449, "step": 33700 }, { "epoch": 0.84, "grad_norm": 7.8046875, "learning_rate": 1.9959798994974878e-05, "loss": 1.4507, "step": 33800 }, { "epoch": 0.85, "grad_norm": 5.7421875, "learning_rate": 1.992964824120603e-05, "loss": 1.4989, "step": 33900 }, { "epoch": 0.85, "grad_norm": 5.15234375, "learning_rate": 1.9899497487437185e-05, "loss": 1.4287, "step": 34000 }, { "epoch": 0.85, "grad_norm": 4.73828125, "learning_rate": 1.986934673366834e-05, "loss": 1.4336, "step": 34100 }, { "epoch": 0.85, "grad_norm": 5.66796875, "learning_rate": 1.98391959798995e-05, "loss": 1.4686, "step": 34200 }, { "epoch": 0.86, "grad_norm": 4.45703125, "learning_rate": 1.9809045226130654e-05, "loss": 1.4165, "step": 34300 }, { "epoch": 0.86, "grad_norm": 10.9453125, "learning_rate": 1.977889447236181e-05, "loss": 1.4619, "step": 34400 }, { "epoch": 0.86, "grad_norm": 7.75, "learning_rate": 1.9748743718592964e-05, "loss": 1.4575, "step": 34500 }, { "epoch": 0.86, "grad_norm": 7.22265625, "learning_rate": 1.9718592964824123e-05, "loss": 1.5055, "step": 34600 }, { "epoch": 0.87, "grad_norm": 6.09375, "learning_rate": 1.968844221105528e-05, "loss": 1.4246, "step": 34700 }, { "epoch": 0.87, "grad_norm": 5.6328125, "learning_rate": 1.9658291457286434e-05, "loss": 1.4136, "step": 34800 }, { "epoch": 0.87, "grad_norm": 7.3359375, "learning_rate": 1.962814070351759e-05, "loss": 1.4495, "step": 34900 }, { "epoch": 0.88, "grad_norm": 5.203125, "learning_rate": 1.9597989949748744e-05, "loss": 1.4444, "step": 35000 }, { "epoch": 0.88, "eval_loss": 1.447482943534851, "eval_runtime": 58.4319, "eval_samples_per_second": 17.114, "eval_steps_per_second": 4.278, "step": 35000 }, { "epoch": 0.88, "grad_norm": 6.86328125, "learning_rate": 1.95678391959799e-05, "loss": 1.4882, "step": 35100 }, { "epoch": 0.88, "grad_norm": 8.125, "learning_rate": 1.9537688442211055e-05, "loss": 1.4831, "step": 35200 }, { "epoch": 0.88, "grad_norm": 7.953125, "learning_rate": 1.950753768844221e-05, "loss": 1.391, "step": 35300 }, { "epoch": 0.89, "grad_norm": 9.1875, "learning_rate": 1.9477386934673365e-05, "loss": 1.4193, "step": 35400 }, { "epoch": 0.89, "grad_norm": 5.23828125, "learning_rate": 1.9447236180904524e-05, "loss": 1.4528, "step": 35500 }, { "epoch": 0.89, "grad_norm": 7.90234375, "learning_rate": 1.941708542713568e-05, "loss": 1.4088, "step": 35600 }, { "epoch": 0.89, "grad_norm": 6.12109375, "learning_rate": 1.9386934673366834e-05, "loss": 1.4242, "step": 35700 }, { "epoch": 0.9, "grad_norm": 5.70703125, "learning_rate": 1.935678391959799e-05, "loss": 1.382, "step": 35800 }, { "epoch": 0.9, "grad_norm": 18.375, "learning_rate": 1.932663316582915e-05, "loss": 1.4321, "step": 35900 }, { "epoch": 0.9, "grad_norm": 7.921875, "learning_rate": 1.9296482412060304e-05, "loss": 1.4566, "step": 36000 }, { "epoch": 0.9, "grad_norm": 6.87109375, "learning_rate": 1.926633165829146e-05, "loss": 1.4037, "step": 36100 }, { "epoch": 0.91, "grad_norm": 10.6953125, "learning_rate": 1.923618090452261e-05, "loss": 1.4484, "step": 36200 }, { "epoch": 0.91, "grad_norm": 9.4765625, "learning_rate": 1.920603015075377e-05, "loss": 1.4357, "step": 36300 }, { "epoch": 0.91, "grad_norm": 8.421875, "learning_rate": 1.9175879396984925e-05, "loss": 1.3922, "step": 36400 }, { "epoch": 0.91, "grad_norm": 7.39453125, "learning_rate": 1.914572864321608e-05, "loss": 1.4405, "step": 36500 }, { "epoch": 0.92, "grad_norm": 11.0625, "learning_rate": 1.9115577889447235e-05, "loss": 1.4351, "step": 36600 }, { "epoch": 0.92, "grad_norm": 4.6796875, "learning_rate": 1.9085427135678394e-05, "loss": 1.4234, "step": 36700 }, { "epoch": 0.92, "grad_norm": 3.60546875, "learning_rate": 1.905527638190955e-05, "loss": 1.4558, "step": 36800 }, { "epoch": 0.92, "grad_norm": 5.05078125, "learning_rate": 1.9025125628140704e-05, "loss": 1.4191, "step": 36900 }, { "epoch": 0.93, "grad_norm": 7.02734375, "learning_rate": 1.899497487437186e-05, "loss": 1.4019, "step": 37000 }, { "epoch": 0.93, "grad_norm": 5.7421875, "learning_rate": 1.896482412060302e-05, "loss": 1.4376, "step": 37100 }, { "epoch": 0.93, "grad_norm": 9.09375, "learning_rate": 1.8934673366834174e-05, "loss": 1.448, "step": 37200 }, { "epoch": 0.93, "grad_norm": 5.546875, "learning_rate": 1.8904522613065325e-05, "loss": 1.5019, "step": 37300 }, { "epoch": 0.94, "grad_norm": 5.43359375, "learning_rate": 1.887437185929648e-05, "loss": 1.4002, "step": 37400 }, { "epoch": 0.94, "grad_norm": 6.73046875, "learning_rate": 1.884422110552764e-05, "loss": 1.4595, "step": 37500 }, { "epoch": 0.94, "eval_loss": 1.4337172508239746, "eval_runtime": 58.4211, "eval_samples_per_second": 17.117, "eval_steps_per_second": 4.279, "step": 37500 }, { "epoch": 0.94, "grad_norm": 3.904296875, "learning_rate": 1.8814070351758795e-05, "loss": 1.4425, "step": 37600 }, { "epoch": 0.94, "grad_norm": 6.24609375, "learning_rate": 1.878391959798995e-05, "loss": 1.4301, "step": 37700 }, { "epoch": 0.94, "grad_norm": 6.34375, "learning_rate": 1.8753768844221105e-05, "loss": 1.4815, "step": 37800 }, { "epoch": 0.95, "grad_norm": 7.95703125, "learning_rate": 1.872361809045226e-05, "loss": 1.4687, "step": 37900 }, { "epoch": 0.95, "grad_norm": 3.689453125, "learning_rate": 1.869346733668342e-05, "loss": 1.4096, "step": 38000 }, { "epoch": 0.95, "grad_norm": 6.2578125, "learning_rate": 1.8663316582914574e-05, "loss": 1.4627, "step": 38100 }, { "epoch": 0.95, "grad_norm": 7.89453125, "learning_rate": 1.863316582914573e-05, "loss": 1.4379, "step": 38200 }, { "epoch": 0.96, "grad_norm": 9.0546875, "learning_rate": 1.8603015075376885e-05, "loss": 1.4455, "step": 38300 }, { "epoch": 0.96, "grad_norm": 3.818359375, "learning_rate": 1.857286432160804e-05, "loss": 1.4592, "step": 38400 }, { "epoch": 0.96, "grad_norm": 6.12109375, "learning_rate": 1.8542713567839195e-05, "loss": 1.4622, "step": 38500 }, { "epoch": 0.96, "grad_norm": 6.3671875, "learning_rate": 1.851256281407035e-05, "loss": 1.4051, "step": 38600 }, { "epoch": 0.97, "grad_norm": 6.67578125, "learning_rate": 1.8482412060301506e-05, "loss": 1.4165, "step": 38700 }, { "epoch": 0.97, "grad_norm": 9.421875, "learning_rate": 1.8452261306532665e-05, "loss": 1.4038, "step": 38800 }, { "epoch": 0.97, "grad_norm": 6.25390625, "learning_rate": 1.842211055276382e-05, "loss": 1.3569, "step": 38900 }, { "epoch": 0.97, "grad_norm": 6.1953125, "learning_rate": 1.8391959798994975e-05, "loss": 1.4433, "step": 39000 }, { "epoch": 0.98, "grad_norm": 6.71875, "learning_rate": 1.836180904522613e-05, "loss": 1.4515, "step": 39100 }, { "epoch": 0.98, "grad_norm": 7.2890625, "learning_rate": 1.833165829145729e-05, "loss": 1.4557, "step": 39200 }, { "epoch": 0.98, "grad_norm": 4.93359375, "learning_rate": 1.8301507537688444e-05, "loss": 1.4292, "step": 39300 }, { "epoch": 0.98, "grad_norm": 5.2734375, "learning_rate": 1.82713567839196e-05, "loss": 1.3862, "step": 39400 }, { "epoch": 0.99, "grad_norm": 3.75, "learning_rate": 1.8241206030150755e-05, "loss": 1.3642, "step": 39500 }, { "epoch": 0.99, "grad_norm": 6.80078125, "learning_rate": 1.821105527638191e-05, "loss": 1.4442, "step": 39600 }, { "epoch": 0.99, "grad_norm": 4.80078125, "learning_rate": 1.8180904522613065e-05, "loss": 1.3977, "step": 39700 }, { "epoch": 0.99, "grad_norm": 3.52734375, "learning_rate": 1.815075376884422e-05, "loss": 1.4081, "step": 39800 }, { "epoch": 1.0, "grad_norm": 9.40625, "learning_rate": 1.8120603015075376e-05, "loss": 1.3949, "step": 39900 }, { "epoch": 1.0, "grad_norm": 10.0234375, "learning_rate": 1.809045226130653e-05, "loss": 1.3952, "step": 40000 }, { "epoch": 1.0, "eval_loss": 1.4081411361694336, "eval_runtime": 58.4216, "eval_samples_per_second": 17.117, "eval_steps_per_second": 4.279, "step": 40000 }, { "epoch": 1.0, "grad_norm": 7.73828125, "learning_rate": 1.806030150753769e-05, "loss": 1.3862, "step": 40100 }, { "epoch": 1.0, "grad_norm": 6.5859375, "learning_rate": 1.8030150753768845e-05, "loss": 1.4586, "step": 40200 }, { "epoch": 1.01, "grad_norm": 7.20703125, "learning_rate": 1.8e-05, "loss": 1.4018, "step": 40300 }, { "epoch": 1.01, "grad_norm": 8.2265625, "learning_rate": 1.7969849246231156e-05, "loss": 1.4112, "step": 40400 }, { "epoch": 1.01, "grad_norm": 7.18359375, "learning_rate": 1.7939698492462314e-05, "loss": 1.3652, "step": 40500 }, { "epoch": 1.01, "grad_norm": 6.2109375, "learning_rate": 1.790954773869347e-05, "loss": 1.4143, "step": 40600 }, { "epoch": 1.02, "grad_norm": 4.31640625, "learning_rate": 1.787939698492462e-05, "loss": 1.4024, "step": 40700 }, { "epoch": 1.02, "grad_norm": 7.57421875, "learning_rate": 1.7849246231155777e-05, "loss": 1.442, "step": 40800 }, { "epoch": 1.02, "grad_norm": 6.2109375, "learning_rate": 1.7819095477386935e-05, "loss": 1.3748, "step": 40900 }, { "epoch": 1.02, "grad_norm": 6.99609375, "learning_rate": 1.778894472361809e-05, "loss": 1.4098, "step": 41000 }, { "epoch": 1.03, "grad_norm": 9.4453125, "learning_rate": 1.7758793969849246e-05, "loss": 1.3765, "step": 41100 }, { "epoch": 1.03, "grad_norm": 8.390625, "learning_rate": 1.77286432160804e-05, "loss": 1.4175, "step": 41200 }, { "epoch": 1.03, "grad_norm": 6.94140625, "learning_rate": 1.769849246231156e-05, "loss": 1.3656, "step": 41300 }, { "epoch": 1.03, "grad_norm": 8.8203125, "learning_rate": 1.7668341708542715e-05, "loss": 1.319, "step": 41400 }, { "epoch": 1.04, "grad_norm": 9.3125, "learning_rate": 1.763819095477387e-05, "loss": 1.4038, "step": 41500 }, { "epoch": 1.04, "grad_norm": 4.515625, "learning_rate": 1.7608040201005026e-05, "loss": 1.4096, "step": 41600 }, { "epoch": 1.04, "grad_norm": 6.9375, "learning_rate": 1.7577889447236184e-05, "loss": 1.4093, "step": 41700 }, { "epoch": 1.04, "grad_norm": 5.62890625, "learning_rate": 1.7547738693467336e-05, "loss": 1.4099, "step": 41800 }, { "epoch": 1.05, "grad_norm": 6.1484375, "learning_rate": 1.751758793969849e-05, "loss": 1.3245, "step": 41900 }, { "epoch": 1.05, "grad_norm": 7.41796875, "learning_rate": 1.7487437185929647e-05, "loss": 1.4454, "step": 42000 }, { "epoch": 1.05, "grad_norm": 4.08203125, "learning_rate": 1.7457286432160805e-05, "loss": 1.4143, "step": 42100 }, { "epoch": 1.05, "grad_norm": 8.3046875, "learning_rate": 1.742713567839196e-05, "loss": 1.3896, "step": 42200 }, { "epoch": 1.06, "grad_norm": 6.328125, "learning_rate": 1.7396984924623116e-05, "loss": 1.4059, "step": 42300 }, { "epoch": 1.06, "grad_norm": 6.92578125, "learning_rate": 1.736683417085427e-05, "loss": 1.3806, "step": 42400 }, { "epoch": 1.06, "grad_norm": 4.88671875, "learning_rate": 1.7336683417085427e-05, "loss": 1.382, "step": 42500 }, { "epoch": 1.06, "eval_loss": 1.4033676385879517, "eval_runtime": 58.3505, "eval_samples_per_second": 17.138, "eval_steps_per_second": 4.284, "step": 42500 }, { "epoch": 1.06, "grad_norm": 4.0390625, "learning_rate": 1.7306532663316585e-05, "loss": 1.4022, "step": 42600 }, { "epoch": 1.07, "grad_norm": 7.046875, "learning_rate": 1.727638190954774e-05, "loss": 1.3839, "step": 42700 }, { "epoch": 1.07, "grad_norm": 6.36328125, "learning_rate": 1.7246231155778896e-05, "loss": 1.3864, "step": 42800 }, { "epoch": 1.07, "grad_norm": 7.46484375, "learning_rate": 1.721608040201005e-05, "loss": 1.3551, "step": 42900 }, { "epoch": 1.07, "grad_norm": 6.73828125, "learning_rate": 1.7185929648241206e-05, "loss": 1.4038, "step": 43000 }, { "epoch": 1.08, "grad_norm": 5.7109375, "learning_rate": 1.715577889447236e-05, "loss": 1.4333, "step": 43100 }, { "epoch": 1.08, "grad_norm": 6.84765625, "learning_rate": 1.7125628140703517e-05, "loss": 1.3559, "step": 43200 }, { "epoch": 1.08, "grad_norm": 6.77734375, "learning_rate": 1.7095477386934672e-05, "loss": 1.3974, "step": 43300 }, { "epoch": 1.08, "grad_norm": 7.0703125, "learning_rate": 1.706532663316583e-05, "loss": 1.4373, "step": 43400 }, { "epoch": 1.09, "grad_norm": 6.5390625, "learning_rate": 1.7035175879396986e-05, "loss": 1.441, "step": 43500 }, { "epoch": 1.09, "grad_norm": 5.91015625, "learning_rate": 1.700502512562814e-05, "loss": 1.4241, "step": 43600 }, { "epoch": 1.09, "grad_norm": 8.2421875, "learning_rate": 1.6974874371859296e-05, "loss": 1.3692, "step": 43700 }, { "epoch": 1.09, "grad_norm": 5.36328125, "learning_rate": 1.6944723618090455e-05, "loss": 1.4058, "step": 43800 }, { "epoch": 1.1, "grad_norm": 5.71484375, "learning_rate": 1.691457286432161e-05, "loss": 1.3616, "step": 43900 }, { "epoch": 1.1, "grad_norm": 7.41015625, "learning_rate": 1.6884422110552766e-05, "loss": 1.4298, "step": 44000 }, { "epoch": 1.1, "grad_norm": 7.79296875, "learning_rate": 1.6854271356783918e-05, "loss": 1.4036, "step": 44100 }, { "epoch": 1.1, "grad_norm": 9.09375, "learning_rate": 1.6824120603015076e-05, "loss": 1.3283, "step": 44200 }, { "epoch": 1.11, "grad_norm": 5.484375, "learning_rate": 1.679396984924623e-05, "loss": 1.3767, "step": 44300 }, { "epoch": 1.11, "grad_norm": 8.5, "learning_rate": 1.6763819095477387e-05, "loss": 1.3854, "step": 44400 }, { "epoch": 1.11, "grad_norm": 5.47265625, "learning_rate": 1.6733668341708542e-05, "loss": 1.4026, "step": 44500 }, { "epoch": 1.11, "grad_norm": 4.12109375, "learning_rate": 1.6703517587939697e-05, "loss": 1.3708, "step": 44600 }, { "epoch": 1.12, "grad_norm": 4.5625, "learning_rate": 1.6673366834170856e-05, "loss": 1.4159, "step": 44700 }, { "epoch": 1.12, "grad_norm": 7.28515625, "learning_rate": 1.664321608040201e-05, "loss": 1.3806, "step": 44800 }, { "epoch": 1.12, "grad_norm": 7.31640625, "learning_rate": 1.6613065326633166e-05, "loss": 1.3393, "step": 44900 }, { "epoch": 1.12, "grad_norm": 7.17578125, "learning_rate": 1.6582914572864322e-05, "loss": 1.34, "step": 45000 }, { "epoch": 1.12, "eval_loss": 1.4302189350128174, "eval_runtime": 58.4696, "eval_samples_per_second": 17.103, "eval_steps_per_second": 4.276, "step": 45000 }, { "epoch": 1.13, "grad_norm": 7.31640625, "learning_rate": 1.655276381909548e-05, "loss": 1.392, "step": 45100 }, { "epoch": 1.13, "grad_norm": 4.79296875, "learning_rate": 1.6522613065326632e-05, "loss": 1.4112, "step": 45200 }, { "epoch": 1.13, "grad_norm": 5.48828125, "learning_rate": 1.6492462311557788e-05, "loss": 1.3707, "step": 45300 }, { "epoch": 1.14, "grad_norm": 4.9921875, "learning_rate": 1.6462311557788943e-05, "loss": 1.4066, "step": 45400 }, { "epoch": 1.14, "grad_norm": 8.21875, "learning_rate": 1.64321608040201e-05, "loss": 1.4262, "step": 45500 }, { "epoch": 1.14, "grad_norm": 7.63671875, "learning_rate": 1.6402010050251257e-05, "loss": 1.3637, "step": 45600 }, { "epoch": 1.14, "grad_norm": 6.12890625, "learning_rate": 1.6371859296482412e-05, "loss": 1.3972, "step": 45700 }, { "epoch": 1.15, "grad_norm": 6.765625, "learning_rate": 1.6341708542713567e-05, "loss": 1.3538, "step": 45800 }, { "epoch": 1.15, "grad_norm": 5.44921875, "learning_rate": 1.6311557788944726e-05, "loss": 1.3902, "step": 45900 }, { "epoch": 1.15, "grad_norm": 5.734375, "learning_rate": 1.628140703517588e-05, "loss": 1.407, "step": 46000 }, { "epoch": 1.15, "grad_norm": 7.203125, "learning_rate": 1.6251256281407036e-05, "loss": 1.3354, "step": 46100 }, { "epoch": 1.16, "grad_norm": 7.87109375, "learning_rate": 1.6221105527638192e-05, "loss": 1.3464, "step": 46200 }, { "epoch": 1.16, "grad_norm": 3.75, "learning_rate": 1.6190954773869347e-05, "loss": 1.3949, "step": 46300 }, { "epoch": 1.16, "grad_norm": 6.1875, "learning_rate": 1.6160804020100502e-05, "loss": 1.4038, "step": 46400 }, { "epoch": 1.16, "grad_norm": 6.44140625, "learning_rate": 1.6130653266331658e-05, "loss": 1.3513, "step": 46500 }, { "epoch": 1.17, "grad_norm": 3.861328125, "learning_rate": 1.6100502512562813e-05, "loss": 1.349, "step": 46600 }, { "epoch": 1.17, "grad_norm": 6.92578125, "learning_rate": 1.607035175879397e-05, "loss": 1.3736, "step": 46700 }, { "epoch": 1.17, "grad_norm": 5.6328125, "learning_rate": 1.6040201005025127e-05, "loss": 1.392, "step": 46800 }, { "epoch": 1.17, "grad_norm": 8.125, "learning_rate": 1.6010050251256282e-05, "loss": 1.387, "step": 46900 }, { "epoch": 1.18, "grad_norm": 10.765625, "learning_rate": 1.5979899497487437e-05, "loss": 1.365, "step": 47000 }, { "epoch": 1.18, "grad_norm": 5.28125, "learning_rate": 1.5949748743718593e-05, "loss": 1.3596, "step": 47100 }, { "epoch": 1.18, "grad_norm": 5.96875, "learning_rate": 1.591959798994975e-05, "loss": 1.3406, "step": 47200 }, { "epoch": 1.18, "grad_norm": 4.4609375, "learning_rate": 1.5889447236180906e-05, "loss": 1.3779, "step": 47300 }, { "epoch": 1.19, "grad_norm": 4.47265625, "learning_rate": 1.5859296482412062e-05, "loss": 1.3254, "step": 47400 }, { "epoch": 1.19, "grad_norm": 5.77734375, "learning_rate": 1.5829145728643214e-05, "loss": 1.3618, "step": 47500 }, { "epoch": 1.19, "eval_loss": 1.3727421760559082, "eval_runtime": 58.4695, "eval_samples_per_second": 17.103, "eval_steps_per_second": 4.276, "step": 47500 }, { "epoch": 1.19, "grad_norm": 4.19140625, "learning_rate": 1.5798994974874372e-05, "loss": 1.3738, "step": 47600 }, { "epoch": 1.19, "grad_norm": 7.140625, "learning_rate": 1.5768844221105528e-05, "loss": 1.3791, "step": 47700 }, { "epoch": 1.2, "grad_norm": 7.6015625, "learning_rate": 1.5738693467336683e-05, "loss": 1.3381, "step": 47800 }, { "epoch": 1.2, "grad_norm": 6.31640625, "learning_rate": 1.5708542713567838e-05, "loss": 1.3935, "step": 47900 }, { "epoch": 1.2, "grad_norm": 5.19921875, "learning_rate": 1.5678391959798997e-05, "loss": 1.3314, "step": 48000 }, { "epoch": 1.2, "grad_norm": 8.390625, "learning_rate": 1.5648241206030152e-05, "loss": 1.3614, "step": 48100 }, { "epoch": 1.21, "grad_norm": 4.45703125, "learning_rate": 1.5618090452261307e-05, "loss": 1.3347, "step": 48200 }, { "epoch": 1.21, "grad_norm": 4.8125, "learning_rate": 1.5587939698492463e-05, "loss": 1.3963, "step": 48300 }, { "epoch": 1.21, "grad_norm": 5.7734375, "learning_rate": 1.555778894472362e-05, "loss": 1.4068, "step": 48400 }, { "epoch": 1.21, "grad_norm": 8.109375, "learning_rate": 1.5527638190954776e-05, "loss": 1.2939, "step": 48500 }, { "epoch": 1.22, "grad_norm": 5.67578125, "learning_rate": 1.5497487437185928e-05, "loss": 1.2909, "step": 48600 }, { "epoch": 1.22, "grad_norm": 5.0, "learning_rate": 1.5467336683417084e-05, "loss": 1.342, "step": 48700 }, { "epoch": 1.22, "grad_norm": 6.62109375, "learning_rate": 1.5437185929648242e-05, "loss": 1.3664, "step": 48800 }, { "epoch": 1.22, "grad_norm": 4.0, "learning_rate": 1.5407035175879397e-05, "loss": 1.3519, "step": 48900 }, { "epoch": 1.23, "grad_norm": 7.35546875, "learning_rate": 1.5376884422110553e-05, "loss": 1.3675, "step": 49000 }, { "epoch": 1.23, "grad_norm": 7.02734375, "learning_rate": 1.5346733668341708e-05, "loss": 1.3635, "step": 49100 }, { "epoch": 1.23, "grad_norm": 10.796875, "learning_rate": 1.5316582914572863e-05, "loss": 1.3527, "step": 49200 }, { "epoch": 1.23, "grad_norm": 6.140625, "learning_rate": 1.5286432160804022e-05, "loss": 1.3308, "step": 49300 }, { "epoch": 1.23, "grad_norm": 7.33984375, "learning_rate": 1.5256281407035177e-05, "loss": 1.3418, "step": 49400 }, { "epoch": 1.24, "grad_norm": 8.34375, "learning_rate": 1.522613065326633e-05, "loss": 1.3602, "step": 49500 }, { "epoch": 1.24, "grad_norm": 6.44140625, "learning_rate": 1.5195979899497486e-05, "loss": 1.4051, "step": 49600 }, { "epoch": 1.24, "grad_norm": 5.20703125, "learning_rate": 1.5165829145728645e-05, "loss": 1.3698, "step": 49700 }, { "epoch": 1.25, "grad_norm": 4.57421875, "learning_rate": 1.51356783919598e-05, "loss": 1.369, "step": 49800 }, { "epoch": 1.25, "grad_norm": 5.0078125, "learning_rate": 1.5105527638190955e-05, "loss": 1.3848, "step": 49900 }, { "epoch": 1.25, "grad_norm": 6.70703125, "learning_rate": 1.5075376884422109e-05, "loss": 1.3905, "step": 50000 }, { "epoch": 1.25, "eval_loss": 1.3214133977890015, "eval_runtime": 58.3319, "eval_samples_per_second": 17.143, "eval_steps_per_second": 4.286, "step": 50000 }, { "epoch": 1.25, "grad_norm": 7.6484375, "learning_rate": 1.5045226130653267e-05, "loss": 1.3656, "step": 50100 }, { "epoch": 1.25, "grad_norm": 5.04296875, "learning_rate": 1.5015075376884423e-05, "loss": 1.3529, "step": 50200 }, { "epoch": 1.26, "grad_norm": 5.41796875, "learning_rate": 1.4984924623115578e-05, "loss": 1.348, "step": 50300 }, { "epoch": 1.26, "grad_norm": 4.77734375, "learning_rate": 1.4954773869346735e-05, "loss": 1.3442, "step": 50400 }, { "epoch": 1.26, "grad_norm": 5.02734375, "learning_rate": 1.492462311557789e-05, "loss": 1.34, "step": 50500 }, { "epoch": 1.27, "grad_norm": 7.89453125, "learning_rate": 1.4894472361809046e-05, "loss": 1.3631, "step": 50600 }, { "epoch": 1.27, "grad_norm": 17.5625, "learning_rate": 1.48643216080402e-05, "loss": 1.3492, "step": 50700 }, { "epoch": 1.27, "grad_norm": 6.22265625, "learning_rate": 1.4834170854271358e-05, "loss": 1.3604, "step": 50800 }, { "epoch": 1.27, "grad_norm": 5.5234375, "learning_rate": 1.4804020100502513e-05, "loss": 1.3412, "step": 50900 }, { "epoch": 1.27, "grad_norm": 5.93359375, "learning_rate": 1.477386934673367e-05, "loss": 1.3628, "step": 51000 }, { "epoch": 1.28, "grad_norm": 8.46875, "learning_rate": 1.4743718592964824e-05, "loss": 1.3496, "step": 51100 }, { "epoch": 1.28, "grad_norm": 6.32421875, "learning_rate": 1.471356783919598e-05, "loss": 1.3683, "step": 51200 }, { "epoch": 1.28, "grad_norm": 5.64453125, "learning_rate": 1.4683417085427136e-05, "loss": 1.3648, "step": 51300 }, { "epoch": 1.28, "grad_norm": 5.328125, "learning_rate": 1.4653266331658293e-05, "loss": 1.365, "step": 51400 }, { "epoch": 1.29, "grad_norm": 5.7578125, "learning_rate": 1.4623115577889448e-05, "loss": 1.3176, "step": 51500 }, { "epoch": 1.29, "grad_norm": 6.6875, "learning_rate": 1.4592964824120603e-05, "loss": 1.339, "step": 51600 }, { "epoch": 1.29, "grad_norm": 4.8671875, "learning_rate": 1.4562814070351759e-05, "loss": 1.3725, "step": 51700 }, { "epoch": 1.29, "grad_norm": 7.10546875, "learning_rate": 1.4532663316582914e-05, "loss": 1.3552, "step": 51800 }, { "epoch": 1.3, "grad_norm": 6.890625, "learning_rate": 1.450251256281407e-05, "loss": 1.2784, "step": 51900 }, { "epoch": 1.3, "grad_norm": 7.17578125, "learning_rate": 1.4472361809045226e-05, "loss": 1.3438, "step": 52000 }, { "epoch": 1.3, "grad_norm": 10.40625, "learning_rate": 1.4442211055276383e-05, "loss": 1.2733, "step": 52100 }, { "epoch": 1.3, "grad_norm": 7.32421875, "learning_rate": 1.4412060301507538e-05, "loss": 1.3616, "step": 52200 }, { "epoch": 1.31, "grad_norm": 3.615234375, "learning_rate": 1.4381909547738694e-05, "loss": 1.3346, "step": 52300 }, { "epoch": 1.31, "grad_norm": 6.54296875, "learning_rate": 1.4351758793969849e-05, "loss": 1.3893, "step": 52400 }, { "epoch": 1.31, "grad_norm": 7.2578125, "learning_rate": 1.4321608040201006e-05, "loss": 1.313, "step": 52500 }, { "epoch": 1.31, "eval_loss": 1.3575935363769531, "eval_runtime": 58.4472, "eval_samples_per_second": 17.109, "eval_steps_per_second": 4.277, "step": 52500 }, { "epoch": 1.31, "grad_norm": 7.12109375, "learning_rate": 1.4291457286432161e-05, "loss": 1.3669, "step": 52600 }, { "epoch": 1.32, "grad_norm": 5.38671875, "learning_rate": 1.4261306532663318e-05, "loss": 1.3356, "step": 52700 }, { "epoch": 1.32, "grad_norm": 6.97265625, "learning_rate": 1.4231155778894472e-05, "loss": 1.326, "step": 52800 }, { "epoch": 1.32, "grad_norm": 6.5078125, "learning_rate": 1.4201005025125629e-05, "loss": 1.4087, "step": 52900 }, { "epoch": 1.32, "grad_norm": 7.66015625, "learning_rate": 1.4170854271356784e-05, "loss": 1.3237, "step": 53000 }, { "epoch": 1.33, "grad_norm": 4.0, "learning_rate": 1.414070351758794e-05, "loss": 1.3516, "step": 53100 }, { "epoch": 1.33, "grad_norm": 6.90625, "learning_rate": 1.4110552763819096e-05, "loss": 1.3495, "step": 53200 }, { "epoch": 1.33, "grad_norm": 8.5625, "learning_rate": 1.4080402010050253e-05, "loss": 1.3308, "step": 53300 }, { "epoch": 1.33, "grad_norm": 8.078125, "learning_rate": 1.4050251256281407e-05, "loss": 1.3762, "step": 53400 }, { "epoch": 1.34, "grad_norm": 4.453125, "learning_rate": 1.4020100502512564e-05, "loss": 1.3929, "step": 53500 }, { "epoch": 1.34, "grad_norm": 8.0234375, "learning_rate": 1.3989949748743719e-05, "loss": 1.2895, "step": 53600 }, { "epoch": 1.34, "grad_norm": 8.125, "learning_rate": 1.3959798994974876e-05, "loss": 1.3772, "step": 53700 }, { "epoch": 1.34, "grad_norm": 5.9609375, "learning_rate": 1.3929648241206031e-05, "loss": 1.3427, "step": 53800 }, { "epoch": 1.35, "grad_norm": 6.0625, "learning_rate": 1.3899497487437186e-05, "loss": 1.3433, "step": 53900 }, { "epoch": 1.35, "grad_norm": 5.52734375, "learning_rate": 1.3869346733668342e-05, "loss": 1.3341, "step": 54000 }, { "epoch": 1.35, "grad_norm": 8.1328125, "learning_rate": 1.3839195979899497e-05, "loss": 1.377, "step": 54100 }, { "epoch": 1.35, "grad_norm": 5.265625, "learning_rate": 1.3809045226130654e-05, "loss": 1.3653, "step": 54200 }, { "epoch": 1.36, "grad_norm": 6.80859375, "learning_rate": 1.3778894472361809e-05, "loss": 1.3282, "step": 54300 }, { "epoch": 1.36, "grad_norm": 6.078125, "learning_rate": 1.3748743718592966e-05, "loss": 1.3511, "step": 54400 }, { "epoch": 1.36, "grad_norm": 7.08984375, "learning_rate": 1.371859296482412e-05, "loss": 1.3379, "step": 54500 }, { "epoch": 1.36, "grad_norm": 4.79296875, "learning_rate": 1.3688442211055277e-05, "loss": 1.3352, "step": 54600 }, { "epoch": 1.37, "grad_norm": 3.83984375, "learning_rate": 1.3658291457286432e-05, "loss": 1.3652, "step": 54700 }, { "epoch": 1.37, "grad_norm": 6.2734375, "learning_rate": 1.3628140703517589e-05, "loss": 1.3457, "step": 54800 }, { "epoch": 1.37, "grad_norm": 8.8203125, "learning_rate": 1.3597989949748744e-05, "loss": 1.3861, "step": 54900 }, { "epoch": 1.38, "grad_norm": 7.9921875, "learning_rate": 1.3567839195979901e-05, "loss": 1.3345, "step": 55000 }, { "epoch": 1.38, "eval_loss": 1.352059006690979, "eval_runtime": 58.4301, "eval_samples_per_second": 17.114, "eval_steps_per_second": 4.279, "step": 55000 }, { "epoch": 1.38, "grad_norm": 4.3046875, "learning_rate": 1.3537688442211055e-05, "loss": 1.3387, "step": 55100 }, { "epoch": 1.38, "grad_norm": 8.2734375, "learning_rate": 1.3507537688442212e-05, "loss": 1.3813, "step": 55200 }, { "epoch": 1.38, "grad_norm": 5.5078125, "learning_rate": 1.3477386934673367e-05, "loss": 1.3275, "step": 55300 }, { "epoch": 1.39, "grad_norm": 6.3125, "learning_rate": 1.3447236180904524e-05, "loss": 1.3102, "step": 55400 }, { "epoch": 1.39, "grad_norm": 6.84375, "learning_rate": 1.3417085427135679e-05, "loss": 1.3661, "step": 55500 }, { "epoch": 1.39, "grad_norm": 5.234375, "learning_rate": 1.3386934673366836e-05, "loss": 1.3706, "step": 55600 }, { "epoch": 1.39, "grad_norm": 6.6328125, "learning_rate": 1.335678391959799e-05, "loss": 1.3108, "step": 55700 }, { "epoch": 1.4, "grad_norm": 4.51171875, "learning_rate": 1.3326633165829147e-05, "loss": 1.3616, "step": 55800 }, { "epoch": 1.4, "grad_norm": 7.078125, "learning_rate": 1.3296482412060302e-05, "loss": 1.3603, "step": 55900 }, { "epoch": 1.4, "grad_norm": 5.26953125, "learning_rate": 1.3266331658291459e-05, "loss": 1.344, "step": 56000 }, { "epoch": 1.4, "grad_norm": 10.1328125, "learning_rate": 1.3236180904522614e-05, "loss": 1.3127, "step": 56100 }, { "epoch": 1.41, "grad_norm": 6.19140625, "learning_rate": 1.3206030150753768e-05, "loss": 1.3231, "step": 56200 }, { "epoch": 1.41, "grad_norm": 7.38671875, "learning_rate": 1.3175879396984925e-05, "loss": 1.3684, "step": 56300 }, { "epoch": 1.41, "grad_norm": 11.8203125, "learning_rate": 1.314572864321608e-05, "loss": 1.3543, "step": 56400 }, { "epoch": 1.41, "grad_norm": 6.22265625, "learning_rate": 1.3115577889447237e-05, "loss": 1.314, "step": 56500 }, { "epoch": 1.42, "grad_norm": 6.4609375, "learning_rate": 1.3085427135678392e-05, "loss": 1.3074, "step": 56600 }, { "epoch": 1.42, "grad_norm": 7.19921875, "learning_rate": 1.3055276381909549e-05, "loss": 1.3272, "step": 56700 }, { "epoch": 1.42, "grad_norm": 3.60546875, "learning_rate": 1.3025125628140703e-05, "loss": 1.2822, "step": 56800 }, { "epoch": 1.42, "grad_norm": 5.015625, "learning_rate": 1.299497487437186e-05, "loss": 1.3366, "step": 56900 }, { "epoch": 1.43, "grad_norm": 7.0859375, "learning_rate": 1.2964824120603015e-05, "loss": 1.3094, "step": 57000 }, { "epoch": 1.43, "grad_norm": 6.21484375, "learning_rate": 1.2934673366834172e-05, "loss": 1.313, "step": 57100 }, { "epoch": 1.43, "grad_norm": 8.9140625, "learning_rate": 1.2904522613065327e-05, "loss": 1.2882, "step": 57200 }, { "epoch": 1.43, "grad_norm": 4.85546875, "learning_rate": 1.2874371859296484e-05, "loss": 1.3425, "step": 57300 }, { "epoch": 1.44, "grad_norm": 7.71875, "learning_rate": 1.2844221105527638e-05, "loss": 1.3341, "step": 57400 }, { "epoch": 1.44, "grad_norm": 6.60546875, "learning_rate": 1.2814070351758795e-05, "loss": 1.2881, "step": 57500 }, { "epoch": 1.44, "eval_loss": 1.3498035669326782, "eval_runtime": 58.3991, "eval_samples_per_second": 17.124, "eval_steps_per_second": 4.281, "step": 57500 }, { "epoch": 1.44, "grad_norm": 5.59375, "learning_rate": 1.278391959798995e-05, "loss": 1.3218, "step": 57600 }, { "epoch": 1.44, "grad_norm": 7.36328125, "learning_rate": 1.2753768844221107e-05, "loss": 1.3059, "step": 57700 }, { "epoch": 1.45, "grad_norm": 6.1171875, "learning_rate": 1.2723618090452262e-05, "loss": 1.2796, "step": 57800 }, { "epoch": 1.45, "grad_norm": 5.33984375, "learning_rate": 1.2693467336683417e-05, "loss": 1.2952, "step": 57900 }, { "epoch": 1.45, "grad_norm": 7.80078125, "learning_rate": 1.2663316582914573e-05, "loss": 1.3225, "step": 58000 }, { "epoch": 1.45, "grad_norm": 6.1796875, "learning_rate": 1.263316582914573e-05, "loss": 1.332, "step": 58100 }, { "epoch": 1.46, "grad_norm": 6.26171875, "learning_rate": 1.2603015075376885e-05, "loss": 1.3215, "step": 58200 }, { "epoch": 1.46, "grad_norm": 7.08984375, "learning_rate": 1.2572864321608042e-05, "loss": 1.3355, "step": 58300 }, { "epoch": 1.46, "grad_norm": 5.46484375, "learning_rate": 1.2542713567839197e-05, "loss": 1.3136, "step": 58400 }, { "epoch": 1.46, "grad_norm": 7.5234375, "learning_rate": 1.251256281407035e-05, "loss": 1.3429, "step": 58500 }, { "epoch": 1.47, "grad_norm": 5.74609375, "learning_rate": 1.2482412060301508e-05, "loss": 1.3638, "step": 58600 }, { "epoch": 1.47, "grad_norm": 6.1328125, "learning_rate": 1.2452261306532663e-05, "loss": 1.2987, "step": 58700 }, { "epoch": 1.47, "grad_norm": 4.96484375, "learning_rate": 1.242211055276382e-05, "loss": 1.3281, "step": 58800 }, { "epoch": 1.47, "grad_norm": 4.68359375, "learning_rate": 1.2391959798994975e-05, "loss": 1.2599, "step": 58900 }, { "epoch": 1.48, "grad_norm": 7.87890625, "learning_rate": 1.2361809045226132e-05, "loss": 1.334, "step": 59000 }, { "epoch": 1.48, "grad_norm": 5.8125, "learning_rate": 1.2331658291457286e-05, "loss": 1.32, "step": 59100 }, { "epoch": 1.48, "grad_norm": 6.26953125, "learning_rate": 1.2301507537688443e-05, "loss": 1.3368, "step": 59200 }, { "epoch": 1.48, "grad_norm": 4.5703125, "learning_rate": 1.2271356783919598e-05, "loss": 1.2395, "step": 59300 }, { "epoch": 1.48, "grad_norm": 7.453125, "learning_rate": 1.2241206030150755e-05, "loss": 1.3287, "step": 59400 }, { "epoch": 1.49, "grad_norm": 6.453125, "learning_rate": 1.221105527638191e-05, "loss": 1.3069, "step": 59500 }, { "epoch": 1.49, "grad_norm": 5.75, "learning_rate": 1.2180904522613065e-05, "loss": 1.2809, "step": 59600 }, { "epoch": 1.49, "grad_norm": 6.00390625, "learning_rate": 1.215075376884422e-05, "loss": 1.3378, "step": 59700 }, { "epoch": 1.5, "grad_norm": 7.78125, "learning_rate": 1.2120603015075378e-05, "loss": 1.3107, "step": 59800 }, { "epoch": 1.5, "grad_norm": 5.94140625, "learning_rate": 1.2090452261306533e-05, "loss": 1.2776, "step": 59900 }, { "epoch": 1.5, "grad_norm": 5.78125, "learning_rate": 1.206030150753769e-05, "loss": 1.3065, "step": 60000 }, { "epoch": 1.5, "eval_loss": 1.3107746839523315, "eval_runtime": 58.384, "eval_samples_per_second": 17.128, "eval_steps_per_second": 4.282, "step": 60000 }, { "epoch": 1.5, "grad_norm": 8.734375, "learning_rate": 1.2030150753768845e-05, "loss": 1.3049, "step": 60100 }, { "epoch": 1.5, "grad_norm": 5.95703125, "learning_rate": 1.2e-05, "loss": 1.319, "step": 60200 }, { "epoch": 1.51, "grad_norm": 5.3515625, "learning_rate": 1.1969849246231156e-05, "loss": 1.2789, "step": 60300 }, { "epoch": 1.51, "grad_norm": 14.1171875, "learning_rate": 1.1939698492462313e-05, "loss": 1.3548, "step": 60400 }, { "epoch": 1.51, "grad_norm": 5.328125, "learning_rate": 1.1909547738693468e-05, "loss": 1.3843, "step": 60500 }, { "epoch": 1.52, "grad_norm": 9.5, "learning_rate": 1.1879396984924625e-05, "loss": 1.2877, "step": 60600 }, { "epoch": 1.52, "grad_norm": 3.416015625, "learning_rate": 1.184924623115578e-05, "loss": 1.3448, "step": 60700 }, { "epoch": 1.52, "grad_norm": 7.03515625, "learning_rate": 1.1819095477386934e-05, "loss": 1.3308, "step": 60800 }, { "epoch": 1.52, "grad_norm": 4.8671875, "learning_rate": 1.178894472361809e-05, "loss": 1.2599, "step": 60900 }, { "epoch": 1.52, "grad_norm": 6.1171875, "learning_rate": 1.1758793969849246e-05, "loss": 1.2777, "step": 61000 }, { "epoch": 1.53, "grad_norm": 5.17578125, "learning_rate": 1.1728643216080403e-05, "loss": 1.3046, "step": 61100 }, { "epoch": 1.53, "grad_norm": 6.5859375, "learning_rate": 1.1698492462311558e-05, "loss": 1.3241, "step": 61200 }, { "epoch": 1.53, "grad_norm": 7.91796875, "learning_rate": 1.1668341708542713e-05, "loss": 1.3214, "step": 61300 }, { "epoch": 1.54, "grad_norm": 4.66796875, "learning_rate": 1.1638190954773869e-05, "loss": 1.3132, "step": 61400 }, { "epoch": 1.54, "grad_norm": 6.0546875, "learning_rate": 1.1608040201005026e-05, "loss": 1.3265, "step": 61500 }, { "epoch": 1.54, "grad_norm": 5.17578125, "learning_rate": 1.157788944723618e-05, "loss": 1.261, "step": 61600 }, { "epoch": 1.54, "grad_norm": 4.625, "learning_rate": 1.1547738693467338e-05, "loss": 1.3262, "step": 61700 }, { "epoch": 1.54, "grad_norm": 4.87890625, "learning_rate": 1.1517587939698493e-05, "loss": 1.2664, "step": 61800 }, { "epoch": 1.55, "grad_norm": 4.79296875, "learning_rate": 1.1487437185929648e-05, "loss": 1.3297, "step": 61900 }, { "epoch": 1.55, "grad_norm": 8.34375, "learning_rate": 1.1457286432160804e-05, "loss": 1.28, "step": 62000 }, { "epoch": 1.55, "grad_norm": 8.3203125, "learning_rate": 1.142713567839196e-05, "loss": 1.266, "step": 62100 }, { "epoch": 1.56, "grad_norm": 7.94921875, "learning_rate": 1.1396984924623116e-05, "loss": 1.2977, "step": 62200 }, { "epoch": 1.56, "grad_norm": 10.5234375, "learning_rate": 1.1366834170854273e-05, "loss": 1.3281, "step": 62300 }, { "epoch": 1.56, "grad_norm": 6.87890625, "learning_rate": 1.1336683417085428e-05, "loss": 1.2998, "step": 62400 }, { "epoch": 1.56, "grad_norm": 5.69921875, "learning_rate": 1.1306532663316583e-05, "loss": 1.2484, "step": 62500 }, { "epoch": 1.56, "eval_loss": 1.3006246089935303, "eval_runtime": 58.2951, "eval_samples_per_second": 17.154, "eval_steps_per_second": 4.289, "step": 62500 }, { "epoch": 1.56, "grad_norm": 6.99609375, "learning_rate": 1.1276381909547739e-05, "loss": 1.2923, "step": 62600 }, { "epoch": 1.57, "grad_norm": 7.94140625, "learning_rate": 1.1246231155778896e-05, "loss": 1.2975, "step": 62700 }, { "epoch": 1.57, "grad_norm": 6.96484375, "learning_rate": 1.121608040201005e-05, "loss": 1.2703, "step": 62800 }, { "epoch": 1.57, "grad_norm": 6.7578125, "learning_rate": 1.1185929648241208e-05, "loss": 1.3221, "step": 62900 }, { "epoch": 1.57, "grad_norm": 6.00390625, "learning_rate": 1.1155778894472361e-05, "loss": 1.3186, "step": 63000 }, { "epoch": 1.58, "grad_norm": 4.48828125, "learning_rate": 1.1125628140703517e-05, "loss": 1.2878, "step": 63100 }, { "epoch": 1.58, "grad_norm": 5.16015625, "learning_rate": 1.1095477386934674e-05, "loss": 1.302, "step": 63200 }, { "epoch": 1.58, "grad_norm": 10.0546875, "learning_rate": 1.1065326633165829e-05, "loss": 1.2607, "step": 63300 }, { "epoch": 1.58, "grad_norm": 4.77734375, "learning_rate": 1.1035175879396986e-05, "loss": 1.3128, "step": 63400 }, { "epoch": 1.59, "grad_norm": 4.2578125, "learning_rate": 1.1005025125628141e-05, "loss": 1.2557, "step": 63500 }, { "epoch": 1.59, "grad_norm": 11.1015625, "learning_rate": 1.0974874371859296e-05, "loss": 1.3114, "step": 63600 }, { "epoch": 1.59, "grad_norm": 2.9296875, "learning_rate": 1.0944723618090452e-05, "loss": 1.2991, "step": 63700 }, { "epoch": 1.59, "grad_norm": 7.55859375, "learning_rate": 1.0914572864321609e-05, "loss": 1.3479, "step": 63800 }, { "epoch": 1.6, "grad_norm": 4.71875, "learning_rate": 1.0884422110552764e-05, "loss": 1.2951, "step": 63900 }, { "epoch": 1.6, "grad_norm": 5.328125, "learning_rate": 1.085427135678392e-05, "loss": 1.3348, "step": 64000 }, { "epoch": 1.6, "grad_norm": 6.984375, "learning_rate": 1.0824120603015076e-05, "loss": 1.2897, "step": 64100 }, { "epoch": 1.6, "grad_norm": 3.748046875, "learning_rate": 1.0793969849246231e-05, "loss": 1.3096, "step": 64200 }, { "epoch": 1.61, "grad_norm": 7.83984375, "learning_rate": 1.0763819095477387e-05, "loss": 1.2854, "step": 64300 }, { "epoch": 1.61, "grad_norm": 4.6640625, "learning_rate": 1.0733668341708544e-05, "loss": 1.2507, "step": 64400 }, { "epoch": 1.61, "grad_norm": 7.77734375, "learning_rate": 1.0703517587939699e-05, "loss": 1.271, "step": 64500 }, { "epoch": 1.61, "grad_norm": 6.4765625, "learning_rate": 1.0673366834170856e-05, "loss": 1.2176, "step": 64600 }, { "epoch": 1.62, "grad_norm": 6.984375, "learning_rate": 1.064321608040201e-05, "loss": 1.301, "step": 64700 }, { "epoch": 1.62, "grad_norm": 5.8203125, "learning_rate": 1.0613065326633166e-05, "loss": 1.3127, "step": 64800 }, { "epoch": 1.62, "grad_norm": 6.3671875, "learning_rate": 1.0582914572864322e-05, "loss": 1.2781, "step": 64900 }, { "epoch": 1.62, "grad_norm": 7.41015625, "learning_rate": 1.0552763819095479e-05, "loss": 1.3218, "step": 65000 }, { "epoch": 1.62, "eval_loss": 1.321094274520874, "eval_runtime": 58.4017, "eval_samples_per_second": 17.123, "eval_steps_per_second": 4.281, "step": 65000 }, { "epoch": 1.63, "grad_norm": 7.20703125, "learning_rate": 1.0522613065326634e-05, "loss": 1.2238, "step": 65100 }, { "epoch": 1.63, "grad_norm": 7.1640625, "learning_rate": 1.049246231155779e-05, "loss": 1.2775, "step": 65200 }, { "epoch": 1.63, "grad_norm": 4.03125, "learning_rate": 1.0462311557788944e-05, "loss": 1.3113, "step": 65300 }, { "epoch": 1.64, "grad_norm": 10.484375, "learning_rate": 1.04321608040201e-05, "loss": 1.2948, "step": 65400 }, { "epoch": 1.64, "grad_norm": 5.609375, "learning_rate": 1.0402010050251257e-05, "loss": 1.3212, "step": 65500 }, { "epoch": 1.64, "grad_norm": 3.39453125, "learning_rate": 1.0371859296482412e-05, "loss": 1.2958, "step": 65600 }, { "epoch": 1.64, "grad_norm": 5.3828125, "learning_rate": 1.0341708542713569e-05, "loss": 1.2665, "step": 65700 }, { "epoch": 1.65, "grad_norm": 6.5390625, "learning_rate": 1.0311557788944724e-05, "loss": 1.2708, "step": 65800 }, { "epoch": 1.65, "grad_norm": 9.9296875, "learning_rate": 1.028140703517588e-05, "loss": 1.3221, "step": 65900 }, { "epoch": 1.65, "grad_norm": 6.21484375, "learning_rate": 1.0251256281407035e-05, "loss": 1.2757, "step": 66000 }, { "epoch": 1.65, "grad_norm": 5.140625, "learning_rate": 1.0221105527638192e-05, "loss": 1.2619, "step": 66100 }, { "epoch": 1.66, "grad_norm": 4.375, "learning_rate": 1.0190954773869347e-05, "loss": 1.3567, "step": 66200 }, { "epoch": 1.66, "grad_norm": 5.93359375, "learning_rate": 1.0160804020100504e-05, "loss": 1.2923, "step": 66300 }, { "epoch": 1.66, "grad_norm": 6.234375, "learning_rate": 1.0130653266331657e-05, "loss": 1.2919, "step": 66400 }, { "epoch": 1.66, "grad_norm": 6.51171875, "learning_rate": 1.0100502512562814e-05, "loss": 1.2798, "step": 66500 }, { "epoch": 1.67, "grad_norm": 7.75, "learning_rate": 1.007035175879397e-05, "loss": 1.2884, "step": 66600 }, { "epoch": 1.67, "grad_norm": 5.21875, "learning_rate": 1.0040201005025127e-05, "loss": 1.2824, "step": 66700 }, { "epoch": 1.67, "grad_norm": 5.703125, "learning_rate": 1.0010050251256282e-05, "loss": 1.272, "step": 66800 }, { "epoch": 1.67, "grad_norm": 6.50390625, "learning_rate": 9.979899497487439e-06, "loss": 1.3051, "step": 66900 }, { "epoch": 1.68, "grad_norm": 5.8046875, "learning_rate": 9.949748743718592e-06, "loss": 1.2931, "step": 67000 }, { "epoch": 1.68, "grad_norm": 6.51171875, "learning_rate": 9.91959798994975e-06, "loss": 1.2462, "step": 67100 }, { "epoch": 1.68, "grad_norm": 5.41015625, "learning_rate": 9.889447236180905e-06, "loss": 1.262, "step": 67200 }, { "epoch": 1.68, "grad_norm": 5.578125, "learning_rate": 9.859296482412062e-06, "loss": 1.2758, "step": 67300 }, { "epoch": 1.69, "grad_norm": 7.79296875, "learning_rate": 9.829145728643217e-06, "loss": 1.2548, "step": 67400 }, { "epoch": 1.69, "grad_norm": 7.046875, "learning_rate": 9.798994974874372e-06, "loss": 1.3057, "step": 67500 }, { "epoch": 1.69, "eval_loss": 1.3317267894744873, "eval_runtime": 58.3095, "eval_samples_per_second": 17.15, "eval_steps_per_second": 4.287, "step": 67500 }, { "epoch": 1.69, "grad_norm": 9.78125, "learning_rate": 9.768844221105527e-06, "loss": 1.2899, "step": 67600 }, { "epoch": 1.69, "grad_norm": 5.09765625, "learning_rate": 9.738693467336683e-06, "loss": 1.2609, "step": 67700 }, { "epoch": 1.69, "grad_norm": 8.0859375, "learning_rate": 9.70854271356784e-06, "loss": 1.3006, "step": 67800 }, { "epoch": 1.7, "grad_norm": 5.0234375, "learning_rate": 9.678391959798995e-06, "loss": 1.275, "step": 67900 }, { "epoch": 1.7, "grad_norm": 5.44921875, "learning_rate": 9.648241206030152e-06, "loss": 1.2872, "step": 68000 }, { "epoch": 1.7, "grad_norm": 5.50390625, "learning_rate": 9.618090452261305e-06, "loss": 1.316, "step": 68100 }, { "epoch": 1.71, "grad_norm": 3.345703125, "learning_rate": 9.587939698492462e-06, "loss": 1.295, "step": 68200 }, { "epoch": 1.71, "grad_norm": 9.9140625, "learning_rate": 9.557788944723618e-06, "loss": 1.2607, "step": 68300 }, { "epoch": 1.71, "grad_norm": 6.43359375, "learning_rate": 9.527638190954775e-06, "loss": 1.2936, "step": 68400 }, { "epoch": 1.71, "grad_norm": 7.26953125, "learning_rate": 9.49748743718593e-06, "loss": 1.299, "step": 68500 }, { "epoch": 1.71, "grad_norm": 5.04296875, "learning_rate": 9.467336683417087e-06, "loss": 1.2669, "step": 68600 }, { "epoch": 1.72, "grad_norm": 4.80859375, "learning_rate": 9.43718592964824e-06, "loss": 1.2794, "step": 68700 }, { "epoch": 1.72, "grad_norm": 9.578125, "learning_rate": 9.407035175879397e-06, "loss": 1.2817, "step": 68800 }, { "epoch": 1.72, "grad_norm": 11.0546875, "learning_rate": 9.376884422110553e-06, "loss": 1.215, "step": 68900 }, { "epoch": 1.73, "grad_norm": 6.125, "learning_rate": 9.34673366834171e-06, "loss": 1.1854, "step": 69000 }, { "epoch": 1.73, "grad_norm": 6.05859375, "learning_rate": 9.316582914572865e-06, "loss": 1.2695, "step": 69100 }, { "epoch": 1.73, "grad_norm": 9.546875, "learning_rate": 9.28643216080402e-06, "loss": 1.2513, "step": 69200 }, { "epoch": 1.73, "grad_norm": 6.578125, "learning_rate": 9.256281407035175e-06, "loss": 1.2677, "step": 69300 }, { "epoch": 1.73, "grad_norm": 7.69921875, "learning_rate": 9.226130653266332e-06, "loss": 1.3259, "step": 69400 }, { "epoch": 1.74, "grad_norm": 4.6171875, "learning_rate": 9.195979899497488e-06, "loss": 1.2217, "step": 69500 }, { "epoch": 1.74, "grad_norm": 6.1484375, "learning_rate": 9.165829145728645e-06, "loss": 1.254, "step": 69600 }, { "epoch": 1.74, "grad_norm": 6.375, "learning_rate": 9.1356783919598e-06, "loss": 1.2685, "step": 69700 }, { "epoch": 1.75, "grad_norm": 5.44921875, "learning_rate": 9.105527638190955e-06, "loss": 1.1982, "step": 69800 }, { "epoch": 1.75, "grad_norm": 5.57421875, "learning_rate": 9.07537688442211e-06, "loss": 1.2908, "step": 69900 }, { "epoch": 1.75, "grad_norm": 6.02734375, "learning_rate": 9.045226130653266e-06, "loss": 1.2438, "step": 70000 }, { "epoch": 1.75, "eval_loss": 1.229135513305664, "eval_runtime": 58.2748, "eval_samples_per_second": 17.16, "eval_steps_per_second": 4.29, "step": 70000 }, { "epoch": 1.75, "grad_norm": 6.41796875, "learning_rate": 9.015075376884423e-06, "loss": 1.2803, "step": 70100 }, { "epoch": 1.75, "grad_norm": 7.65625, "learning_rate": 8.984924623115578e-06, "loss": 1.2674, "step": 70200 }, { "epoch": 1.76, "grad_norm": 11.1796875, "learning_rate": 8.954773869346735e-06, "loss": 1.2702, "step": 70300 }, { "epoch": 1.76, "grad_norm": 11.6015625, "learning_rate": 8.924623115577888e-06, "loss": 1.3009, "step": 70400 }, { "epoch": 1.76, "grad_norm": 6.42578125, "learning_rate": 8.894472361809045e-06, "loss": 1.2585, "step": 70500 }, { "epoch": 1.77, "grad_norm": 8.9921875, "learning_rate": 8.8643216080402e-06, "loss": 1.2714, "step": 70600 }, { "epoch": 1.77, "grad_norm": 6.359375, "learning_rate": 8.834170854271358e-06, "loss": 1.2657, "step": 70700 }, { "epoch": 1.77, "grad_norm": 4.984375, "learning_rate": 8.804020100502513e-06, "loss": 1.2992, "step": 70800 }, { "epoch": 1.77, "grad_norm": 8.53125, "learning_rate": 8.773869346733668e-06, "loss": 1.2252, "step": 70900 }, { "epoch": 1.77, "grad_norm": 5.296875, "learning_rate": 8.743718592964823e-06, "loss": 1.2618, "step": 71000 }, { "epoch": 1.78, "grad_norm": 8.515625, "learning_rate": 8.71356783919598e-06, "loss": 1.2973, "step": 71100 }, { "epoch": 1.78, "grad_norm": 9.8828125, "learning_rate": 8.683417085427136e-06, "loss": 1.2785, "step": 71200 }, { "epoch": 1.78, "grad_norm": 7.08203125, "learning_rate": 8.653266331658293e-06, "loss": 1.2496, "step": 71300 }, { "epoch": 1.79, "grad_norm": 7.35546875, "learning_rate": 8.623115577889448e-06, "loss": 1.2757, "step": 71400 }, { "epoch": 1.79, "grad_norm": 7.109375, "learning_rate": 8.592964824120603e-06, "loss": 1.283, "step": 71500 }, { "epoch": 1.79, "grad_norm": 8.9765625, "learning_rate": 8.562814070351758e-06, "loss": 1.2705, "step": 71600 }, { "epoch": 1.79, "grad_norm": 5.4921875, "learning_rate": 8.532663316582915e-06, "loss": 1.2593, "step": 71700 }, { "epoch": 1.79, "grad_norm": 6.984375, "learning_rate": 8.50251256281407e-06, "loss": 1.2839, "step": 71800 }, { "epoch": 1.8, "grad_norm": 4.89453125, "learning_rate": 8.472361809045228e-06, "loss": 1.2628, "step": 71900 }, { "epoch": 1.8, "grad_norm": 6.7265625, "learning_rate": 8.442211055276383e-06, "loss": 1.3171, "step": 72000 }, { "epoch": 1.8, "grad_norm": 6.06640625, "learning_rate": 8.412060301507538e-06, "loss": 1.2768, "step": 72100 }, { "epoch": 1.81, "grad_norm": 8.0546875, "learning_rate": 8.381909547738693e-06, "loss": 1.3066, "step": 72200 }, { "epoch": 1.81, "grad_norm": 5.91796875, "learning_rate": 8.351758793969849e-06, "loss": 1.2368, "step": 72300 }, { "epoch": 1.81, "grad_norm": 8.5625, "learning_rate": 8.321608040201006e-06, "loss": 1.2342, "step": 72400 }, { "epoch": 1.81, "grad_norm": 7.94921875, "learning_rate": 8.291457286432161e-06, "loss": 1.2615, "step": 72500 }, { "epoch": 1.81, "eval_loss": 1.2854846715927124, "eval_runtime": 58.3319, "eval_samples_per_second": 17.143, "eval_steps_per_second": 4.286, "step": 72500 }, { "epoch": 1.81, "grad_norm": 13.90625, "learning_rate": 8.261306532663316e-06, "loss": 1.2859, "step": 72600 }, { "epoch": 1.82, "grad_norm": 7.37890625, "learning_rate": 8.231155778894471e-06, "loss": 1.2373, "step": 72700 }, { "epoch": 1.82, "grad_norm": 6.78515625, "learning_rate": 8.201005025125628e-06, "loss": 1.2577, "step": 72800 }, { "epoch": 1.82, "grad_norm": 8.546875, "learning_rate": 8.170854271356784e-06, "loss": 1.2793, "step": 72900 }, { "epoch": 1.82, "grad_norm": 6.0625, "learning_rate": 8.14070351758794e-06, "loss": 1.3146, "step": 73000 }, { "epoch": 1.83, "grad_norm": 4.19140625, "learning_rate": 8.110552763819096e-06, "loss": 1.2822, "step": 73100 }, { "epoch": 1.83, "grad_norm": 5.15234375, "learning_rate": 8.080402010050251e-06, "loss": 1.2345, "step": 73200 }, { "epoch": 1.83, "grad_norm": 7.6640625, "learning_rate": 8.050251256281406e-06, "loss": 1.2923, "step": 73300 }, { "epoch": 1.83, "grad_norm": 8.1953125, "learning_rate": 8.020100502512563e-06, "loss": 1.2266, "step": 73400 }, { "epoch": 1.84, "grad_norm": 5.65234375, "learning_rate": 7.989949748743719e-06, "loss": 1.27, "step": 73500 }, { "epoch": 1.84, "grad_norm": 6.40625, "learning_rate": 7.959798994974876e-06, "loss": 1.2393, "step": 73600 }, { "epoch": 1.84, "grad_norm": 21.375, "learning_rate": 7.929648241206031e-06, "loss": 1.2509, "step": 73700 }, { "epoch": 1.84, "grad_norm": 6.984375, "learning_rate": 7.899497487437186e-06, "loss": 1.263, "step": 73800 }, { "epoch": 1.85, "grad_norm": 7.5234375, "learning_rate": 7.869346733668341e-06, "loss": 1.2373, "step": 73900 }, { "epoch": 1.85, "grad_norm": 5.546875, "learning_rate": 7.839195979899498e-06, "loss": 1.3024, "step": 74000 }, { "epoch": 1.85, "grad_norm": 6.68359375, "learning_rate": 7.809045226130654e-06, "loss": 1.2432, "step": 74100 }, { "epoch": 1.85, "grad_norm": 9.8046875, "learning_rate": 7.77889447236181e-06, "loss": 1.2423, "step": 74200 }, { "epoch": 1.86, "grad_norm": 7.74609375, "learning_rate": 7.748743718592964e-06, "loss": 1.2678, "step": 74300 }, { "epoch": 1.86, "grad_norm": 8.390625, "learning_rate": 7.718592964824121e-06, "loss": 1.2765, "step": 74400 }, { "epoch": 1.86, "grad_norm": 7.6328125, "learning_rate": 7.688442211055276e-06, "loss": 1.2353, "step": 74500 }, { "epoch": 1.86, "grad_norm": 5.83984375, "learning_rate": 7.658291457286432e-06, "loss": 1.2307, "step": 74600 }, { "epoch": 1.87, "grad_norm": 7.83203125, "learning_rate": 7.628140703517589e-06, "loss": 1.2646, "step": 74700 }, { "epoch": 1.87, "grad_norm": 7.10546875, "learning_rate": 7.597989949748743e-06, "loss": 1.249, "step": 74800 }, { "epoch": 1.87, "grad_norm": 7.12890625, "learning_rate": 7.5678391959799e-06, "loss": 1.1872, "step": 74900 }, { "epoch": 1.88, "grad_norm": 8.1640625, "learning_rate": 7.537688442211054e-06, "loss": 1.2345, "step": 75000 }, { "epoch": 1.88, "eval_loss": 1.2996472120285034, "eval_runtime": 58.363, "eval_samples_per_second": 17.134, "eval_steps_per_second": 4.284, "step": 75000 }, { "epoch": 1.88, "grad_norm": 4.2578125, "learning_rate": 7.507537688442211e-06, "loss": 1.2281, "step": 75100 }, { "epoch": 1.88, "grad_norm": 6.125, "learning_rate": 7.4773869346733675e-06, "loss": 1.2433, "step": 75200 }, { "epoch": 1.88, "grad_norm": 5.61328125, "learning_rate": 7.447236180904523e-06, "loss": 1.2314, "step": 75300 }, { "epoch": 1.89, "grad_norm": 8.28125, "learning_rate": 7.417085427135679e-06, "loss": 1.2677, "step": 75400 }, { "epoch": 1.89, "grad_norm": 7.12109375, "learning_rate": 7.386934673366835e-06, "loss": 1.2832, "step": 75500 }, { "epoch": 1.89, "grad_norm": 7.1796875, "learning_rate": 7.35678391959799e-06, "loss": 1.2156, "step": 75600 }, { "epoch": 1.89, "grad_norm": 6.73828125, "learning_rate": 7.326633165829146e-06, "loss": 1.248, "step": 75700 }, { "epoch": 1.9, "grad_norm": 12.0703125, "learning_rate": 7.296482412060302e-06, "loss": 1.2283, "step": 75800 }, { "epoch": 1.9, "grad_norm": 6.546875, "learning_rate": 7.266331658291457e-06, "loss": 1.2495, "step": 75900 }, { "epoch": 1.9, "grad_norm": 7.31640625, "learning_rate": 7.236180904522613e-06, "loss": 1.1779, "step": 76000 }, { "epoch": 1.9, "grad_norm": 5.79296875, "learning_rate": 7.206030150753769e-06, "loss": 1.2431, "step": 76100 }, { "epoch": 1.91, "grad_norm": 6.67578125, "learning_rate": 7.175879396984924e-06, "loss": 1.1931, "step": 76200 }, { "epoch": 1.91, "grad_norm": 8.65625, "learning_rate": 7.1457286432160805e-06, "loss": 1.2692, "step": 76300 }, { "epoch": 1.91, "grad_norm": 7.60546875, "learning_rate": 7.115577889447236e-06, "loss": 1.2612, "step": 76400 }, { "epoch": 1.91, "grad_norm": 6.47265625, "learning_rate": 7.085427135678392e-06, "loss": 1.247, "step": 76500 }, { "epoch": 1.92, "grad_norm": 8.421875, "learning_rate": 7.055276381909548e-06, "loss": 1.2481, "step": 76600 }, { "epoch": 1.92, "grad_norm": 7.8125, "learning_rate": 7.025125628140703e-06, "loss": 1.1965, "step": 76700 }, { "epoch": 1.92, "grad_norm": 9.15625, "learning_rate": 6.994974874371859e-06, "loss": 1.2125, "step": 76800 }, { "epoch": 1.92, "grad_norm": 9.0, "learning_rate": 6.9648241206030155e-06, "loss": 1.228, "step": 76900 }, { "epoch": 1.93, "grad_norm": 4.01171875, "learning_rate": 6.934673366834171e-06, "loss": 1.2254, "step": 77000 }, { "epoch": 1.93, "grad_norm": 4.84375, "learning_rate": 6.904522613065327e-06, "loss": 1.2786, "step": 77100 }, { "epoch": 1.93, "grad_norm": 7.03125, "learning_rate": 6.874371859296483e-06, "loss": 1.2677, "step": 77200 }, { "epoch": 1.93, "grad_norm": 8.40625, "learning_rate": 6.844221105527638e-06, "loss": 1.2667, "step": 77300 }, { "epoch": 1.94, "grad_norm": 7.8359375, "learning_rate": 6.814070351758794e-06, "loss": 1.2749, "step": 77400 }, { "epoch": 1.94, "grad_norm": 8.234375, "learning_rate": 6.7839195979899505e-06, "loss": 1.252, "step": 77500 }, { "epoch": 1.94, "eval_loss": 1.2479995489120483, "eval_runtime": 58.4081, "eval_samples_per_second": 17.121, "eval_steps_per_second": 4.28, "step": 77500 }, { "epoch": 1.94, "grad_norm": 6.81640625, "learning_rate": 6.753768844221106e-06, "loss": 1.1964, "step": 77600 }, { "epoch": 1.94, "grad_norm": 5.234375, "learning_rate": 6.723618090452262e-06, "loss": 1.2277, "step": 77700 }, { "epoch": 1.94, "grad_norm": 5.734375, "learning_rate": 6.693467336683418e-06, "loss": 1.1941, "step": 77800 }, { "epoch": 1.95, "grad_norm": 7.47265625, "learning_rate": 6.663316582914573e-06, "loss": 1.2202, "step": 77900 }, { "epoch": 1.95, "grad_norm": 7.37109375, "learning_rate": 6.633165829145729e-06, "loss": 1.1958, "step": 78000 }, { "epoch": 1.95, "grad_norm": 5.9609375, "learning_rate": 6.603015075376884e-06, "loss": 1.2519, "step": 78100 }, { "epoch": 1.96, "grad_norm": 7.921875, "learning_rate": 6.57286432160804e-06, "loss": 1.2012, "step": 78200 }, { "epoch": 1.96, "grad_norm": 6.5, "learning_rate": 6.542713567839196e-06, "loss": 1.2375, "step": 78300 }, { "epoch": 1.96, "grad_norm": 5.31640625, "learning_rate": 6.512562814070351e-06, "loss": 1.2287, "step": 78400 }, { "epoch": 1.96, "grad_norm": 8.6796875, "learning_rate": 6.482412060301507e-06, "loss": 1.2034, "step": 78500 }, { "epoch": 1.96, "grad_norm": 6.12890625, "learning_rate": 6.4522613065326635e-06, "loss": 1.2434, "step": 78600 }, { "epoch": 1.97, "grad_norm": 5.67578125, "learning_rate": 6.422110552763819e-06, "loss": 1.2209, "step": 78700 }, { "epoch": 1.97, "grad_norm": 5.1640625, "learning_rate": 6.391959798994975e-06, "loss": 1.2706, "step": 78800 }, { "epoch": 1.97, "grad_norm": 10.0703125, "learning_rate": 6.361809045226131e-06, "loss": 1.2471, "step": 78900 }, { "epoch": 1.98, "grad_norm": 5.3359375, "learning_rate": 6.331658291457286e-06, "loss": 1.2074, "step": 79000 }, { "epoch": 1.98, "grad_norm": 8.0234375, "learning_rate": 6.301507537688442e-06, "loss": 1.2271, "step": 79100 }, { "epoch": 1.98, "grad_norm": 12.015625, "learning_rate": 6.2713567839195985e-06, "loss": 1.2533, "step": 79200 }, { "epoch": 1.98, "grad_norm": 7.5078125, "learning_rate": 6.241206030150754e-06, "loss": 1.2112, "step": 79300 }, { "epoch": 1.98, "grad_norm": 6.16796875, "learning_rate": 6.21105527638191e-06, "loss": 1.2349, "step": 79400 }, { "epoch": 1.99, "grad_norm": 6.37109375, "learning_rate": 6.180904522613066e-06, "loss": 1.2972, "step": 79500 }, { "epoch": 1.99, "grad_norm": 5.58984375, "learning_rate": 6.150753768844221e-06, "loss": 1.2197, "step": 79600 }, { "epoch": 1.99, "grad_norm": 5.625, "learning_rate": 6.120603015075377e-06, "loss": 1.2149, "step": 79700 }, { "epoch": 2.0, "grad_norm": 7.828125, "learning_rate": 6.090452261306533e-06, "loss": 1.2589, "step": 79800 }, { "epoch": 2.0, "grad_norm": 8.46875, "learning_rate": 6.060301507537689e-06, "loss": 1.2533, "step": 79900 }, { "epoch": 2.0, "grad_norm": 6.96484375, "learning_rate": 6.030150753768845e-06, "loss": 1.148, "step": 80000 }, { "epoch": 2.0, "eval_loss": 1.2425081729888916, "eval_runtime": 58.3641, "eval_samples_per_second": 17.134, "eval_steps_per_second": 4.283, "step": 80000 }, { "epoch": 2.0, "grad_norm": 7.7265625, "learning_rate": 6e-06, "loss": 1.2136, "step": 80100 }, { "epoch": 2.0, "grad_norm": 14.2734375, "learning_rate": 5.969849246231156e-06, "loss": 1.1365, "step": 80200 }, { "epoch": 2.01, "grad_norm": 7.55078125, "learning_rate": 5.939698492462312e-06, "loss": 1.2502, "step": 80300 }, { "epoch": 2.01, "grad_norm": 9.328125, "learning_rate": 5.909547738693467e-06, "loss": 1.2155, "step": 80400 }, { "epoch": 2.01, "grad_norm": 6.2109375, "learning_rate": 5.879396984924623e-06, "loss": 1.1532, "step": 80500 }, { "epoch": 2.02, "grad_norm": 9.2421875, "learning_rate": 5.849246231155779e-06, "loss": 1.2078, "step": 80600 }, { "epoch": 2.02, "grad_norm": 7.6484375, "learning_rate": 5.819095477386934e-06, "loss": 1.2296, "step": 80700 }, { "epoch": 2.02, "grad_norm": 6.55078125, "learning_rate": 5.78894472361809e-06, "loss": 1.2095, "step": 80800 }, { "epoch": 2.02, "grad_norm": 8.6640625, "learning_rate": 5.7587939698492465e-06, "loss": 1.1459, "step": 80900 }, { "epoch": 2.02, "grad_norm": 4.33203125, "learning_rate": 5.728643216080402e-06, "loss": 1.1374, "step": 81000 }, { "epoch": 2.03, "grad_norm": 5.7109375, "learning_rate": 5.698492462311558e-06, "loss": 1.2141, "step": 81100 }, { "epoch": 2.03, "grad_norm": 7.6484375, "learning_rate": 5.668341708542714e-06, "loss": 1.1913, "step": 81200 }, { "epoch": 2.03, "grad_norm": 6.828125, "learning_rate": 5.638190954773869e-06, "loss": 1.2124, "step": 81300 }, { "epoch": 2.04, "grad_norm": 6.52734375, "learning_rate": 5.608040201005025e-06, "loss": 1.2935, "step": 81400 }, { "epoch": 2.04, "grad_norm": 8.8203125, "learning_rate": 5.577889447236181e-06, "loss": 1.217, "step": 81500 }, { "epoch": 2.04, "grad_norm": 7.5078125, "learning_rate": 5.547738693467337e-06, "loss": 1.2339, "step": 81600 }, { "epoch": 2.04, "grad_norm": 8.1171875, "learning_rate": 5.517587939698493e-06, "loss": 1.19, "step": 81700 }, { "epoch": 2.04, "grad_norm": 7.6015625, "learning_rate": 5.487437185929648e-06, "loss": 1.1577, "step": 81800 }, { "epoch": 2.05, "grad_norm": 7.97265625, "learning_rate": 5.457286432160804e-06, "loss": 1.1531, "step": 81900 }, { "epoch": 2.05, "grad_norm": 9.7265625, "learning_rate": 5.42713567839196e-06, "loss": 1.2609, "step": 82000 }, { "epoch": 2.05, "grad_norm": 10.296875, "learning_rate": 5.396984924623116e-06, "loss": 1.2294, "step": 82100 }, { "epoch": 2.06, "grad_norm": 14.6875, "learning_rate": 5.366834170854272e-06, "loss": 1.1506, "step": 82200 }, { "epoch": 2.06, "grad_norm": 10.59375, "learning_rate": 5.336683417085428e-06, "loss": 1.207, "step": 82300 }, { "epoch": 2.06, "grad_norm": 6.87109375, "learning_rate": 5.306532663316583e-06, "loss": 1.1686, "step": 82400 }, { "epoch": 2.06, "grad_norm": 10.375, "learning_rate": 5.276381909547739e-06, "loss": 1.2497, "step": 82500 }, { "epoch": 2.06, "eval_loss": 1.2355402708053589, "eval_runtime": 58.4521, "eval_samples_per_second": 17.108, "eval_steps_per_second": 4.277, "step": 82500 }, { "epoch": 2.06, "grad_norm": 6.8125, "learning_rate": 5.246231155778895e-06, "loss": 1.2069, "step": 82600 }, { "epoch": 2.07, "grad_norm": 7.6953125, "learning_rate": 5.21608040201005e-06, "loss": 1.2088, "step": 82700 }, { "epoch": 2.07, "grad_norm": 7.3671875, "learning_rate": 5.185929648241206e-06, "loss": 1.193, "step": 82800 }, { "epoch": 2.07, "grad_norm": 5.32421875, "learning_rate": 5.155778894472362e-06, "loss": 1.2365, "step": 82900 }, { "epoch": 2.08, "grad_norm": 5.71484375, "learning_rate": 5.125628140703517e-06, "loss": 1.1839, "step": 83000 }, { "epoch": 2.08, "grad_norm": 7.9140625, "learning_rate": 5.095477386934673e-06, "loss": 1.2174, "step": 83100 }, { "epoch": 2.08, "grad_norm": 8.7265625, "learning_rate": 5.065326633165829e-06, "loss": 1.1731, "step": 83200 }, { "epoch": 2.08, "grad_norm": 9.109375, "learning_rate": 5.035175879396985e-06, "loss": 1.1784, "step": 83300 }, { "epoch": 2.08, "grad_norm": 13.3125, "learning_rate": 5.005025125628141e-06, "loss": 1.1334, "step": 83400 }, { "epoch": 2.09, "grad_norm": 8.953125, "learning_rate": 4.974874371859296e-06, "loss": 1.1308, "step": 83500 }, { "epoch": 2.09, "grad_norm": 5.03515625, "learning_rate": 4.944723618090452e-06, "loss": 1.1502, "step": 83600 }, { "epoch": 2.09, "grad_norm": 6.67578125, "learning_rate": 4.914572864321608e-06, "loss": 1.1708, "step": 83700 }, { "epoch": 2.1, "grad_norm": 6.328125, "learning_rate": 4.884422110552764e-06, "loss": 1.152, "step": 83800 }, { "epoch": 2.1, "grad_norm": 6.3203125, "learning_rate": 4.85427135678392e-06, "loss": 1.1794, "step": 83900 }, { "epoch": 2.1, "grad_norm": 5.12890625, "learning_rate": 4.824120603015076e-06, "loss": 1.1732, "step": 84000 }, { "epoch": 2.1, "grad_norm": 7.3125, "learning_rate": 4.793969849246231e-06, "loss": 1.1665, "step": 84100 }, { "epoch": 2.1, "grad_norm": 15.8359375, "learning_rate": 4.763819095477387e-06, "loss": 1.2151, "step": 84200 }, { "epoch": 2.11, "grad_norm": 8.4453125, "learning_rate": 4.733668341708543e-06, "loss": 1.208, "step": 84300 }, { "epoch": 2.11, "grad_norm": 9.8828125, "learning_rate": 4.703517587939699e-06, "loss": 1.1756, "step": 84400 }, { "epoch": 2.11, "grad_norm": 6.1171875, "learning_rate": 4.673366834170855e-06, "loss": 1.1569, "step": 84500 }, { "epoch": 2.12, "grad_norm": 7.6953125, "learning_rate": 4.64321608040201e-06, "loss": 1.1985, "step": 84600 }, { "epoch": 2.12, "grad_norm": 9.2734375, "learning_rate": 4.613065326633166e-06, "loss": 1.1403, "step": 84700 }, { "epoch": 2.12, "grad_norm": 7.54296875, "learning_rate": 4.582914572864322e-06, "loss": 1.1365, "step": 84800 }, { "epoch": 2.12, "grad_norm": 10.5, "learning_rate": 4.5527638190954775e-06, "loss": 1.2374, "step": 84900 }, { "epoch": 2.12, "grad_norm": 7.4609375, "learning_rate": 4.522613065326633e-06, "loss": 1.1565, "step": 85000 }, { "epoch": 2.12, "eval_loss": 1.2156389951705933, "eval_runtime": 58.3222, "eval_samples_per_second": 17.146, "eval_steps_per_second": 4.287, "step": 85000 }, { "epoch": 2.13, "grad_norm": 5.97265625, "learning_rate": 4.492462311557789e-06, "loss": 1.2103, "step": 85100 }, { "epoch": 2.13, "grad_norm": 4.7109375, "learning_rate": 4.462311557788944e-06, "loss": 1.2058, "step": 85200 }, { "epoch": 2.13, "grad_norm": 9.140625, "learning_rate": 4.4321608040201e-06, "loss": 1.1758, "step": 85300 }, { "epoch": 2.13, "grad_norm": 6.52734375, "learning_rate": 4.4020100502512564e-06, "loss": 1.197, "step": 85400 }, { "epoch": 2.14, "grad_norm": 6.92578125, "learning_rate": 4.371859296482412e-06, "loss": 1.176, "step": 85500 }, { "epoch": 2.14, "grad_norm": 9.8984375, "learning_rate": 4.341708542713568e-06, "loss": 1.2176, "step": 85600 }, { "epoch": 2.14, "grad_norm": 6.609375, "learning_rate": 4.311557788944724e-06, "loss": 1.1897, "step": 85700 }, { "epoch": 2.15, "grad_norm": 5.81640625, "learning_rate": 4.281407035175879e-06, "loss": 1.1756, "step": 85800 }, { "epoch": 2.15, "grad_norm": 7.0703125, "learning_rate": 4.251256281407035e-06, "loss": 1.1818, "step": 85900 }, { "epoch": 2.15, "grad_norm": 7.296875, "learning_rate": 4.221105527638191e-06, "loss": 1.1527, "step": 86000 }, { "epoch": 2.15, "grad_norm": 4.29296875, "learning_rate": 4.190954773869347e-06, "loss": 1.1846, "step": 86100 }, { "epoch": 2.15, "grad_norm": 13.1875, "learning_rate": 4.160804020100503e-06, "loss": 1.2005, "step": 86200 }, { "epoch": 2.16, "grad_norm": 7.1328125, "learning_rate": 4.130653266331658e-06, "loss": 1.183, "step": 86300 }, { "epoch": 2.16, "grad_norm": 4.86328125, "learning_rate": 4.100502512562814e-06, "loss": 1.1729, "step": 86400 }, { "epoch": 2.16, "grad_norm": 6.046875, "learning_rate": 4.07035175879397e-06, "loss": 1.1052, "step": 86500 }, { "epoch": 2.17, "grad_norm": 5.03125, "learning_rate": 4.0402010050251256e-06, "loss": 1.181, "step": 86600 }, { "epoch": 2.17, "grad_norm": 9.9609375, "learning_rate": 4.010050251256282e-06, "loss": 1.2041, "step": 86700 }, { "epoch": 2.17, "grad_norm": 7.140625, "learning_rate": 3.979899497487438e-06, "loss": 1.1897, "step": 86800 }, { "epoch": 2.17, "grad_norm": 5.45703125, "learning_rate": 3.949748743718593e-06, "loss": 1.2078, "step": 86900 }, { "epoch": 2.17, "grad_norm": 7.04296875, "learning_rate": 3.919597989949749e-06, "loss": 1.0977, "step": 87000 }, { "epoch": 2.18, "grad_norm": 6.44140625, "learning_rate": 3.889447236180905e-06, "loss": 1.209, "step": 87100 }, { "epoch": 2.18, "grad_norm": 6.3046875, "learning_rate": 3.8592964824120606e-06, "loss": 1.1484, "step": 87200 }, { "epoch": 2.18, "grad_norm": 15.6484375, "learning_rate": 3.829145728643216e-06, "loss": 1.2224, "step": 87300 }, { "epoch": 2.19, "grad_norm": 6.73046875, "learning_rate": 3.7989949748743715e-06, "loss": 1.1861, "step": 87400 }, { "epoch": 2.19, "grad_norm": 5.4921875, "learning_rate": 3.768844221105527e-06, "loss": 1.1503, "step": 87500 }, { "epoch": 2.19, "eval_loss": 1.2061641216278076, "eval_runtime": 58.4078, "eval_samples_per_second": 17.121, "eval_steps_per_second": 4.28, "step": 87500 }, { "epoch": 2.19, "grad_norm": 9.4921875, "learning_rate": 3.7386934673366837e-06, "loss": 1.2169, "step": 87600 }, { "epoch": 2.19, "grad_norm": 14.7265625, "learning_rate": 3.7085427135678394e-06, "loss": 1.1826, "step": 87700 }, { "epoch": 2.19, "grad_norm": 7.20703125, "learning_rate": 3.678391959798995e-06, "loss": 1.2399, "step": 87800 }, { "epoch": 2.2, "grad_norm": 5.81640625, "learning_rate": 3.648241206030151e-06, "loss": 1.1242, "step": 87900 }, { "epoch": 2.2, "grad_norm": 8.4375, "learning_rate": 3.6180904522613065e-06, "loss": 1.1827, "step": 88000 }, { "epoch": 2.2, "grad_norm": 4.69921875, "learning_rate": 3.587939698492462e-06, "loss": 1.2243, "step": 88100 }, { "epoch": 2.21, "grad_norm": 7.3671875, "learning_rate": 3.557788944723618e-06, "loss": 1.2145, "step": 88200 }, { "epoch": 2.21, "grad_norm": 7.70703125, "learning_rate": 3.527638190954774e-06, "loss": 1.2503, "step": 88300 }, { "epoch": 2.21, "grad_norm": 5.734375, "learning_rate": 3.4974874371859297e-06, "loss": 1.2107, "step": 88400 }, { "epoch": 2.21, "grad_norm": 5.58984375, "learning_rate": 3.4673366834170854e-06, "loss": 1.1683, "step": 88500 }, { "epoch": 2.21, "grad_norm": 7.44921875, "learning_rate": 3.4371859296482415e-06, "loss": 1.1781, "step": 88600 }, { "epoch": 2.22, "grad_norm": 8.6953125, "learning_rate": 3.407035175879397e-06, "loss": 1.175, "step": 88700 }, { "epoch": 2.22, "grad_norm": 5.56640625, "learning_rate": 3.376884422110553e-06, "loss": 1.1351, "step": 88800 }, { "epoch": 2.22, "grad_norm": 7.08984375, "learning_rate": 3.346733668341709e-06, "loss": 1.1804, "step": 88900 }, { "epoch": 2.23, "grad_norm": 7.60546875, "learning_rate": 3.3165829145728647e-06, "loss": 1.2199, "step": 89000 }, { "epoch": 2.23, "grad_norm": 7.09765625, "learning_rate": 3.28643216080402e-06, "loss": 1.1729, "step": 89100 }, { "epoch": 2.23, "grad_norm": 6.39453125, "learning_rate": 3.2562814070351756e-06, "loss": 1.1395, "step": 89200 }, { "epoch": 2.23, "grad_norm": 8.7734375, "learning_rate": 3.2261306532663318e-06, "loss": 1.2157, "step": 89300 }, { "epoch": 2.23, "grad_norm": 8.4765625, "learning_rate": 3.1959798994974875e-06, "loss": 1.2263, "step": 89400 }, { "epoch": 2.24, "grad_norm": 8.765625, "learning_rate": 3.165829145728643e-06, "loss": 1.175, "step": 89500 }, { "epoch": 2.24, "grad_norm": 8.390625, "learning_rate": 3.1356783919597993e-06, "loss": 1.224, "step": 89600 }, { "epoch": 2.24, "grad_norm": 7.52734375, "learning_rate": 3.105527638190955e-06, "loss": 1.1259, "step": 89700 }, { "epoch": 2.25, "grad_norm": 4.87890625, "learning_rate": 3.0753768844221106e-06, "loss": 1.1547, "step": 89800 }, { "epoch": 2.25, "grad_norm": 6.99609375, "learning_rate": 3.0452261306532663e-06, "loss": 1.1514, "step": 89900 }, { "epoch": 2.25, "grad_norm": 10.671875, "learning_rate": 3.0150753768844224e-06, "loss": 1.1635, "step": 90000 }, { "epoch": 2.25, "eval_loss": 1.2134754657745361, "eval_runtime": 58.4436, "eval_samples_per_second": 17.111, "eval_steps_per_second": 4.278, "step": 90000 }, { "epoch": 2.25, "grad_norm": 7.99609375, "learning_rate": 2.984924623115578e-06, "loss": 1.1521, "step": 90100 }, { "epoch": 2.25, "grad_norm": 5.59375, "learning_rate": 2.9547738693467334e-06, "loss": 1.1633, "step": 90200 }, { "epoch": 2.26, "grad_norm": 6.43359375, "learning_rate": 2.9246231155778895e-06, "loss": 1.1356, "step": 90300 }, { "epoch": 2.26, "grad_norm": 9.6875, "learning_rate": 2.894472361809045e-06, "loss": 1.1495, "step": 90400 }, { "epoch": 2.26, "grad_norm": 11.3359375, "learning_rate": 2.864321608040201e-06, "loss": 1.1602, "step": 90500 }, { "epoch": 2.27, "grad_norm": 5.234375, "learning_rate": 2.834170854271357e-06, "loss": 1.1457, "step": 90600 }, { "epoch": 2.27, "grad_norm": 7.78515625, "learning_rate": 2.8040201005025127e-06, "loss": 1.1323, "step": 90700 }, { "epoch": 2.27, "grad_norm": 8.3203125, "learning_rate": 2.7738693467336684e-06, "loss": 1.166, "step": 90800 }, { "epoch": 2.27, "grad_norm": 7.22265625, "learning_rate": 2.743718592964824e-06, "loss": 1.2096, "step": 90900 }, { "epoch": 2.27, "grad_norm": 6.0, "learning_rate": 2.71356783919598e-06, "loss": 1.1357, "step": 91000 }, { "epoch": 2.28, "grad_norm": 6.1875, "learning_rate": 2.683417085427136e-06, "loss": 1.1882, "step": 91100 }, { "epoch": 2.28, "grad_norm": 14.40625, "learning_rate": 2.6532663316582916e-06, "loss": 1.1675, "step": 91200 }, { "epoch": 2.28, "grad_norm": 5.90234375, "learning_rate": 2.6231155778894477e-06, "loss": 1.1491, "step": 91300 }, { "epoch": 2.29, "grad_norm": 8.15625, "learning_rate": 2.592964824120603e-06, "loss": 1.2387, "step": 91400 }, { "epoch": 2.29, "grad_norm": 7.91015625, "learning_rate": 2.5628140703517587e-06, "loss": 1.1936, "step": 91500 }, { "epoch": 2.29, "grad_norm": 8.0859375, "learning_rate": 2.5326633165829143e-06, "loss": 1.1737, "step": 91600 }, { "epoch": 2.29, "grad_norm": 8.1796875, "learning_rate": 2.5025125628140705e-06, "loss": 1.1918, "step": 91700 }, { "epoch": 2.29, "grad_norm": 7.984375, "learning_rate": 2.472361809045226e-06, "loss": 1.1782, "step": 91800 }, { "epoch": 2.3, "grad_norm": 10.5078125, "learning_rate": 2.442211055276382e-06, "loss": 1.2198, "step": 91900 }, { "epoch": 2.3, "grad_norm": 8.1015625, "learning_rate": 2.412060301507538e-06, "loss": 1.1851, "step": 92000 }, { "epoch": 2.3, "grad_norm": 8.953125, "learning_rate": 2.3819095477386936e-06, "loss": 1.1886, "step": 92100 }, { "epoch": 2.31, "grad_norm": 7.05859375, "learning_rate": 2.3517587939698493e-06, "loss": 1.2223, "step": 92200 }, { "epoch": 2.31, "grad_norm": 5.34765625, "learning_rate": 2.321608040201005e-06, "loss": 1.1676, "step": 92300 }, { "epoch": 2.31, "grad_norm": 13.9921875, "learning_rate": 2.291457286432161e-06, "loss": 1.1743, "step": 92400 }, { "epoch": 2.31, "grad_norm": 15.0703125, "learning_rate": 2.2613065326633164e-06, "loss": 1.1538, "step": 92500 }, { "epoch": 2.31, "eval_loss": 1.1857857704162598, "eval_runtime": 58.4052, "eval_samples_per_second": 17.122, "eval_steps_per_second": 4.28, "step": 92500 }, { "epoch": 2.31, "grad_norm": 8.4140625, "learning_rate": 2.231155778894472e-06, "loss": 1.0991, "step": 92600 }, { "epoch": 2.32, "grad_norm": 6.92578125, "learning_rate": 2.2010050251256282e-06, "loss": 1.156, "step": 92700 }, { "epoch": 2.32, "grad_norm": 7.78515625, "learning_rate": 2.170854271356784e-06, "loss": 1.1821, "step": 92800 }, { "epoch": 2.32, "grad_norm": 6.5078125, "learning_rate": 2.1407035175879396e-06, "loss": 1.2011, "step": 92900 }, { "epoch": 2.33, "grad_norm": 8.96875, "learning_rate": 2.1105527638190957e-06, "loss": 1.1835, "step": 93000 }, { "epoch": 2.33, "grad_norm": 8.3828125, "learning_rate": 2.0804020100502514e-06, "loss": 1.1827, "step": 93100 }, { "epoch": 2.33, "grad_norm": 9.6328125, "learning_rate": 2.050251256281407e-06, "loss": 1.1546, "step": 93200 }, { "epoch": 2.33, "grad_norm": 9.4765625, "learning_rate": 2.0201005025125628e-06, "loss": 1.1461, "step": 93300 }, { "epoch": 2.33, "grad_norm": 7.52734375, "learning_rate": 1.989949748743719e-06, "loss": 1.1271, "step": 93400 }, { "epoch": 2.34, "grad_norm": 9.96875, "learning_rate": 1.9597989949748746e-06, "loss": 1.1457, "step": 93500 }, { "epoch": 2.34, "grad_norm": 8.34375, "learning_rate": 1.9296482412060303e-06, "loss": 1.2159, "step": 93600 }, { "epoch": 2.34, "grad_norm": 7.27734375, "learning_rate": 1.8994974874371858e-06, "loss": 1.1968, "step": 93700 }, { "epoch": 2.34, "grad_norm": 4.19140625, "learning_rate": 1.8693467336683419e-06, "loss": 1.0567, "step": 93800 }, { "epoch": 2.35, "grad_norm": 4.45703125, "learning_rate": 1.8391959798994976e-06, "loss": 1.1876, "step": 93900 }, { "epoch": 2.35, "grad_norm": 10.453125, "learning_rate": 1.8090452261306533e-06, "loss": 1.1298, "step": 94000 }, { "epoch": 2.35, "grad_norm": 8.3125, "learning_rate": 1.778894472361809e-06, "loss": 1.1558, "step": 94100 }, { "epoch": 2.35, "grad_norm": 5.66796875, "learning_rate": 1.7487437185929648e-06, "loss": 1.2117, "step": 94200 }, { "epoch": 2.36, "grad_norm": 9.484375, "learning_rate": 1.7185929648241207e-06, "loss": 1.1362, "step": 94300 }, { "epoch": 2.36, "grad_norm": 7.44140625, "learning_rate": 1.6884422110552764e-06, "loss": 1.2152, "step": 94400 }, { "epoch": 2.36, "grad_norm": 19.65625, "learning_rate": 1.6582914572864323e-06, "loss": 1.1914, "step": 94500 }, { "epoch": 2.37, "grad_norm": 6.1015625, "learning_rate": 1.6281407035175878e-06, "loss": 1.1662, "step": 94600 }, { "epoch": 2.37, "grad_norm": 6.44921875, "learning_rate": 1.5979899497487437e-06, "loss": 1.1681, "step": 94700 }, { "epoch": 2.37, "grad_norm": 8.515625, "learning_rate": 1.5678391959798996e-06, "loss": 1.1554, "step": 94800 }, { "epoch": 2.37, "grad_norm": 10.2734375, "learning_rate": 1.5376884422110553e-06, "loss": 1.1776, "step": 94900 }, { "epoch": 2.38, "grad_norm": 10.0703125, "learning_rate": 1.5075376884422112e-06, "loss": 1.1715, "step": 95000 }, { "epoch": 2.38, "eval_loss": 1.2057503461837769, "eval_runtime": 58.3696, "eval_samples_per_second": 17.132, "eval_steps_per_second": 4.283, "step": 95000 }, { "epoch": 2.38, "grad_norm": 11.359375, "learning_rate": 1.4773869346733667e-06, "loss": 1.1669, "step": 95100 }, { "epoch": 2.38, "grad_norm": 7.42578125, "learning_rate": 1.4472361809045226e-06, "loss": 1.1658, "step": 95200 }, { "epoch": 2.38, "grad_norm": 4.3203125, "learning_rate": 1.4170854271356785e-06, "loss": 1.1943, "step": 95300 }, { "epoch": 2.38, "grad_norm": 8.9765625, "learning_rate": 1.3869346733668342e-06, "loss": 1.1445, "step": 95400 }, { "epoch": 2.39, "grad_norm": 5.4765625, "learning_rate": 1.35678391959799e-06, "loss": 1.2335, "step": 95500 }, { "epoch": 2.39, "grad_norm": 6.5859375, "learning_rate": 1.3266331658291458e-06, "loss": 1.2079, "step": 95600 }, { "epoch": 2.39, "grad_norm": 7.40625, "learning_rate": 1.2964824120603015e-06, "loss": 1.1193, "step": 95700 }, { "epoch": 2.4, "grad_norm": 8.328125, "learning_rate": 1.2663316582914572e-06, "loss": 1.2107, "step": 95800 }, { "epoch": 2.4, "grad_norm": 6.96875, "learning_rate": 1.236180904522613e-06, "loss": 1.17, "step": 95900 }, { "epoch": 2.4, "grad_norm": 8.6328125, "learning_rate": 1.206030150753769e-06, "loss": 1.1605, "step": 96000 }, { "epoch": 2.4, "grad_norm": 5.61328125, "learning_rate": 1.1758793969849247e-06, "loss": 1.274, "step": 96100 }, { "epoch": 2.41, "grad_norm": 6.5, "learning_rate": 1.1457286432160806e-06, "loss": 1.195, "step": 96200 }, { "epoch": 2.41, "grad_norm": 4.8125, "learning_rate": 1.115577889447236e-06, "loss": 1.1551, "step": 96300 }, { "epoch": 2.41, "grad_norm": 5.99609375, "learning_rate": 1.085427135678392e-06, "loss": 1.16, "step": 96400 }, { "epoch": 2.41, "grad_norm": 6.90625, "learning_rate": 1.0552763819095479e-06, "loss": 1.1629, "step": 96500 }, { "epoch": 2.42, "grad_norm": 4.7109375, "learning_rate": 1.0251256281407035e-06, "loss": 1.1291, "step": 96600 }, { "epoch": 2.42, "grad_norm": 15.6796875, "learning_rate": 9.949748743718594e-07, "loss": 1.1471, "step": 96700 }, { "epoch": 2.42, "grad_norm": 6.796875, "learning_rate": 9.648241206030151e-07, "loss": 1.163, "step": 96800 }, { "epoch": 2.42, "grad_norm": 7.49609375, "learning_rate": 9.346733668341709e-07, "loss": 1.1476, "step": 96900 }, { "epoch": 2.42, "grad_norm": 6.5546875, "learning_rate": 9.045226130653266e-07, "loss": 1.233, "step": 97000 }, { "epoch": 2.43, "grad_norm": 5.06640625, "learning_rate": 8.743718592964824e-07, "loss": 1.1224, "step": 97100 }, { "epoch": 2.43, "grad_norm": 8.3828125, "learning_rate": 8.442211055276382e-07, "loss": 1.1806, "step": 97200 }, { "epoch": 2.43, "grad_norm": 8.953125, "learning_rate": 8.140703517587939e-07, "loss": 1.1643, "step": 97300 }, { "epoch": 2.44, "grad_norm": 7.90234375, "learning_rate": 7.839195979899498e-07, "loss": 1.1284, "step": 97400 }, { "epoch": 2.44, "grad_norm": 6.64453125, "learning_rate": 7.537688442211056e-07, "loss": 1.1856, "step": 97500 }, { "epoch": 2.44, "eval_loss": 1.1979784965515137, "eval_runtime": 58.3179, "eval_samples_per_second": 17.147, "eval_steps_per_second": 4.287, "step": 97500 }, { "epoch": 2.44, "grad_norm": 10.0625, "learning_rate": 7.236180904522613e-07, "loss": 1.1275, "step": 97600 }, { "epoch": 2.44, "grad_norm": 8.5703125, "learning_rate": 6.934673366834171e-07, "loss": 1.1365, "step": 97700 }, { "epoch": 2.44, "grad_norm": 14.328125, "learning_rate": 6.633165829145729e-07, "loss": 1.1769, "step": 97800 }, { "epoch": 2.45, "grad_norm": 6.0625, "learning_rate": 6.331658291457286e-07, "loss": 1.1797, "step": 97900 }, { "epoch": 2.45, "grad_norm": 4.9140625, "learning_rate": 6.030150753768845e-07, "loss": 1.1849, "step": 98000 }, { "epoch": 2.45, "grad_norm": 5.08984375, "learning_rate": 5.728643216080403e-07, "loss": 1.1552, "step": 98100 }, { "epoch": 2.46, "grad_norm": 6.0, "learning_rate": 5.42713567839196e-07, "loss": 1.2071, "step": 98200 }, { "epoch": 2.46, "grad_norm": 7.3125, "learning_rate": 5.125628140703518e-07, "loss": 1.1287, "step": 98300 }, { "epoch": 2.46, "grad_norm": 9.8671875, "learning_rate": 4.824120603015076e-07, "loss": 1.1739, "step": 98400 }, { "epoch": 2.46, "grad_norm": 7.5390625, "learning_rate": 4.522613065326633e-07, "loss": 1.1723, "step": 98500 }, { "epoch": 2.46, "grad_norm": 13.0, "learning_rate": 4.221105527638191e-07, "loss": 1.187, "step": 98600 }, { "epoch": 2.47, "grad_norm": 9.7890625, "learning_rate": 3.919597989949749e-07, "loss": 1.1765, "step": 98700 }, { "epoch": 2.47, "grad_norm": 4.5625, "learning_rate": 3.6180904522613065e-07, "loss": 1.2047, "step": 98800 }, { "epoch": 2.47, "grad_norm": 11.3984375, "learning_rate": 3.3165829145728645e-07, "loss": 1.2022, "step": 98900 }, { "epoch": 2.48, "grad_norm": 6.21875, "learning_rate": 3.0150753768844224e-07, "loss": 1.1459, "step": 99000 }, { "epoch": 2.48, "grad_norm": 7.47265625, "learning_rate": 2.71356783919598e-07, "loss": 1.1626, "step": 99100 }, { "epoch": 2.48, "grad_norm": 3.32421875, "learning_rate": 2.412060301507538e-07, "loss": 1.1997, "step": 99200 }, { "epoch": 2.48, "grad_norm": 4.859375, "learning_rate": 2.1105527638190956e-07, "loss": 1.2174, "step": 99300 }, { "epoch": 2.48, "grad_norm": 7.328125, "learning_rate": 1.8090452261306533e-07, "loss": 1.2133, "step": 99400 }, { "epoch": 2.49, "grad_norm": 11.375, "learning_rate": 1.5075376884422112e-07, "loss": 1.1436, "step": 99500 }, { "epoch": 2.49, "grad_norm": 6.7578125, "learning_rate": 1.206030150753769e-07, "loss": 1.149, "step": 99600 }, { "epoch": 2.49, "grad_norm": 11.4765625, "learning_rate": 9.045226130653266e-08, "loss": 1.1612, "step": 99700 }, { "epoch": 2.5, "grad_norm": 4.63671875, "learning_rate": 6.030150753768845e-08, "loss": 1.1397, "step": 99800 }, { "epoch": 2.5, "grad_norm": 11.5859375, "learning_rate": 3.015075376884422e-08, "loss": 1.1792, "step": 99900 }, { "epoch": 2.5, "grad_norm": 5.5703125, "learning_rate": 0.0, "loss": 1.1628, "step": 100000 }, { "epoch": 2.5, "eval_loss": 1.200670599937439, "eval_runtime": 58.2769, "eval_samples_per_second": 17.159, "eval_steps_per_second": 4.29, "step": 100000 } ], "logging_steps": 100, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 2500, "total_flos": 1.5733698330624e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }