{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 1, "global_step": 90, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1111111111111111, "grad_norm": 25.875, "learning_rate": 2.5e-05, "loss": 1.1308, "step": 1 }, { "epoch": 0.1111111111111111, "eval_accuracy": 0.5277777777777778, "eval_loss": 0.8154025673866272, "eval_runtime": 0.6646, "eval_samples_per_second": 108.328, "eval_steps_per_second": 7.523, "step": 1 }, { "epoch": 0.2222222222222222, "grad_norm": 31.75, "learning_rate": 5e-05, "loss": 0.9888, "step": 2 }, { "epoch": 0.2222222222222222, "eval_accuracy": 0.5416666666666666, "eval_loss": 0.8097670078277588, "eval_runtime": 0.7137, "eval_samples_per_second": 100.883, "eval_steps_per_second": 7.006, "step": 2 }, { "epoch": 0.3333333333333333, "grad_norm": 15.6875, "learning_rate": 4.943181818181818e-05, "loss": 0.8615, "step": 3 }, { "epoch": 0.3333333333333333, "eval_accuracy": 0.5833333333333334, "eval_loss": 0.8370497226715088, "eval_runtime": 0.6614, "eval_samples_per_second": 108.857, "eval_steps_per_second": 7.56, "step": 3 }, { "epoch": 0.4444444444444444, "grad_norm": 13.9375, "learning_rate": 4.886363636363637e-05, "loss": 0.901, "step": 4 }, { "epoch": 0.4444444444444444, "eval_accuracy": 0.5833333333333334, "eval_loss": 0.8638712763786316, "eval_runtime": 0.7123, "eval_samples_per_second": 101.088, "eval_steps_per_second": 7.02, "step": 4 }, { "epoch": 0.5555555555555556, "grad_norm": 13.375, "learning_rate": 4.829545454545455e-05, "loss": 1.1248, "step": 5 }, { "epoch": 0.5555555555555556, "eval_accuracy": 0.5972222222222222, "eval_loss": 0.8621860146522522, "eval_runtime": 0.7128, "eval_samples_per_second": 101.003, "eval_steps_per_second": 7.014, "step": 5 }, { "epoch": 0.6666666666666666, "grad_norm": 28.875, "learning_rate": 4.772727272727273e-05, "loss": 0.9482, "step": 6 }, { "epoch": 0.6666666666666666, "eval_accuracy": 0.5416666666666666, "eval_loss": 0.8247477412223816, "eval_runtime": 0.6622, "eval_samples_per_second": 108.728, "eval_steps_per_second": 7.551, "step": 6 }, { "epoch": 0.7777777777777778, "grad_norm": 33.25, "learning_rate": 4.715909090909091e-05, "loss": 0.8705, "step": 7 }, { "epoch": 0.7777777777777778, "eval_accuracy": 0.5833333333333334, "eval_loss": 0.7855699062347412, "eval_runtime": 0.7138, "eval_samples_per_second": 100.863, "eval_steps_per_second": 7.004, "step": 7 }, { "epoch": 0.8888888888888888, "grad_norm": 20.75, "learning_rate": 4.659090909090909e-05, "loss": 0.9702, "step": 8 }, { "epoch": 0.8888888888888888, "eval_accuracy": 0.5694444444444444, "eval_loss": 0.7553032636642456, "eval_runtime": 0.6125, "eval_samples_per_second": 117.549, "eval_steps_per_second": 8.163, "step": 8 }, { "epoch": 1.0, "grad_norm": 22.25, "learning_rate": 4.602272727272727e-05, "loss": 0.7868, "step": 9 }, { "epoch": 1.0, "eval_accuracy": 0.5555555555555556, "eval_loss": 0.7463514804840088, "eval_runtime": 0.6616, "eval_samples_per_second": 108.828, "eval_steps_per_second": 7.557, "step": 9 }, { "epoch": 1.1111111111111112, "grad_norm": 6.4375, "learning_rate": 4.545454545454546e-05, "loss": 0.7814, "step": 10 }, { "epoch": 1.1111111111111112, "eval_accuracy": 0.5416666666666666, "eval_loss": 0.7470024824142456, "eval_runtime": 0.6612, "eval_samples_per_second": 108.9, "eval_steps_per_second": 7.562, "step": 10 }, { "epoch": 1.2222222222222223, "grad_norm": 19.875, "learning_rate": 4.488636363636364e-05, "loss": 0.8527, "step": 11 }, { "epoch": 1.2222222222222223, "eval_accuracy": 0.5277777777777778, "eval_loss": 0.7425130009651184, "eval_runtime": 0.6143, "eval_samples_per_second": 117.207, "eval_steps_per_second": 8.139, "step": 11 }, { "epoch": 1.3333333333333333, "grad_norm": 15.1875, "learning_rate": 4.431818181818182e-05, "loss": 0.7484, "step": 12 }, { "epoch": 1.3333333333333333, "eval_accuracy": 0.5555555555555556, "eval_loss": 0.7314046025276184, "eval_runtime": 0.6182, "eval_samples_per_second": 116.47, "eval_steps_per_second": 8.088, "step": 12 }, { "epoch": 1.4444444444444444, "grad_norm": 21.375, "learning_rate": 4.375e-05, "loss": 0.6436, "step": 13 }, { "epoch": 1.4444444444444444, "eval_accuracy": 0.5833333333333334, "eval_loss": 0.7158339023590088, "eval_runtime": 0.7149, "eval_samples_per_second": 100.712, "eval_steps_per_second": 6.994, "step": 13 }, { "epoch": 1.5555555555555556, "grad_norm": 21.875, "learning_rate": 4.318181818181819e-05, "loss": 0.7159, "step": 14 }, { "epoch": 1.5555555555555556, "eval_accuracy": 0.6111111111111112, "eval_loss": 0.7068142294883728, "eval_runtime": 0.7129, "eval_samples_per_second": 100.991, "eval_steps_per_second": 7.013, "step": 14 }, { "epoch": 1.6666666666666665, "grad_norm": 5.6875, "learning_rate": 4.261363636363637e-05, "loss": 0.6247, "step": 15 }, { "epoch": 1.6666666666666665, "eval_accuracy": 0.5972222222222222, "eval_loss": 0.7105034589767456, "eval_runtime": 0.6625, "eval_samples_per_second": 108.678, "eval_steps_per_second": 7.547, "step": 15 }, { "epoch": 1.7777777777777777, "grad_norm": 12.9375, "learning_rate": 4.204545454545455e-05, "loss": 0.6662, "step": 16 }, { "epoch": 1.7777777777777777, "eval_accuracy": 0.5972222222222222, "eval_loss": 0.7118598222732544, "eval_runtime": 0.5619, "eval_samples_per_second": 128.13, "eval_steps_per_second": 8.898, "step": 16 }, { "epoch": 1.8888888888888888, "grad_norm": 9.0, "learning_rate": 4.1477272727272734e-05, "loss": 0.4505, "step": 17 }, { "epoch": 1.8888888888888888, "eval_accuracy": 0.5833333333333334, "eval_loss": 0.7344834804534912, "eval_runtime": 0.566, "eval_samples_per_second": 127.198, "eval_steps_per_second": 8.833, "step": 17 }, { "epoch": 2.0, "grad_norm": 9.75, "learning_rate": 4.0909090909090915e-05, "loss": 0.5626, "step": 18 }, { "epoch": 2.0, "eval_accuracy": 0.5277777777777778, "eval_loss": 0.7482435703277588, "eval_runtime": 0.6143, "eval_samples_per_second": 117.216, "eval_steps_per_second": 8.14, "step": 18 }, { "epoch": 2.111111111111111, "grad_norm": 16.125, "learning_rate": 4.034090909090909e-05, "loss": 0.4823, "step": 19 }, { "epoch": 2.111111111111111, "eval_accuracy": 0.5277777777777778, "eval_loss": 0.7449544072151184, "eval_runtime": 0.7126, "eval_samples_per_second": 101.038, "eval_steps_per_second": 7.016, "step": 19 }, { "epoch": 2.2222222222222223, "grad_norm": 15.4375, "learning_rate": 3.9772727272727275e-05, "loss": 0.4237, "step": 20 }, { "epoch": 2.2222222222222223, "eval_accuracy": 0.5416666666666666, "eval_loss": 0.7253146767616272, "eval_runtime": 0.6618, "eval_samples_per_second": 108.786, "eval_steps_per_second": 7.555, "step": 20 }, { "epoch": 2.3333333333333335, "grad_norm": 22.625, "learning_rate": 3.9204545454545456e-05, "loss": 0.5236, "step": 21 }, { "epoch": 2.3333333333333335, "eval_accuracy": 0.5833333333333334, "eval_loss": 0.6920505166053772, "eval_runtime": 0.613, "eval_samples_per_second": 117.447, "eval_steps_per_second": 8.156, "step": 21 }, { "epoch": 2.4444444444444446, "grad_norm": 11.75, "learning_rate": 3.8636363636363636e-05, "loss": 0.4028, "step": 22 }, { "epoch": 2.4444444444444446, "eval_accuracy": 0.6666666666666666, "eval_loss": 0.6593967080116272, "eval_runtime": 0.6119, "eval_samples_per_second": 117.661, "eval_steps_per_second": 8.171, "step": 22 }, { "epoch": 2.5555555555555554, "grad_norm": 18.875, "learning_rate": 3.8068181818181816e-05, "loss": 0.3952, "step": 23 }, { "epoch": 2.5555555555555554, "eval_accuracy": 0.6944444444444444, "eval_loss": 0.6502448320388794, "eval_runtime": 0.6118, "eval_samples_per_second": 117.679, "eval_steps_per_second": 8.172, "step": 23 }, { "epoch": 2.6666666666666665, "grad_norm": 8.3125, "learning_rate": 3.7500000000000003e-05, "loss": 0.3701, "step": 24 }, { "epoch": 2.6666666666666665, "eval_accuracy": 0.6944444444444444, "eval_loss": 0.6597442626953125, "eval_runtime": 0.7121, "eval_samples_per_second": 101.112, "eval_steps_per_second": 7.022, "step": 24 }, { "epoch": 2.7777777777777777, "grad_norm": 25.0, "learning_rate": 3.6931818181818184e-05, "loss": 0.4536, "step": 25 }, { "epoch": 2.7777777777777777, "eval_accuracy": 0.6805555555555556, "eval_loss": 0.6615126132965088, "eval_runtime": 0.6608, "eval_samples_per_second": 108.959, "eval_steps_per_second": 7.567, "step": 25 }, { "epoch": 2.888888888888889, "grad_norm": 25.125, "learning_rate": 3.6363636363636364e-05, "loss": 0.4004, "step": 26 }, { "epoch": 2.888888888888889, "eval_accuracy": 0.6666666666666666, "eval_loss": 0.6555294394493103, "eval_runtime": 0.6106, "eval_samples_per_second": 117.911, "eval_steps_per_second": 8.188, "step": 26 }, { "epoch": 3.0, "grad_norm": 16.875, "learning_rate": 3.579545454545455e-05, "loss": 0.2805, "step": 27 }, { "epoch": 3.0, "eval_accuracy": 0.6388888888888888, "eval_loss": 0.650442361831665, "eval_runtime": 0.6114, "eval_samples_per_second": 117.754, "eval_steps_per_second": 8.177, "step": 27 }, { "epoch": 3.111111111111111, "grad_norm": 5.6875, "learning_rate": 3.522727272727273e-05, "loss": 0.3309, "step": 28 }, { "epoch": 3.111111111111111, "eval_accuracy": 0.6111111111111112, "eval_loss": 0.6491343379020691, "eval_runtime": 0.7116, "eval_samples_per_second": 101.178, "eval_steps_per_second": 7.026, "step": 28 }, { "epoch": 3.2222222222222223, "grad_norm": 20.75, "learning_rate": 3.465909090909091e-05, "loss": 0.4176, "step": 29 }, { "epoch": 3.2222222222222223, "eval_accuracy": 0.6388888888888888, "eval_loss": 0.6927388310432434, "eval_runtime": 0.7116, "eval_samples_per_second": 101.173, "eval_steps_per_second": 7.026, "step": 29 }, { "epoch": 3.3333333333333335, "grad_norm": 11.375, "learning_rate": 3.409090909090909e-05, "loss": 0.1967, "step": 30 }, { "epoch": 3.3333333333333335, "eval_accuracy": 0.6666666666666666, "eval_loss": 0.7233132123947144, "eval_runtime": 0.6182, "eval_samples_per_second": 116.474, "eval_steps_per_second": 8.088, "step": 30 }, { "epoch": 3.4444444444444446, "grad_norm": 24.875, "learning_rate": 3.352272727272727e-05, "loss": 0.2568, "step": 31 }, { "epoch": 3.4444444444444446, "eval_accuracy": 0.6805555555555556, "eval_loss": 0.7308578491210938, "eval_runtime": 0.6112, "eval_samples_per_second": 117.804, "eval_steps_per_second": 8.181, "step": 31 }, { "epoch": 3.5555555555555554, "grad_norm": 11.25, "learning_rate": 3.295454545454545e-05, "loss": 0.2841, "step": 32 }, { "epoch": 3.5555555555555554, "eval_accuracy": 0.6666666666666666, "eval_loss": 0.7040252685546875, "eval_runtime": 0.6107, "eval_samples_per_second": 117.902, "eval_steps_per_second": 8.188, "step": 32 }, { "epoch": 3.6666666666666665, "grad_norm": 14.875, "learning_rate": 3.238636363636364e-05, "loss": 0.2797, "step": 33 }, { "epoch": 3.6666666666666665, "eval_accuracy": 0.625, "eval_loss": 0.6633122563362122, "eval_runtime": 0.6111, "eval_samples_per_second": 117.821, "eval_steps_per_second": 8.182, "step": 33 }, { "epoch": 3.7777777777777777, "grad_norm": 20.375, "learning_rate": 3.181818181818182e-05, "loss": 0.2031, "step": 34 }, { "epoch": 3.7777777777777777, "eval_accuracy": 0.6666666666666666, "eval_loss": 0.6236428022384644, "eval_runtime": 0.6112, "eval_samples_per_second": 117.809, "eval_steps_per_second": 8.181, "step": 34 }, { "epoch": 3.888888888888889, "grad_norm": 2.890625, "learning_rate": 3.125e-05, "loss": 0.1318, "step": 35 }, { "epoch": 3.888888888888889, "eval_accuracy": 0.6944444444444444, "eval_loss": 0.6151860356330872, "eval_runtime": 0.6134, "eval_samples_per_second": 117.386, "eval_steps_per_second": 8.152, "step": 35 }, { "epoch": 4.0, "grad_norm": 5.25, "learning_rate": 3.068181818181818e-05, "loss": 0.2346, "step": 36 }, { "epoch": 4.0, "eval_accuracy": 0.6944444444444444, "eval_loss": 0.6115561723709106, "eval_runtime": 0.7128, "eval_samples_per_second": 101.01, "eval_steps_per_second": 7.015, "step": 36 }, { "epoch": 4.111111111111111, "grad_norm": 13.125, "learning_rate": 3.0113636363636365e-05, "loss": 0.1741, "step": 37 }, { "epoch": 4.111111111111111, "eval_accuracy": 0.7083333333333334, "eval_loss": 0.6088710427284241, "eval_runtime": 0.662, "eval_samples_per_second": 108.762, "eval_steps_per_second": 7.553, "step": 37 }, { "epoch": 4.222222222222222, "grad_norm": 5.84375, "learning_rate": 2.954545454545455e-05, "loss": 0.186, "step": 38 }, { "epoch": 4.222222222222222, "eval_accuracy": 0.7222222222222222, "eval_loss": 0.6091925501823425, "eval_runtime": 0.7117, "eval_samples_per_second": 101.172, "eval_steps_per_second": 7.026, "step": 38 }, { "epoch": 4.333333333333333, "grad_norm": 3.46875, "learning_rate": 2.8977272727272732e-05, "loss": 0.1125, "step": 39 }, { "epoch": 4.333333333333333, "eval_accuracy": 0.7222222222222222, "eval_loss": 0.6227471828460693, "eval_runtime": 0.7115, "eval_samples_per_second": 101.191, "eval_steps_per_second": 7.027, "step": 39 }, { "epoch": 4.444444444444445, "grad_norm": 9.0625, "learning_rate": 2.8409090909090912e-05, "loss": 0.1299, "step": 40 }, { "epoch": 4.444444444444445, "eval_accuracy": 0.6805555555555556, "eval_loss": 0.634276270866394, "eval_runtime": 0.6124, "eval_samples_per_second": 117.561, "eval_steps_per_second": 8.164, "step": 40 }, { "epoch": 4.555555555555555, "grad_norm": 7.78125, "learning_rate": 2.784090909090909e-05, "loss": 0.1387, "step": 41 }, { "epoch": 4.555555555555555, "eval_accuracy": 0.6666666666666666, "eval_loss": 0.6676434278488159, "eval_runtime": 0.7113, "eval_samples_per_second": 101.226, "eval_steps_per_second": 7.03, "step": 41 }, { "epoch": 4.666666666666667, "grad_norm": 5.90625, "learning_rate": 2.7272727272727273e-05, "loss": 0.1129, "step": 42 }, { "epoch": 4.666666666666667, "eval_accuracy": 0.6805555555555556, "eval_loss": 0.7227713465690613, "eval_runtime": 0.7115, "eval_samples_per_second": 101.191, "eval_steps_per_second": 7.027, "step": 42 }, { "epoch": 4.777777777777778, "grad_norm": 7.125, "learning_rate": 2.6704545454545453e-05, "loss": 0.0691, "step": 43 }, { "epoch": 4.777777777777778, "eval_accuracy": 0.6944444444444444, "eval_loss": 0.7673901319503784, "eval_runtime": 0.6613, "eval_samples_per_second": 108.871, "eval_steps_per_second": 7.561, "step": 43 }, { "epoch": 4.888888888888889, "grad_norm": 4.03125, "learning_rate": 2.6136363636363637e-05, "loss": 0.0547, "step": 44 }, { "epoch": 4.888888888888889, "eval_accuracy": 0.6944444444444444, "eval_loss": 0.7912532687187195, "eval_runtime": 0.6118, "eval_samples_per_second": 117.677, "eval_steps_per_second": 8.172, "step": 44 }, { "epoch": 5.0, "grad_norm": 13.75, "learning_rate": 2.5568181818181817e-05, "loss": 0.0985, "step": 45 }, { "epoch": 5.0, "eval_accuracy": 0.6944444444444444, "eval_loss": 0.7817304134368896, "eval_runtime": 0.6108, "eval_samples_per_second": 117.879, "eval_steps_per_second": 8.186, "step": 45 }, { "epoch": 5.111111111111111, "grad_norm": 3.53125, "learning_rate": 2.5e-05, "loss": 0.0375, "step": 46 }, { "epoch": 5.111111111111111, "eval_accuracy": 0.7083333333333334, "eval_loss": 0.7576977014541626, "eval_runtime": 0.6112, "eval_samples_per_second": 117.794, "eval_steps_per_second": 8.18, "step": 46 }, { "epoch": 5.222222222222222, "grad_norm": 2.046875, "learning_rate": 2.4431818181818185e-05, "loss": 0.0279, "step": 47 }, { "epoch": 5.222222222222222, "eval_accuracy": 0.7083333333333334, "eval_loss": 0.7299577593803406, "eval_runtime": 0.6111, "eval_samples_per_second": 117.826, "eval_steps_per_second": 8.182, "step": 47 }, { "epoch": 5.333333333333333, "grad_norm": 1.640625, "learning_rate": 2.3863636363636365e-05, "loss": 0.0396, "step": 48 }, { "epoch": 5.333333333333333, "eval_accuracy": 0.75, "eval_loss": 0.7089157104492188, "eval_runtime": 0.7116, "eval_samples_per_second": 101.18, "eval_steps_per_second": 7.026, "step": 48 }, { "epoch": 5.444444444444445, "grad_norm": 3.640625, "learning_rate": 2.3295454545454546e-05, "loss": 0.049, "step": 49 }, { "epoch": 5.444444444444445, "eval_accuracy": 0.75, "eval_loss": 0.7139127254486084, "eval_runtime": 0.6114, "eval_samples_per_second": 117.764, "eval_steps_per_second": 8.178, "step": 49 }, { "epoch": 5.555555555555555, "grad_norm": 2.671875, "learning_rate": 2.272727272727273e-05, "loss": 0.0306, "step": 50 }, { "epoch": 5.555555555555555, "eval_accuracy": 0.75, "eval_loss": 0.7333288192749023, "eval_runtime": 0.6125, "eval_samples_per_second": 117.554, "eval_steps_per_second": 8.163, "step": 50 }, { "epoch": 5.666666666666667, "grad_norm": 0.76171875, "learning_rate": 2.215909090909091e-05, "loss": 0.0138, "step": 51 }, { "epoch": 5.666666666666667, "eval_accuracy": 0.7777777777777778, "eval_loss": 0.7559387683868408, "eval_runtime": 0.611, "eval_samples_per_second": 117.848, "eval_steps_per_second": 8.184, "step": 51 }, { "epoch": 5.777777777777778, "grad_norm": 0.50390625, "learning_rate": 2.1590909090909093e-05, "loss": 0.0076, "step": 52 }, { "epoch": 5.777777777777778, "eval_accuracy": 0.7777777777777778, "eval_loss": 0.7911980152130127, "eval_runtime": 0.6618, "eval_samples_per_second": 108.8, "eval_steps_per_second": 7.556, "step": 52 }, { "epoch": 5.888888888888889, "grad_norm": 1.421875, "learning_rate": 2.1022727272727274e-05, "loss": 0.02, "step": 53 }, { "epoch": 5.888888888888889, "eval_accuracy": 0.7777777777777778, "eval_loss": 0.810262143611908, "eval_runtime": 0.6623, "eval_samples_per_second": 108.718, "eval_steps_per_second": 7.55, "step": 53 }, { "epoch": 6.0, "grad_norm": 5.03125, "learning_rate": 2.0454545454545457e-05, "loss": 0.0509, "step": 54 }, { "epoch": 6.0, "eval_accuracy": 0.7777777777777778, "eval_loss": 0.8537428975105286, "eval_runtime": 0.712, "eval_samples_per_second": 101.122, "eval_steps_per_second": 7.022, "step": 54 }, { "epoch": 6.111111111111111, "grad_norm": 0.3203125, "learning_rate": 1.9886363636363638e-05, "loss": 0.0033, "step": 55 }, { "epoch": 6.111111111111111, "eval_accuracy": 0.7777777777777778, "eval_loss": 0.9030678868293762, "eval_runtime": 0.6622, "eval_samples_per_second": 108.728, "eval_steps_per_second": 7.551, "step": 55 }, { "epoch": 6.222222222222222, "grad_norm": 0.21484375, "learning_rate": 1.9318181818181818e-05, "loss": 0.0033, "step": 56 }, { "epoch": 6.222222222222222, "eval_accuracy": 0.75, "eval_loss": 0.9585488438606262, "eval_runtime": 0.611, "eval_samples_per_second": 117.842, "eval_steps_per_second": 8.183, "step": 56 }, { "epoch": 6.333333333333333, "grad_norm": 0.9296875, "learning_rate": 1.8750000000000002e-05, "loss": 0.0057, "step": 57 }, { "epoch": 6.333333333333333, "eval_accuracy": 0.7638888888888888, "eval_loss": 0.9753527045249939, "eval_runtime": 0.5617, "eval_samples_per_second": 128.182, "eval_steps_per_second": 8.902, "step": 57 }, { "epoch": 6.444444444444445, "grad_norm": 1.171875, "learning_rate": 1.8181818181818182e-05, "loss": 0.0091, "step": 58 }, { "epoch": 6.444444444444445, "eval_accuracy": 0.7638888888888888, "eval_loss": 1.0243321657180786, "eval_runtime": 0.6134, "eval_samples_per_second": 117.377, "eval_steps_per_second": 8.151, "step": 58 }, { "epoch": 6.555555555555555, "grad_norm": 0.25390625, "learning_rate": 1.7613636363636366e-05, "loss": 0.0024, "step": 59 }, { "epoch": 6.555555555555555, "eval_accuracy": 0.75, "eval_loss": 1.0561052560806274, "eval_runtime": 0.7119, "eval_samples_per_second": 101.131, "eval_steps_per_second": 7.023, "step": 59 }, { "epoch": 6.666666666666667, "grad_norm": 0.2080078125, "learning_rate": 1.7045454545454546e-05, "loss": 0.0011, "step": 60 }, { "epoch": 6.666666666666667, "eval_accuracy": 0.75, "eval_loss": 1.0907957553863525, "eval_runtime": 0.7135, "eval_samples_per_second": 100.915, "eval_steps_per_second": 7.008, "step": 60 }, { "epoch": 6.777777777777778, "grad_norm": 0.2001953125, "learning_rate": 1.6477272727272726e-05, "loss": 0.0017, "step": 61 }, { "epoch": 6.777777777777778, "eval_accuracy": 0.75, "eval_loss": 1.1045655012130737, "eval_runtime": 0.6619, "eval_samples_per_second": 108.781, "eval_steps_per_second": 7.554, "step": 61 }, { "epoch": 6.888888888888889, "grad_norm": 0.265625, "learning_rate": 1.590909090909091e-05, "loss": 0.0017, "step": 62 }, { "epoch": 6.888888888888889, "eval_accuracy": 0.7638888888888888, "eval_loss": 1.1206629276275635, "eval_runtime": 0.6117, "eval_samples_per_second": 117.708, "eval_steps_per_second": 8.174, "step": 62 }, { "epoch": 7.0, "grad_norm": 0.035400390625, "learning_rate": 1.534090909090909e-05, "loss": 0.0006, "step": 63 }, { "epoch": 7.0, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.1336255073547363, "eval_runtime": 0.6634, "eval_samples_per_second": 108.527, "eval_steps_per_second": 7.537, "step": 63 }, { "epoch": 7.111111111111111, "grad_norm": 0.040283203125, "learning_rate": 1.4772727272727274e-05, "loss": 0.0004, "step": 64 }, { "epoch": 7.111111111111111, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.1413822174072266, "eval_runtime": 0.6642, "eval_samples_per_second": 108.409, "eval_steps_per_second": 7.528, "step": 64 }, { "epoch": 7.222222222222222, "grad_norm": 0.076171875, "learning_rate": 1.4204545454545456e-05, "loss": 0.0005, "step": 65 }, { "epoch": 7.222222222222222, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.143465280532837, "eval_runtime": 0.5614, "eval_samples_per_second": 128.248, "eval_steps_per_second": 8.906, "step": 65 }, { "epoch": 7.333333333333333, "grad_norm": 0.06201171875, "learning_rate": 1.3636363636363637e-05, "loss": 0.0005, "step": 66 }, { "epoch": 7.333333333333333, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.155677318572998, "eval_runtime": 0.6131, "eval_samples_per_second": 117.44, "eval_steps_per_second": 8.156, "step": 66 }, { "epoch": 7.444444444444445, "grad_norm": 0.0888671875, "learning_rate": 1.3068181818181819e-05, "loss": 0.0008, "step": 67 }, { "epoch": 7.444444444444445, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.1653226613998413, "eval_runtime": 0.6121, "eval_samples_per_second": 117.637, "eval_steps_per_second": 8.169, "step": 67 }, { "epoch": 7.555555555555555, "grad_norm": 0.515625, "learning_rate": 1.25e-05, "loss": 0.0021, "step": 68 }, { "epoch": 7.555555555555555, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.1580508947372437, "eval_runtime": 0.7136, "eval_samples_per_second": 100.903, "eval_steps_per_second": 7.007, "step": 68 }, { "epoch": 7.666666666666667, "grad_norm": 0.0206298828125, "learning_rate": 1.1931818181818183e-05, "loss": 0.0002, "step": 69 }, { "epoch": 7.666666666666667, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.1536939144134521, "eval_runtime": 0.6119, "eval_samples_per_second": 117.657, "eval_steps_per_second": 8.171, "step": 69 }, { "epoch": 7.777777777777778, "grad_norm": 0.03466796875, "learning_rate": 1.1363636363636365e-05, "loss": 0.0002, "step": 70 }, { "epoch": 7.777777777777778, "eval_accuracy": 0.7916666666666666, "eval_loss": 1.1577788591384888, "eval_runtime": 0.6119, "eval_samples_per_second": 117.668, "eval_steps_per_second": 8.171, "step": 70 }, { "epoch": 7.888888888888889, "grad_norm": 0.08349609375, "learning_rate": 1.0795454545454547e-05, "loss": 0.0006, "step": 71 }, { "epoch": 7.888888888888889, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.1397895812988281, "eval_runtime": 0.7129, "eval_samples_per_second": 100.999, "eval_steps_per_second": 7.014, "step": 71 }, { "epoch": 8.0, "grad_norm": 0.04443359375, "learning_rate": 1.0227272727272729e-05, "loss": 0.0004, "step": 72 }, { "epoch": 8.0, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.1536896228790283, "eval_runtime": 0.7131, "eval_samples_per_second": 100.962, "eval_steps_per_second": 7.011, "step": 72 }, { "epoch": 8.11111111111111, "grad_norm": 0.02392578125, "learning_rate": 9.659090909090909e-06, "loss": 0.0002, "step": 73 }, { "epoch": 8.11111111111111, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.1682804822921753, "eval_runtime": 0.6627, "eval_samples_per_second": 108.648, "eval_steps_per_second": 7.545, "step": 73 }, { "epoch": 8.222222222222221, "grad_norm": 0.036865234375, "learning_rate": 9.090909090909091e-06, "loss": 0.0004, "step": 74 }, { "epoch": 8.222222222222221, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.1590551137924194, "eval_runtime": 0.6635, "eval_samples_per_second": 108.51, "eval_steps_per_second": 7.535, "step": 74 }, { "epoch": 8.333333333333334, "grad_norm": 0.0213623046875, "learning_rate": 8.522727272727273e-06, "loss": 0.0003, "step": 75 }, { "epoch": 8.333333333333334, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.1632962226867676, "eval_runtime": 0.6116, "eval_samples_per_second": 117.718, "eval_steps_per_second": 8.175, "step": 75 }, { "epoch": 8.444444444444445, "grad_norm": 0.08740234375, "learning_rate": 7.954545454545455e-06, "loss": 0.0004, "step": 76 }, { "epoch": 8.444444444444445, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.1668634414672852, "eval_runtime": 0.611, "eval_samples_per_second": 117.842, "eval_steps_per_second": 8.183, "step": 76 }, { "epoch": 8.555555555555555, "grad_norm": 0.011474609375, "learning_rate": 7.386363636363637e-06, "loss": 0.0002, "step": 77 }, { "epoch": 8.555555555555555, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.1664338111877441, "eval_runtime": 0.5614, "eval_samples_per_second": 128.259, "eval_steps_per_second": 8.907, "step": 77 }, { "epoch": 8.666666666666666, "grad_norm": 0.0308837890625, "learning_rate": 6.818181818181818e-06, "loss": 0.0002, "step": 78 }, { "epoch": 8.666666666666666, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.167876958847046, "eval_runtime": 0.6112, "eval_samples_per_second": 117.795, "eval_steps_per_second": 8.18, "step": 78 }, { "epoch": 8.777777777777779, "grad_norm": 0.033935546875, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 79 }, { "epoch": 8.777777777777779, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.153360366821289, "eval_runtime": 0.712, "eval_samples_per_second": 101.127, "eval_steps_per_second": 7.023, "step": 79 }, { "epoch": 8.88888888888889, "grad_norm": 0.048828125, "learning_rate": 5.681818181818182e-06, "loss": 0.0003, "step": 80 }, { "epoch": 8.88888888888889, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.1648998260498047, "eval_runtime": 0.6123, "eval_samples_per_second": 117.581, "eval_steps_per_second": 8.165, "step": 80 }, { "epoch": 9.0, "grad_norm": 0.0133056640625, "learning_rate": 5.113636363636364e-06, "loss": 0.0001, "step": 81 }, { "epoch": 9.0, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.1657260656356812, "eval_runtime": 0.5613, "eval_samples_per_second": 128.281, "eval_steps_per_second": 8.908, "step": 81 }, { "epoch": 9.11111111111111, "grad_norm": 0.023193359375, "learning_rate": 4.5454545454545455e-06, "loss": 0.0001, "step": 82 }, { "epoch": 9.11111111111111, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.1588553190231323, "eval_runtime": 0.6632, "eval_samples_per_second": 108.563, "eval_steps_per_second": 7.539, "step": 82 }, { "epoch": 9.222222222222221, "grad_norm": 0.028076171875, "learning_rate": 3.9772727272727275e-06, "loss": 0.0003, "step": 83 }, { "epoch": 9.222222222222221, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.1638314723968506, "eval_runtime": 0.6615, "eval_samples_per_second": 108.842, "eval_steps_per_second": 7.558, "step": 83 }, { "epoch": 9.333333333333334, "grad_norm": 0.041259765625, "learning_rate": 3.409090909090909e-06, "loss": 0.0002, "step": 84 }, { "epoch": 9.333333333333334, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.1614993810653687, "eval_runtime": 0.563, "eval_samples_per_second": 127.875, "eval_steps_per_second": 8.88, "step": 84 }, { "epoch": 9.444444444444445, "grad_norm": 0.033447265625, "learning_rate": 2.840909090909091e-06, "loss": 0.0002, "step": 85 }, { "epoch": 9.444444444444445, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.1663289070129395, "eval_runtime": 0.6624, "eval_samples_per_second": 108.689, "eval_steps_per_second": 7.548, "step": 85 }, { "epoch": 9.555555555555555, "grad_norm": 0.060791015625, "learning_rate": 2.2727272727272728e-06, "loss": 0.0004, "step": 86 }, { "epoch": 9.555555555555555, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.1687304973602295, "eval_runtime": 0.6624, "eval_samples_per_second": 108.698, "eval_steps_per_second": 7.548, "step": 86 }, { "epoch": 9.666666666666666, "grad_norm": 0.01275634765625, "learning_rate": 1.7045454545454546e-06, "loss": 0.0001, "step": 87 }, { "epoch": 9.666666666666666, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.170520544052124, "eval_runtime": 0.6618, "eval_samples_per_second": 108.798, "eval_steps_per_second": 7.555, "step": 87 }, { "epoch": 9.777777777777779, "grad_norm": 0.0166015625, "learning_rate": 1.1363636363636364e-06, "loss": 0.0001, "step": 88 }, { "epoch": 9.777777777777779, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.1710082292556763, "eval_runtime": 0.612, "eval_samples_per_second": 117.65, "eval_steps_per_second": 8.17, "step": 88 }, { "epoch": 9.88888888888889, "grad_norm": 0.01708984375, "learning_rate": 5.681818181818182e-07, "loss": 0.0002, "step": 89 }, { "epoch": 9.88888888888889, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.160547137260437, "eval_runtime": 0.5621, "eval_samples_per_second": 128.102, "eval_steps_per_second": 8.896, "step": 89 }, { "epoch": 10.0, "grad_norm": 0.0625, "learning_rate": 0.0, "loss": 0.0003, "step": 90 }, { "epoch": 10.0, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.156036138534546, "eval_runtime": 0.6629, "eval_samples_per_second": 108.621, "eval_steps_per_second": 7.543, "step": 90 }, { "epoch": 10.0, "step": 90, "total_flos": 5016736558481408.0, "train_loss": 0.24542952523463302, "train_runtime": 139.7817, "train_samples_per_second": 20.317, "train_steps_per_second": 0.644 } ], "logging_steps": 1, "max_steps": 90, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5016736558481408.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }