{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21670606776989756, "eval_steps": 10, "global_step": 550, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003940110323089046, "grad_norm": 2.0866503715515137, "learning_rate": 0.0004909090909090909, "loss": 0.4138, "step": 10 }, { "epoch": 0.003940110323089046, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.4887285530567169, "eval_runtime": 644.3572, "eval_samples_per_second": 7.878, "eval_steps_per_second": 1.969, "step": 10 }, { "epoch": 0.007880220646178092, "grad_norm": 0.8669756650924683, "learning_rate": 0.00048181818181818184, "loss": 0.4995, "step": 20 }, { "epoch": 0.007880220646178092, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.38217219710350037, "eval_runtime": 633.2128, "eval_samples_per_second": 8.016, "eval_steps_per_second": 2.004, "step": 20 }, { "epoch": 0.01182033096926714, "grad_norm": 2.203610420227051, "learning_rate": 0.0004727272727272727, "loss": 0.382, "step": 30 }, { "epoch": 0.01182033096926714, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.3638584017753601, "eval_runtime": 642.1063, "eval_samples_per_second": 7.905, "eval_steps_per_second": 1.976, "step": 30 }, { "epoch": 0.015760441292356184, "grad_norm": 0.812998354434967, "learning_rate": 0.00046363636363636366, "loss": 0.354, "step": 40 }, { "epoch": 0.015760441292356184, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.3297870457172394, "eval_runtime": 644.4864, "eval_samples_per_second": 7.876, "eval_steps_per_second": 1.969, "step": 40 }, { "epoch": 0.019700551615445233, "grad_norm": 0.6705520749092102, "learning_rate": 0.00045454545454545455, "loss": 0.521, "step": 50 }, { "epoch": 0.019700551615445233, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.38316452503204346, "eval_runtime": 645.7523, "eval_samples_per_second": 7.861, "eval_steps_per_second": 1.965, "step": 50 }, { "epoch": 0.02364066193853428, "grad_norm": 1.1547324657440186, "learning_rate": 0.00044545454545454543, "loss": 0.3344, "step": 60 }, { "epoch": 0.02364066193853428, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.3688468039035797, "eval_runtime": 647.1212, "eval_samples_per_second": 7.844, "eval_steps_per_second": 1.961, "step": 60 }, { "epoch": 0.027580772261623327, "grad_norm": 0.7195687890052795, "learning_rate": 0.00043636363636363637, "loss": 0.3524, "step": 70 }, { "epoch": 0.027580772261623327, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.33884838223457336, "eval_runtime": 644.2281, "eval_samples_per_second": 7.879, "eval_steps_per_second": 1.97, "step": 70 }, { "epoch": 0.03152088258471237, "grad_norm": 0.030797701328992844, "learning_rate": 0.00042727272727272726, "loss": 0.2702, "step": 80 }, { "epoch": 0.03152088258471237, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.32865411043167114, "eval_runtime": 643.6443, "eval_samples_per_second": 7.886, "eval_steps_per_second": 1.972, "step": 80 }, { "epoch": 0.03546099290780142, "grad_norm": 0.7822753190994263, "learning_rate": 0.00041818181818181814, "loss": 0.3767, "step": 90 }, { "epoch": 0.03546099290780142, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.3282410800457001, "eval_runtime": 647.7767, "eval_samples_per_second": 7.836, "eval_steps_per_second": 1.959, "step": 90 }, { "epoch": 0.039401103230890466, "grad_norm": 0.7474893927574158, "learning_rate": 0.00040909090909090913, "loss": 0.2964, "step": 100 }, { "epoch": 0.039401103230890466, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.3291880786418915, "eval_runtime": 640.1347, "eval_samples_per_second": 7.93, "eval_steps_per_second": 1.982, "step": 100 }, { "epoch": 0.04334121355397951, "grad_norm": 0.44683077931404114, "learning_rate": 0.0004, "loss": 0.3428, "step": 110 }, { "epoch": 0.04334121355397951, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.33626437187194824, "eval_runtime": 640.0811, "eval_samples_per_second": 7.93, "eval_steps_per_second": 1.983, "step": 110 }, { "epoch": 0.04728132387706856, "grad_norm": 0.07774700969457626, "learning_rate": 0.00039090909090909096, "loss": 0.3215, "step": 120 }, { "epoch": 0.04728132387706856, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.32750025391578674, "eval_runtime": 641.6289, "eval_samples_per_second": 7.911, "eval_steps_per_second": 1.978, "step": 120 }, { "epoch": 0.0512214342001576, "grad_norm": 0.8798918128013611, "learning_rate": 0.00038181818181818184, "loss": 0.3524, "step": 130 }, { "epoch": 0.0512214342001576, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.3243106007575989, "eval_runtime": 642.1452, "eval_samples_per_second": 7.905, "eval_steps_per_second": 1.976, "step": 130 }, { "epoch": 0.055161544523246654, "grad_norm": 0.513219952583313, "learning_rate": 0.00037272727272727273, "loss": 0.3029, "step": 140 }, { "epoch": 0.055161544523246654, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.33765551447868347, "eval_runtime": 644.4577, "eval_samples_per_second": 7.876, "eval_steps_per_second": 1.969, "step": 140 }, { "epoch": 0.0591016548463357, "grad_norm": 0.1864446997642517, "learning_rate": 0.00036363636363636367, "loss": 0.494, "step": 150 }, { "epoch": 0.0591016548463357, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.3399695158004761, "eval_runtime": 644.9312, "eval_samples_per_second": 7.871, "eval_steps_per_second": 1.968, "step": 150 }, { "epoch": 0.06304176516942474, "grad_norm": 0.6781743168830872, "learning_rate": 0.00035454545454545455, "loss": 0.2655, "step": 160 }, { "epoch": 0.06304176516942474, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.32840684056282043, "eval_runtime": 638.0586, "eval_samples_per_second": 7.955, "eval_steps_per_second": 1.989, "step": 160 }, { "epoch": 0.06698187549251379, "grad_norm": 0.4446357786655426, "learning_rate": 0.00034545454545454544, "loss": 0.3505, "step": 170 }, { "epoch": 0.06698187549251379, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.34705930948257446, "eval_runtime": 636.2641, "eval_samples_per_second": 7.978, "eval_steps_per_second": 1.994, "step": 170 }, { "epoch": 0.07092198581560284, "grad_norm": 0.5605026483535767, "learning_rate": 0.0003363636363636364, "loss": 0.2416, "step": 180 }, { "epoch": 0.07092198581560284, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.3337305784225464, "eval_runtime": 637.842, "eval_samples_per_second": 7.958, "eval_steps_per_second": 1.99, "step": 180 }, { "epoch": 0.07486209613869188, "grad_norm": 0.48381492495536804, "learning_rate": 0.00032727272727272726, "loss": 0.3361, "step": 190 }, { "epoch": 0.07486209613869188, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.3374550938606262, "eval_runtime": 630.1852, "eval_samples_per_second": 8.055, "eval_steps_per_second": 2.014, "step": 190 }, { "epoch": 0.07880220646178093, "grad_norm": 0.20769913494586945, "learning_rate": 0.0003181818181818182, "loss": 0.3264, "step": 200 }, { "epoch": 0.07880220646178093, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.32236042618751526, "eval_runtime": 634.0108, "eval_samples_per_second": 8.006, "eval_steps_per_second": 2.002, "step": 200 }, { "epoch": 0.08274231678486997, "grad_norm": 0.5153699517250061, "learning_rate": 0.0003090909090909091, "loss": 0.1682, "step": 210 }, { "epoch": 0.08274231678486997, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.3330242931842804, "eval_runtime": 630.7179, "eval_samples_per_second": 8.048, "eval_steps_per_second": 2.012, "step": 210 }, { "epoch": 0.08668242710795902, "grad_norm": 0.5939351916313171, "learning_rate": 0.0003, "loss": 0.3564, "step": 220 }, { "epoch": 0.08668242710795902, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.3238199055194855, "eval_runtime": 639.7513, "eval_samples_per_second": 7.934, "eval_steps_per_second": 1.984, "step": 220 }, { "epoch": 0.09062253743104808, "grad_norm": 0.5458611845970154, "learning_rate": 0.0002909090909090909, "loss": 0.2441, "step": 230 }, { "epoch": 0.09062253743104808, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.3024224638938904, "eval_runtime": 640.3192, "eval_samples_per_second": 7.927, "eval_steps_per_second": 1.982, "step": 230 }, { "epoch": 0.09456264775413711, "grad_norm": 4.142682075500488, "learning_rate": 0.0002818181818181818, "loss": 0.4017, "step": 240 }, { "epoch": 0.09456264775413711, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.28254833817481995, "eval_runtime": 639.8776, "eval_samples_per_second": 7.933, "eval_steps_per_second": 1.983, "step": 240 }, { "epoch": 0.09850275807722617, "grad_norm": 0.83643639087677, "learning_rate": 0.00027272727272727274, "loss": 0.2683, "step": 250 }, { "epoch": 0.09850275807722617, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.2727406322956085, "eval_runtime": 637.4582, "eval_samples_per_second": 7.963, "eval_steps_per_second": 1.991, "step": 250 }, { "epoch": 0.1024428684003152, "grad_norm": 2.7200253009796143, "learning_rate": 0.0002636363636363636, "loss": 0.3417, "step": 260 }, { "epoch": 0.1024428684003152, "eval_accuracy": 0.8975571393966675, "eval_loss": 0.2998380661010742, "eval_runtime": 637.2677, "eval_samples_per_second": 7.965, "eval_steps_per_second": 1.991, "step": 260 }, { "epoch": 0.10638297872340426, "grad_norm": 0.9870793223381042, "learning_rate": 0.0002545454545454545, "loss": 0.3689, "step": 270 }, { "epoch": 0.10638297872340426, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.25626352429389954, "eval_runtime": 638.6518, "eval_samples_per_second": 7.948, "eval_steps_per_second": 1.987, "step": 270 }, { "epoch": 0.11032308904649331, "grad_norm": 0.7646285891532898, "learning_rate": 0.00024545454545454545, "loss": 0.3017, "step": 280 }, { "epoch": 0.11032308904649331, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.2582588195800781, "eval_runtime": 633.7229, "eval_samples_per_second": 8.01, "eval_steps_per_second": 2.002, "step": 280 }, { "epoch": 0.11426319936958235, "grad_norm": 3.958172082901001, "learning_rate": 0.00023636363636363636, "loss": 0.3033, "step": 290 }, { "epoch": 0.11426319936958235, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.2637666165828705, "eval_runtime": 635.1066, "eval_samples_per_second": 7.992, "eval_steps_per_second": 1.998, "step": 290 }, { "epoch": 0.1182033096926714, "grad_norm": 3.4462485313415527, "learning_rate": 0.00022727272727272727, "loss": 0.1859, "step": 300 }, { "epoch": 0.1182033096926714, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.22763606905937195, "eval_runtime": 635.7822, "eval_samples_per_second": 7.984, "eval_steps_per_second": 1.996, "step": 300 }, { "epoch": 0.12214342001576044, "grad_norm": 0.9540772438049316, "learning_rate": 0.00021818181818181818, "loss": 0.2832, "step": 310 }, { "epoch": 0.12214342001576044, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.1975635588169098, "eval_runtime": 642.0949, "eval_samples_per_second": 7.905, "eval_steps_per_second": 1.976, "step": 310 }, { "epoch": 0.12608353033884948, "grad_norm": 0.45892244577407837, "learning_rate": 0.00020909090909090907, "loss": 0.2679, "step": 320 }, { "epoch": 0.12608353033884948, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.1894582062959671, "eval_runtime": 642.7065, "eval_samples_per_second": 7.898, "eval_steps_per_second": 1.974, "step": 320 }, { "epoch": 0.13002364066193853, "grad_norm": 0.4674457013607025, "learning_rate": 0.0002, "loss": 0.1966, "step": 330 }, { "epoch": 0.13002364066193853, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.193992018699646, "eval_runtime": 641.7924, "eval_samples_per_second": 7.909, "eval_steps_per_second": 1.977, "step": 330 }, { "epoch": 0.13396375098502758, "grad_norm": 0.4076831638813019, "learning_rate": 0.00019090909090909092, "loss": 0.2063, "step": 340 }, { "epoch": 0.13396375098502758, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.19286341965198517, "eval_runtime": 639.6626, "eval_samples_per_second": 7.935, "eval_steps_per_second": 1.984, "step": 340 }, { "epoch": 0.13790386130811663, "grad_norm": 0.5408686995506287, "learning_rate": 0.00018181818181818183, "loss": 0.2215, "step": 350 }, { "epoch": 0.13790386130811663, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.17871853709220886, "eval_runtime": 636.151, "eval_samples_per_second": 7.979, "eval_steps_per_second": 1.995, "step": 350 }, { "epoch": 0.14184397163120568, "grad_norm": 3.9466795921325684, "learning_rate": 0.00017272727272727272, "loss": 0.2226, "step": 360 }, { "epoch": 0.14184397163120568, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.22342762351036072, "eval_runtime": 635.3665, "eval_samples_per_second": 7.989, "eval_steps_per_second": 1.997, "step": 360 }, { "epoch": 0.1457840819542947, "grad_norm": 0.3433726131916046, "learning_rate": 0.00016363636363636363, "loss": 0.2688, "step": 370 }, { "epoch": 0.1457840819542947, "eval_accuracy": 0.8358944058418274, "eval_loss": 0.3028296232223511, "eval_runtime": 626.8327, "eval_samples_per_second": 8.098, "eval_steps_per_second": 2.024, "step": 370 }, { "epoch": 0.14972419227738376, "grad_norm": 0.269267201423645, "learning_rate": 0.00015454545454545454, "loss": 0.2317, "step": 380 }, { "epoch": 0.14972419227738376, "eval_accuracy": 0.8861308097839355, "eval_loss": 0.1874387264251709, "eval_runtime": 634.1878, "eval_samples_per_second": 8.004, "eval_steps_per_second": 2.001, "step": 380 }, { "epoch": 0.1536643026004728, "grad_norm": 0.16006210446357727, "learning_rate": 0.00014545454545454546, "loss": 0.2088, "step": 390 }, { "epoch": 0.1536643026004728, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.2302139550447464, "eval_runtime": 631.9364, "eval_samples_per_second": 8.032, "eval_steps_per_second": 2.008, "step": 390 }, { "epoch": 0.15760441292356187, "grad_norm": 0.5244100093841553, "learning_rate": 0.00013636363636363637, "loss": 0.4595, "step": 400 }, { "epoch": 0.15760441292356187, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.19357828795909882, "eval_runtime": 642.9065, "eval_samples_per_second": 7.895, "eval_steps_per_second": 1.974, "step": 400 }, { "epoch": 0.16154452324665092, "grad_norm": 0.5354598164558411, "learning_rate": 0.00012727272727272725, "loss": 0.15, "step": 410 }, { "epoch": 0.16154452324665092, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.18797340989112854, "eval_runtime": 640.1417, "eval_samples_per_second": 7.929, "eval_steps_per_second": 1.982, "step": 410 }, { "epoch": 0.16548463356973994, "grad_norm": 0.2795056998729706, "learning_rate": 0.00011818181818181818, "loss": 0.1919, "step": 420 }, { "epoch": 0.16548463356973994, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.1791592687368393, "eval_runtime": 639.6272, "eval_samples_per_second": 7.936, "eval_steps_per_second": 1.984, "step": 420 }, { "epoch": 0.169424743892829, "grad_norm": 0.2897014021873474, "learning_rate": 0.00010909090909090909, "loss": 0.3189, "step": 430 }, { "epoch": 0.169424743892829, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.1778295636177063, "eval_runtime": 637.1833, "eval_samples_per_second": 7.966, "eval_steps_per_second": 1.992, "step": 430 }, { "epoch": 0.17336485421591805, "grad_norm": 0.08481621742248535, "learning_rate": 0.0001, "loss": 0.2422, "step": 440 }, { "epoch": 0.17336485421591805, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.18217456340789795, "eval_runtime": 639.7951, "eval_samples_per_second": 7.934, "eval_steps_per_second": 1.983, "step": 440 }, { "epoch": 0.1773049645390071, "grad_norm": 0.3332684636116028, "learning_rate": 9.090909090909092e-05, "loss": 0.1599, "step": 450 }, { "epoch": 0.1773049645390071, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.18780100345611572, "eval_runtime": 641.5947, "eval_samples_per_second": 7.912, "eval_steps_per_second": 1.978, "step": 450 }, { "epoch": 0.18124507486209615, "grad_norm": 0.5597277879714966, "learning_rate": 8.181818181818182e-05, "loss": 0.2962, "step": 460 }, { "epoch": 0.18124507486209615, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.1818259209394455, "eval_runtime": 638.6242, "eval_samples_per_second": 7.948, "eval_steps_per_second": 1.987, "step": 460 }, { "epoch": 0.18518518518518517, "grad_norm": 0.9142216444015503, "learning_rate": 7.272727272727273e-05, "loss": 0.1295, "step": 470 }, { "epoch": 0.18518518518518517, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.1787974089384079, "eval_runtime": 642.4129, "eval_samples_per_second": 7.901, "eval_steps_per_second": 1.975, "step": 470 }, { "epoch": 0.18912529550827423, "grad_norm": 0.5384683012962341, "learning_rate": 6.363636363636363e-05, "loss": 0.2327, "step": 480 }, { "epoch": 0.18912529550827423, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.1720920354127884, "eval_runtime": 637.6983, "eval_samples_per_second": 7.96, "eval_steps_per_second": 1.99, "step": 480 }, { "epoch": 0.19306540583136328, "grad_norm": 0.3855592608451843, "learning_rate": 5.4545454545454546e-05, "loss": 0.2012, "step": 490 }, { "epoch": 0.19306540583136328, "eval_accuracy": 0.8977541327476501, "eval_loss": 0.17169128358364105, "eval_runtime": 643.1115, "eval_samples_per_second": 7.893, "eval_steps_per_second": 1.973, "step": 490 }, { "epoch": 0.19700551615445233, "grad_norm": 0.18903906643390656, "learning_rate": 4.545454545454546e-05, "loss": 0.2338, "step": 500 }, { "epoch": 0.19700551615445233, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.16954948008060455, "eval_runtime": 641.432, "eval_samples_per_second": 7.914, "eval_steps_per_second": 1.978, "step": 500 }, { "epoch": 0.20094562647754138, "grad_norm": 0.5222665667533875, "learning_rate": 3.6363636363636364e-05, "loss": 0.261, "step": 510 }, { "epoch": 0.20094562647754138, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.16886387765407562, "eval_runtime": 644.6602, "eval_samples_per_second": 7.874, "eval_steps_per_second": 1.968, "step": 510 }, { "epoch": 0.2048857368006304, "grad_norm": 0.1900663524866104, "learning_rate": 2.7272727272727273e-05, "loss": 0.2295, "step": 520 }, { "epoch": 0.2048857368006304, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.16761720180511475, "eval_runtime": 644.587, "eval_samples_per_second": 7.875, "eval_steps_per_second": 1.969, "step": 520 }, { "epoch": 0.20882584712371946, "grad_norm": 0.7705594897270203, "learning_rate": 1.8181818181818182e-05, "loss": 0.2785, "step": 530 }, { "epoch": 0.20882584712371946, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.16720621287822723, "eval_runtime": 641.3858, "eval_samples_per_second": 7.914, "eval_steps_per_second": 1.979, "step": 530 }, { "epoch": 0.2127659574468085, "grad_norm": 3.4368479251861572, "learning_rate": 9.090909090909091e-06, "loss": 0.2326, "step": 540 }, { "epoch": 0.2127659574468085, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.1670488715171814, "eval_runtime": 635.73, "eval_samples_per_second": 7.985, "eval_steps_per_second": 1.996, "step": 540 }, { "epoch": 0.21670606776989756, "grad_norm": 0.17799390852451324, "learning_rate": 0.0, "loss": 0.2048, "step": 550 }, { "epoch": 0.21670606776989756, "eval_accuracy": 0.8983451724052429, "eval_loss": 0.1670481413602829, "eval_runtime": 633.0482, "eval_samples_per_second": 8.018, "eval_steps_per_second": 2.005, "step": 550 } ], "logging_steps": 10, "max_steps": 550, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.667515335043259e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }