{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.21670606776989756,
  "eval_steps": 10,
  "global_step": 550,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.003940110323089046,
      "grad_norm": 2.0866503715515137,
      "learning_rate": 0.0004909090909090909,
      "loss": 0.4138,
      "step": 10
    },
    {
      "epoch": 0.003940110323089046,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.4887285530567169,
      "eval_runtime": 644.3572,
      "eval_samples_per_second": 7.878,
      "eval_steps_per_second": 1.969,
      "step": 10
    },
    {
      "epoch": 0.007880220646178092,
      "grad_norm": 0.8669756650924683,
      "learning_rate": 0.00048181818181818184,
      "loss": 0.4995,
      "step": 20
    },
    {
      "epoch": 0.007880220646178092,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.38217219710350037,
      "eval_runtime": 633.2128,
      "eval_samples_per_second": 8.016,
      "eval_steps_per_second": 2.004,
      "step": 20
    },
    {
      "epoch": 0.01182033096926714,
      "grad_norm": 2.203610420227051,
      "learning_rate": 0.0004727272727272727,
      "loss": 0.382,
      "step": 30
    },
    {
      "epoch": 0.01182033096926714,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.3638584017753601,
      "eval_runtime": 642.1063,
      "eval_samples_per_second": 7.905,
      "eval_steps_per_second": 1.976,
      "step": 30
    },
    {
      "epoch": 0.015760441292356184,
      "grad_norm": 0.812998354434967,
      "learning_rate": 0.00046363636363636366,
      "loss": 0.354,
      "step": 40
    },
    {
      "epoch": 0.015760441292356184,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.3297870457172394,
      "eval_runtime": 644.4864,
      "eval_samples_per_second": 7.876,
      "eval_steps_per_second": 1.969,
      "step": 40
    },
    {
      "epoch": 0.019700551615445233,
      "grad_norm": 0.6705520749092102,
      "learning_rate": 0.00045454545454545455,
      "loss": 0.521,
      "step": 50
    },
    {
      "epoch": 0.019700551615445233,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.38316452503204346,
      "eval_runtime": 645.7523,
      "eval_samples_per_second": 7.861,
      "eval_steps_per_second": 1.965,
      "step": 50
    },
    {
      "epoch": 0.02364066193853428,
      "grad_norm": 1.1547324657440186,
      "learning_rate": 0.00044545454545454543,
      "loss": 0.3344,
      "step": 60
    },
    {
      "epoch": 0.02364066193853428,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.3688468039035797,
      "eval_runtime": 647.1212,
      "eval_samples_per_second": 7.844,
      "eval_steps_per_second": 1.961,
      "step": 60
    },
    {
      "epoch": 0.027580772261623327,
      "grad_norm": 0.7195687890052795,
      "learning_rate": 0.00043636363636363637,
      "loss": 0.3524,
      "step": 70
    },
    {
      "epoch": 0.027580772261623327,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.33884838223457336,
      "eval_runtime": 644.2281,
      "eval_samples_per_second": 7.879,
      "eval_steps_per_second": 1.97,
      "step": 70
    },
    {
      "epoch": 0.03152088258471237,
      "grad_norm": 0.030797701328992844,
      "learning_rate": 0.00042727272727272726,
      "loss": 0.2702,
      "step": 80
    },
    {
      "epoch": 0.03152088258471237,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.32865411043167114,
      "eval_runtime": 643.6443,
      "eval_samples_per_second": 7.886,
      "eval_steps_per_second": 1.972,
      "step": 80
    },
    {
      "epoch": 0.03546099290780142,
      "grad_norm": 0.7822753190994263,
      "learning_rate": 0.00041818181818181814,
      "loss": 0.3767,
      "step": 90
    },
    {
      "epoch": 0.03546099290780142,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.3282410800457001,
      "eval_runtime": 647.7767,
      "eval_samples_per_second": 7.836,
      "eval_steps_per_second": 1.959,
      "step": 90
    },
    {
      "epoch": 0.039401103230890466,
      "grad_norm": 0.7474893927574158,
      "learning_rate": 0.00040909090909090913,
      "loss": 0.2964,
      "step": 100
    },
    {
      "epoch": 0.039401103230890466,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.3291880786418915,
      "eval_runtime": 640.1347,
      "eval_samples_per_second": 7.93,
      "eval_steps_per_second": 1.982,
      "step": 100
    },
    {
      "epoch": 0.04334121355397951,
      "grad_norm": 0.44683077931404114,
      "learning_rate": 0.0004,
      "loss": 0.3428,
      "step": 110
    },
    {
      "epoch": 0.04334121355397951,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.33626437187194824,
      "eval_runtime": 640.0811,
      "eval_samples_per_second": 7.93,
      "eval_steps_per_second": 1.983,
      "step": 110
    },
    {
      "epoch": 0.04728132387706856,
      "grad_norm": 0.07774700969457626,
      "learning_rate": 0.00039090909090909096,
      "loss": 0.3215,
      "step": 120
    },
    {
      "epoch": 0.04728132387706856,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.32750025391578674,
      "eval_runtime": 641.6289,
      "eval_samples_per_second": 7.911,
      "eval_steps_per_second": 1.978,
      "step": 120
    },
    {
      "epoch": 0.0512214342001576,
      "grad_norm": 0.8798918128013611,
      "learning_rate": 0.00038181818181818184,
      "loss": 0.3524,
      "step": 130
    },
    {
      "epoch": 0.0512214342001576,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.3243106007575989,
      "eval_runtime": 642.1452,
      "eval_samples_per_second": 7.905,
      "eval_steps_per_second": 1.976,
      "step": 130
    },
    {
      "epoch": 0.055161544523246654,
      "grad_norm": 0.513219952583313,
      "learning_rate": 0.00037272727272727273,
      "loss": 0.3029,
      "step": 140
    },
    {
      "epoch": 0.055161544523246654,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.33765551447868347,
      "eval_runtime": 644.4577,
      "eval_samples_per_second": 7.876,
      "eval_steps_per_second": 1.969,
      "step": 140
    },
    {
      "epoch": 0.0591016548463357,
      "grad_norm": 0.1864446997642517,
      "learning_rate": 0.00036363636363636367,
      "loss": 0.494,
      "step": 150
    },
    {
      "epoch": 0.0591016548463357,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.3399695158004761,
      "eval_runtime": 644.9312,
      "eval_samples_per_second": 7.871,
      "eval_steps_per_second": 1.968,
      "step": 150
    },
    {
      "epoch": 0.06304176516942474,
      "grad_norm": 0.6781743168830872,
      "learning_rate": 0.00035454545454545455,
      "loss": 0.2655,
      "step": 160
    },
    {
      "epoch": 0.06304176516942474,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.32840684056282043,
      "eval_runtime": 638.0586,
      "eval_samples_per_second": 7.955,
      "eval_steps_per_second": 1.989,
      "step": 160
    },
    {
      "epoch": 0.06698187549251379,
      "grad_norm": 0.4446357786655426,
      "learning_rate": 0.00034545454545454544,
      "loss": 0.3505,
      "step": 170
    },
    {
      "epoch": 0.06698187549251379,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.34705930948257446,
      "eval_runtime": 636.2641,
      "eval_samples_per_second": 7.978,
      "eval_steps_per_second": 1.994,
      "step": 170
    },
    {
      "epoch": 0.07092198581560284,
      "grad_norm": 0.5605026483535767,
      "learning_rate": 0.0003363636363636364,
      "loss": 0.2416,
      "step": 180
    },
    {
      "epoch": 0.07092198581560284,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.3337305784225464,
      "eval_runtime": 637.842,
      "eval_samples_per_second": 7.958,
      "eval_steps_per_second": 1.99,
      "step": 180
    },
    {
      "epoch": 0.07486209613869188,
      "grad_norm": 0.48381492495536804,
      "learning_rate": 0.00032727272727272726,
      "loss": 0.3361,
      "step": 190
    },
    {
      "epoch": 0.07486209613869188,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.3374550938606262,
      "eval_runtime": 630.1852,
      "eval_samples_per_second": 8.055,
      "eval_steps_per_second": 2.014,
      "step": 190
    },
    {
      "epoch": 0.07880220646178093,
      "grad_norm": 0.20769913494586945,
      "learning_rate": 0.0003181818181818182,
      "loss": 0.3264,
      "step": 200
    },
    {
      "epoch": 0.07880220646178093,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.32236042618751526,
      "eval_runtime": 634.0108,
      "eval_samples_per_second": 8.006,
      "eval_steps_per_second": 2.002,
      "step": 200
    },
    {
      "epoch": 0.08274231678486997,
      "grad_norm": 0.5153699517250061,
      "learning_rate": 0.0003090909090909091,
      "loss": 0.1682,
      "step": 210
    },
    {
      "epoch": 0.08274231678486997,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.3330242931842804,
      "eval_runtime": 630.7179,
      "eval_samples_per_second": 8.048,
      "eval_steps_per_second": 2.012,
      "step": 210
    },
    {
      "epoch": 0.08668242710795902,
      "grad_norm": 0.5939351916313171,
      "learning_rate": 0.0003,
      "loss": 0.3564,
      "step": 220
    },
    {
      "epoch": 0.08668242710795902,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.3238199055194855,
      "eval_runtime": 639.7513,
      "eval_samples_per_second": 7.934,
      "eval_steps_per_second": 1.984,
      "step": 220
    },
    {
      "epoch": 0.09062253743104808,
      "grad_norm": 0.5458611845970154,
      "learning_rate": 0.0002909090909090909,
      "loss": 0.2441,
      "step": 230
    },
    {
      "epoch": 0.09062253743104808,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.3024224638938904,
      "eval_runtime": 640.3192,
      "eval_samples_per_second": 7.927,
      "eval_steps_per_second": 1.982,
      "step": 230
    },
    {
      "epoch": 0.09456264775413711,
      "grad_norm": 4.142682075500488,
      "learning_rate": 0.0002818181818181818,
      "loss": 0.4017,
      "step": 240
    },
    {
      "epoch": 0.09456264775413711,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.28254833817481995,
      "eval_runtime": 639.8776,
      "eval_samples_per_second": 7.933,
      "eval_steps_per_second": 1.983,
      "step": 240
    },
    {
      "epoch": 0.09850275807722617,
      "grad_norm": 0.83643639087677,
      "learning_rate": 0.00027272727272727274,
      "loss": 0.2683,
      "step": 250
    },
    {
      "epoch": 0.09850275807722617,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.2727406322956085,
      "eval_runtime": 637.4582,
      "eval_samples_per_second": 7.963,
      "eval_steps_per_second": 1.991,
      "step": 250
    },
    {
      "epoch": 0.1024428684003152,
      "grad_norm": 2.7200253009796143,
      "learning_rate": 0.0002636363636363636,
      "loss": 0.3417,
      "step": 260
    },
    {
      "epoch": 0.1024428684003152,
      "eval_accuracy": 0.8975571393966675,
      "eval_loss": 0.2998380661010742,
      "eval_runtime": 637.2677,
      "eval_samples_per_second": 7.965,
      "eval_steps_per_second": 1.991,
      "step": 260
    },
    {
      "epoch": 0.10638297872340426,
      "grad_norm": 0.9870793223381042,
      "learning_rate": 0.0002545454545454545,
      "loss": 0.3689,
      "step": 270
    },
    {
      "epoch": 0.10638297872340426,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.25626352429389954,
      "eval_runtime": 638.6518,
      "eval_samples_per_second": 7.948,
      "eval_steps_per_second": 1.987,
      "step": 270
    },
    {
      "epoch": 0.11032308904649331,
      "grad_norm": 0.7646285891532898,
      "learning_rate": 0.00024545454545454545,
      "loss": 0.3017,
      "step": 280
    },
    {
      "epoch": 0.11032308904649331,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.2582588195800781,
      "eval_runtime": 633.7229,
      "eval_samples_per_second": 8.01,
      "eval_steps_per_second": 2.002,
      "step": 280
    },
    {
      "epoch": 0.11426319936958235,
      "grad_norm": 3.958172082901001,
      "learning_rate": 0.00023636363636363636,
      "loss": 0.3033,
      "step": 290
    },
    {
      "epoch": 0.11426319936958235,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.2637666165828705,
      "eval_runtime": 635.1066,
      "eval_samples_per_second": 7.992,
      "eval_steps_per_second": 1.998,
      "step": 290
    },
    {
      "epoch": 0.1182033096926714,
      "grad_norm": 3.4462485313415527,
      "learning_rate": 0.00022727272727272727,
      "loss": 0.1859,
      "step": 300
    },
    {
      "epoch": 0.1182033096926714,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.22763606905937195,
      "eval_runtime": 635.7822,
      "eval_samples_per_second": 7.984,
      "eval_steps_per_second": 1.996,
      "step": 300
    },
    {
      "epoch": 0.12214342001576044,
      "grad_norm": 0.9540772438049316,
      "learning_rate": 0.00021818181818181818,
      "loss": 0.2832,
      "step": 310
    },
    {
      "epoch": 0.12214342001576044,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.1975635588169098,
      "eval_runtime": 642.0949,
      "eval_samples_per_second": 7.905,
      "eval_steps_per_second": 1.976,
      "step": 310
    },
    {
      "epoch": 0.12608353033884948,
      "grad_norm": 0.45892244577407837,
      "learning_rate": 0.00020909090909090907,
      "loss": 0.2679,
      "step": 320
    },
    {
      "epoch": 0.12608353033884948,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.1894582062959671,
      "eval_runtime": 642.7065,
      "eval_samples_per_second": 7.898,
      "eval_steps_per_second": 1.974,
      "step": 320
    },
    {
      "epoch": 0.13002364066193853,
      "grad_norm": 0.4674457013607025,
      "learning_rate": 0.0002,
      "loss": 0.1966,
      "step": 330
    },
    {
      "epoch": 0.13002364066193853,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.193992018699646,
      "eval_runtime": 641.7924,
      "eval_samples_per_second": 7.909,
      "eval_steps_per_second": 1.977,
      "step": 330
    },
    {
      "epoch": 0.13396375098502758,
      "grad_norm": 0.4076831638813019,
      "learning_rate": 0.00019090909090909092,
      "loss": 0.2063,
      "step": 340
    },
    {
      "epoch": 0.13396375098502758,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.19286341965198517,
      "eval_runtime": 639.6626,
      "eval_samples_per_second": 7.935,
      "eval_steps_per_second": 1.984,
      "step": 340
    },
    {
      "epoch": 0.13790386130811663,
      "grad_norm": 0.5408686995506287,
      "learning_rate": 0.00018181818181818183,
      "loss": 0.2215,
      "step": 350
    },
    {
      "epoch": 0.13790386130811663,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.17871853709220886,
      "eval_runtime": 636.151,
      "eval_samples_per_second": 7.979,
      "eval_steps_per_second": 1.995,
      "step": 350
    },
    {
      "epoch": 0.14184397163120568,
      "grad_norm": 3.9466795921325684,
      "learning_rate": 0.00017272727272727272,
      "loss": 0.2226,
      "step": 360
    },
    {
      "epoch": 0.14184397163120568,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.22342762351036072,
      "eval_runtime": 635.3665,
      "eval_samples_per_second": 7.989,
      "eval_steps_per_second": 1.997,
      "step": 360
    },
    {
      "epoch": 0.1457840819542947,
      "grad_norm": 0.3433726131916046,
      "learning_rate": 0.00016363636363636363,
      "loss": 0.2688,
      "step": 370
    },
    {
      "epoch": 0.1457840819542947,
      "eval_accuracy": 0.8358944058418274,
      "eval_loss": 0.3028296232223511,
      "eval_runtime": 626.8327,
      "eval_samples_per_second": 8.098,
      "eval_steps_per_second": 2.024,
      "step": 370
    },
    {
      "epoch": 0.14972419227738376,
      "grad_norm": 0.269267201423645,
      "learning_rate": 0.00015454545454545454,
      "loss": 0.2317,
      "step": 380
    },
    {
      "epoch": 0.14972419227738376,
      "eval_accuracy": 0.8861308097839355,
      "eval_loss": 0.1874387264251709,
      "eval_runtime": 634.1878,
      "eval_samples_per_second": 8.004,
      "eval_steps_per_second": 2.001,
      "step": 380
    },
    {
      "epoch": 0.1536643026004728,
      "grad_norm": 0.16006210446357727,
      "learning_rate": 0.00014545454545454546,
      "loss": 0.2088,
      "step": 390
    },
    {
      "epoch": 0.1536643026004728,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.2302139550447464,
      "eval_runtime": 631.9364,
      "eval_samples_per_second": 8.032,
      "eval_steps_per_second": 2.008,
      "step": 390
    },
    {
      "epoch": 0.15760441292356187,
      "grad_norm": 0.5244100093841553,
      "learning_rate": 0.00013636363636363637,
      "loss": 0.4595,
      "step": 400
    },
    {
      "epoch": 0.15760441292356187,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.19357828795909882,
      "eval_runtime": 642.9065,
      "eval_samples_per_second": 7.895,
      "eval_steps_per_second": 1.974,
      "step": 400
    },
    {
      "epoch": 0.16154452324665092,
      "grad_norm": 0.5354598164558411,
      "learning_rate": 0.00012727272727272725,
      "loss": 0.15,
      "step": 410
    },
    {
      "epoch": 0.16154452324665092,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.18797340989112854,
      "eval_runtime": 640.1417,
      "eval_samples_per_second": 7.929,
      "eval_steps_per_second": 1.982,
      "step": 410
    },
    {
      "epoch": 0.16548463356973994,
      "grad_norm": 0.2795056998729706,
      "learning_rate": 0.00011818181818181818,
      "loss": 0.1919,
      "step": 420
    },
    {
      "epoch": 0.16548463356973994,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.1791592687368393,
      "eval_runtime": 639.6272,
      "eval_samples_per_second": 7.936,
      "eval_steps_per_second": 1.984,
      "step": 420
    },
    {
      "epoch": 0.169424743892829,
      "grad_norm": 0.2897014021873474,
      "learning_rate": 0.00010909090909090909,
      "loss": 0.3189,
      "step": 430
    },
    {
      "epoch": 0.169424743892829,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.1778295636177063,
      "eval_runtime": 637.1833,
      "eval_samples_per_second": 7.966,
      "eval_steps_per_second": 1.992,
      "step": 430
    },
    {
      "epoch": 0.17336485421591805,
      "grad_norm": 0.08481621742248535,
      "learning_rate": 0.0001,
      "loss": 0.2422,
      "step": 440
    },
    {
      "epoch": 0.17336485421591805,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.18217456340789795,
      "eval_runtime": 639.7951,
      "eval_samples_per_second": 7.934,
      "eval_steps_per_second": 1.983,
      "step": 440
    },
    {
      "epoch": 0.1773049645390071,
      "grad_norm": 0.3332684636116028,
      "learning_rate": 9.090909090909092e-05,
      "loss": 0.1599,
      "step": 450
    },
    {
      "epoch": 0.1773049645390071,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.18780100345611572,
      "eval_runtime": 641.5947,
      "eval_samples_per_second": 7.912,
      "eval_steps_per_second": 1.978,
      "step": 450
    },
    {
      "epoch": 0.18124507486209615,
      "grad_norm": 0.5597277879714966,
      "learning_rate": 8.181818181818182e-05,
      "loss": 0.2962,
      "step": 460
    },
    {
      "epoch": 0.18124507486209615,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.1818259209394455,
      "eval_runtime": 638.6242,
      "eval_samples_per_second": 7.948,
      "eval_steps_per_second": 1.987,
      "step": 460
    },
    {
      "epoch": 0.18518518518518517,
      "grad_norm": 0.9142216444015503,
      "learning_rate": 7.272727272727273e-05,
      "loss": 0.1295,
      "step": 470
    },
    {
      "epoch": 0.18518518518518517,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.1787974089384079,
      "eval_runtime": 642.4129,
      "eval_samples_per_second": 7.901,
      "eval_steps_per_second": 1.975,
      "step": 470
    },
    {
      "epoch": 0.18912529550827423,
      "grad_norm": 0.5384683012962341,
      "learning_rate": 6.363636363636363e-05,
      "loss": 0.2327,
      "step": 480
    },
    {
      "epoch": 0.18912529550827423,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.1720920354127884,
      "eval_runtime": 637.6983,
      "eval_samples_per_second": 7.96,
      "eval_steps_per_second": 1.99,
      "step": 480
    },
    {
      "epoch": 0.19306540583136328,
      "grad_norm": 0.3855592608451843,
      "learning_rate": 5.4545454545454546e-05,
      "loss": 0.2012,
      "step": 490
    },
    {
      "epoch": 0.19306540583136328,
      "eval_accuracy": 0.8977541327476501,
      "eval_loss": 0.17169128358364105,
      "eval_runtime": 643.1115,
      "eval_samples_per_second": 7.893,
      "eval_steps_per_second": 1.973,
      "step": 490
    },
    {
      "epoch": 0.19700551615445233,
      "grad_norm": 0.18903906643390656,
      "learning_rate": 4.545454545454546e-05,
      "loss": 0.2338,
      "step": 500
    },
    {
      "epoch": 0.19700551615445233,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.16954948008060455,
      "eval_runtime": 641.432,
      "eval_samples_per_second": 7.914,
      "eval_steps_per_second": 1.978,
      "step": 500
    },
    {
      "epoch": 0.20094562647754138,
      "grad_norm": 0.5222665667533875,
      "learning_rate": 3.6363636363636364e-05,
      "loss": 0.261,
      "step": 510
    },
    {
      "epoch": 0.20094562647754138,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.16886387765407562,
      "eval_runtime": 644.6602,
      "eval_samples_per_second": 7.874,
      "eval_steps_per_second": 1.968,
      "step": 510
    },
    {
      "epoch": 0.2048857368006304,
      "grad_norm": 0.1900663524866104,
      "learning_rate": 2.7272727272727273e-05,
      "loss": 0.2295,
      "step": 520
    },
    {
      "epoch": 0.2048857368006304,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.16761720180511475,
      "eval_runtime": 644.587,
      "eval_samples_per_second": 7.875,
      "eval_steps_per_second": 1.969,
      "step": 520
    },
    {
      "epoch": 0.20882584712371946,
      "grad_norm": 0.7705594897270203,
      "learning_rate": 1.8181818181818182e-05,
      "loss": 0.2785,
      "step": 530
    },
    {
      "epoch": 0.20882584712371946,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.16720621287822723,
      "eval_runtime": 641.3858,
      "eval_samples_per_second": 7.914,
      "eval_steps_per_second": 1.979,
      "step": 530
    },
    {
      "epoch": 0.2127659574468085,
      "grad_norm": 3.4368479251861572,
      "learning_rate": 9.090909090909091e-06,
      "loss": 0.2326,
      "step": 540
    },
    {
      "epoch": 0.2127659574468085,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.1670488715171814,
      "eval_runtime": 635.73,
      "eval_samples_per_second": 7.985,
      "eval_steps_per_second": 1.996,
      "step": 540
    },
    {
      "epoch": 0.21670606776989756,
      "grad_norm": 0.17799390852451324,
      "learning_rate": 0.0,
      "loss": 0.2048,
      "step": 550
    },
    {
      "epoch": 0.21670606776989756,
      "eval_accuracy": 0.8983451724052429,
      "eval_loss": 0.1670481413602829,
      "eval_runtime": 633.0482,
      "eval_samples_per_second": 8.018,
      "eval_steps_per_second": 2.005,
      "step": 550
    }
  ],
  "logging_steps": 10,
  "max_steps": 550,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.667515335043259e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}