{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 1000,
  "global_step": 1110,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04504504504504504,
      "grad_norm": 3.3412909507751465,
      "learning_rate": 1.801801801801802e-05,
      "loss": 1.2791,
      "step": 10
    },
    {
      "epoch": 0.09009009009009009,
      "grad_norm": 1.4792481660842896,
      "learning_rate": 3.603603603603604e-05,
      "loss": 1.0799,
      "step": 20
    },
    {
      "epoch": 0.13513513513513514,
      "grad_norm": 1.3788402080535889,
      "learning_rate": 5.405405405405406e-05,
      "loss": 0.7657,
      "step": 30
    },
    {
      "epoch": 0.18018018018018017,
      "grad_norm": 0.7668061256408691,
      "learning_rate": 7.207207207207208e-05,
      "loss": 0.5807,
      "step": 40
    },
    {
      "epoch": 0.22522522522522523,
      "grad_norm": 0.7166613936424255,
      "learning_rate": 9.009009009009009e-05,
      "loss": 0.6035,
      "step": 50
    },
    {
      "epoch": 0.2702702702702703,
      "grad_norm": 0.9661350846290588,
      "learning_rate": 0.00010810810810810812,
      "loss": 0.5441,
      "step": 60
    },
    {
      "epoch": 0.3153153153153153,
      "grad_norm": 0.7341681122779846,
      "learning_rate": 0.00012612612612612612,
      "loss": 0.6031,
      "step": 70
    },
    {
      "epoch": 0.36036036036036034,
      "grad_norm": 1.3319752216339111,
      "learning_rate": 0.00014414414414414415,
      "loss": 0.5686,
      "step": 80
    },
    {
      "epoch": 0.40540540540540543,
      "grad_norm": 0.7269447445869446,
      "learning_rate": 0.00016216216216216218,
      "loss": 0.4978,
      "step": 90
    },
    {
      "epoch": 0.45045045045045046,
      "grad_norm": 0.46780362725257874,
      "learning_rate": 0.00018018018018018018,
      "loss": 0.473,
      "step": 100
    },
    {
      "epoch": 0.4954954954954955,
      "grad_norm": 0.728823184967041,
      "learning_rate": 0.0001981981981981982,
      "loss": 0.5892,
      "step": 110
    },
    {
      "epoch": 0.5405405405405406,
      "grad_norm": 0.5054831504821777,
      "learning_rate": 0.0001999599507118322,
      "loss": 0.5117,
      "step": 120
    },
    {
      "epoch": 0.5855855855855856,
      "grad_norm": 0.6193355321884155,
      "learning_rate": 0.00019982154991201608,
      "loss": 0.4759,
      "step": 130
    },
    {
      "epoch": 0.6306306306306306,
      "grad_norm": 0.3995501399040222,
      "learning_rate": 0.00019958443999073397,
      "loss": 0.3928,
      "step": 140
    },
    {
      "epoch": 0.6756756756756757,
      "grad_norm": 0.5497131943702698,
      "learning_rate": 0.0001992488554155135,
      "loss": 0.4507,
      "step": 150
    },
    {
      "epoch": 0.7207207207207207,
      "grad_norm": 1.0290075540542603,
      "learning_rate": 0.00019881512803111796,
      "loss": 0.4766,
      "step": 160
    },
    {
      "epoch": 0.7657657657657657,
      "grad_norm": 0.6826834678649902,
      "learning_rate": 0.00019828368673139947,
      "loss": 0.5236,
      "step": 170
    },
    {
      "epoch": 0.8108108108108109,
      "grad_norm": 0.6135373711585999,
      "learning_rate": 0.00019765505703518496,
      "loss": 0.4454,
      "step": 180
    },
    {
      "epoch": 0.8558558558558559,
      "grad_norm": 0.6352598667144775,
      "learning_rate": 0.00019692986056661356,
      "loss": 0.508,
      "step": 190
    },
    {
      "epoch": 0.9009009009009009,
      "grad_norm": 0.5680545568466187,
      "learning_rate": 0.0001961088144404403,
      "loss": 0.5896,
      "step": 200
    },
    {
      "epoch": 0.9459459459459459,
      "grad_norm": 0.3386252820491791,
      "learning_rate": 0.00019519273055291266,
      "loss": 0.4729,
      "step": 210
    },
    {
      "epoch": 0.990990990990991,
      "grad_norm": 0.358553409576416,
      "learning_rate": 0.0001941825147789225,
      "loss": 0.45,
      "step": 220
    },
    {
      "epoch": 1.0360360360360361,
      "grad_norm": 0.668021023273468,
      "learning_rate": 0.0001930791660762262,
      "loss": 0.4162,
      "step": 230
    },
    {
      "epoch": 1.0810810810810811,
      "grad_norm": 0.5463367700576782,
      "learning_rate": 0.00019188377549761963,
      "loss": 0.4445,
      "step": 240
    },
    {
      "epoch": 1.1261261261261262,
      "grad_norm": 0.7385880351066589,
      "learning_rate": 0.000190597525112044,
      "loss": 0.3849,
      "step": 250
    },
    {
      "epoch": 1.1711711711711712,
      "grad_norm": 0.6837536692619324,
      "learning_rate": 0.0001892216868356904,
      "loss": 0.4652,
      "step": 260
    },
    {
      "epoch": 1.2162162162162162,
      "grad_norm": 0.866578221321106,
      "learning_rate": 0.00018775762117425777,
      "loss": 0.4648,
      "step": 270
    },
    {
      "epoch": 1.2612612612612613,
      "grad_norm": 0.6583455204963684,
      "learning_rate": 0.00018620677587760916,
      "loss": 0.3848,
      "step": 280
    },
    {
      "epoch": 1.3063063063063063,
      "grad_norm": 0.6937561631202698,
      "learning_rate": 0.00018457068450815562,
      "loss": 0.4532,
      "step": 290
    },
    {
      "epoch": 1.3513513513513513,
      "grad_norm": 0.5930050611495972,
      "learning_rate": 0.00018285096492438424,
      "loss": 0.5282,
      "step": 300
    },
    {
      "epoch": 1.3963963963963963,
      "grad_norm": 1.1432991027832031,
      "learning_rate": 0.0001810493176810292,
      "loss": 0.4369,
      "step": 310
    },
    {
      "epoch": 1.4414414414414414,
      "grad_norm": 0.5736434459686279,
      "learning_rate": 0.00017916752434746856,
      "loss": 0.4434,
      "step": 320
    },
    {
      "epoch": 1.4864864864864864,
      "grad_norm": 0.5792914032936096,
      "learning_rate": 0.00017720744574600863,
      "loss": 0.4434,
      "step": 330
    },
    {
      "epoch": 1.5315315315315314,
      "grad_norm": 0.7626290917396545,
      "learning_rate": 0.00017517102011179933,
      "loss": 0.4226,
      "step": 340
    },
    {
      "epoch": 1.5765765765765765,
      "grad_norm": 0.6746386289596558,
      "learning_rate": 0.00017306026117619889,
      "loss": 0.4126,
      "step": 350
    },
    {
      "epoch": 1.6216216216216215,
      "grad_norm": 0.5064342617988586,
      "learning_rate": 0.00017087725617548385,
      "loss": 0.3926,
      "step": 360
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.793991208076477,
      "learning_rate": 0.0001686241637868734,
      "loss": 0.4437,
      "step": 370
    },
    {
      "epoch": 1.7117117117117115,
      "grad_norm": 0.6197868585586548,
      "learning_rate": 0.00016630321199390867,
      "loss": 0.3932,
      "step": 380
    },
    {
      "epoch": 1.7567567567567568,
      "grad_norm": 0.5004612803459167,
      "learning_rate": 0.0001639166958832985,
      "loss": 0.3883,
      "step": 390
    },
    {
      "epoch": 1.8018018018018018,
      "grad_norm": 0.7265865206718445,
      "learning_rate": 0.00016146697537540924,
      "loss": 0.4453,
      "step": 400
    },
    {
      "epoch": 1.8468468468468469,
      "grad_norm": 0.5155379772186279,
      "learning_rate": 0.00015895647289064396,
      "loss": 0.48,
      "step": 410
    },
    {
      "epoch": 1.8918918918918919,
      "grad_norm": 0.5756716132164001,
      "learning_rate": 0.0001563876709540178,
      "loss": 0.4874,
      "step": 420
    },
    {
      "epoch": 1.936936936936937,
      "grad_norm": 0.7919459342956543,
      "learning_rate": 0.00015376310974029873,
      "loss": 0.4075,
      "step": 430
    },
    {
      "epoch": 1.981981981981982,
      "grad_norm": 0.5977569818496704,
      "learning_rate": 0.0001510853845621409,
      "loss": 0.504,
      "step": 440
    },
    {
      "epoch": 2.027027027027027,
      "grad_norm": 0.601466178894043,
      "learning_rate": 0.00014835714330369446,
      "loss": 0.3732,
      "step": 450
    },
    {
      "epoch": 2.0720720720720722,
      "grad_norm": 0.575406014919281,
      "learning_rate": 0.00014558108380223012,
      "loss": 0.3317,
      "step": 460
    },
    {
      "epoch": 2.1171171171171173,
      "grad_norm": 1.0440267324447632,
      "learning_rate": 0.00014275995118036693,
      "loss": 0.3896,
      "step": 470
    },
    {
      "epoch": 2.1621621621621623,
      "grad_norm": 0.6646713614463806,
      "learning_rate": 0.00013989653513154165,
      "loss": 0.3478,
      "step": 480
    },
    {
      "epoch": 2.2072072072072073,
      "grad_norm": 0.7202288508415222,
      "learning_rate": 0.00013699366716140435,
      "loss": 0.3712,
      "step": 490
    },
    {
      "epoch": 2.2522522522522523,
      "grad_norm": 0.6566169261932373,
      "learning_rate": 0.00013405421778786737,
      "loss": 0.3548,
      "step": 500
    },
    {
      "epoch": 2.2972972972972974,
      "grad_norm": 1.0158584117889404,
      "learning_rate": 0.00013108109370257712,
      "loss": 0.3404,
      "step": 510
    },
    {
      "epoch": 2.3423423423423424,
      "grad_norm": 0.7582752108573914,
      "learning_rate": 0.00012807723489661495,
      "loss": 0.374,
      "step": 520
    },
    {
      "epoch": 2.3873873873873874,
      "grad_norm": 0.7685467600822449,
      "learning_rate": 0.00012504561175326985,
      "loss": 0.3127,
      "step": 530
    },
    {
      "epoch": 2.4324324324324325,
      "grad_norm": 0.9077286124229431,
      "learning_rate": 0.00012198922211075778,
      "loss": 0.353,
      "step": 540
    },
    {
      "epoch": 2.4774774774774775,
      "grad_norm": 0.9107437133789062,
      "learning_rate": 0.00011891108829779165,
      "loss": 0.3531,
      "step": 550
    },
    {
      "epoch": 2.5225225225225225,
      "grad_norm": 1.1385325193405151,
      "learning_rate": 0.0001158142541449341,
      "loss": 0.3695,
      "step": 560
    },
    {
      "epoch": 2.5675675675675675,
      "grad_norm": 0.9225629568099976,
      "learning_rate": 0.00011270178197468789,
      "loss": 0.3606,
      "step": 570
    },
    {
      "epoch": 2.6126126126126126,
      "grad_norm": 0.6338076591491699,
      "learning_rate": 0.00010957674957330042,
      "loss": 0.312,
      "step": 580
    },
    {
      "epoch": 2.6576576576576576,
      "grad_norm": 1.3998136520385742,
      "learning_rate": 0.00010644224714727681,
      "loss": 0.4027,
      "step": 590
    },
    {
      "epoch": 2.7027027027027026,
      "grad_norm": 0.598822832107544,
      "learning_rate": 0.00010330137426761135,
      "loss": 0.3496,
      "step": 600
    },
    {
      "epoch": 2.7477477477477477,
      "grad_norm": 0.9068642854690552,
      "learning_rate": 0.00010015723680475846,
      "loss": 0.3489,
      "step": 610
    },
    {
      "epoch": 2.7927927927927927,
      "grad_norm": 0.4025176167488098,
      "learning_rate": 9.70129438573747e-05,
      "loss": 0.3296,
      "step": 620
    },
    {
      "epoch": 2.8378378378378377,
      "grad_norm": 0.6708613634109497,
      "learning_rate": 9.38716046778684e-05,
      "loss": 0.3004,
      "step": 630
    },
    {
      "epoch": 2.8828828828828827,
      "grad_norm": 0.7858556509017944,
      "learning_rate": 9.07363255977973e-05,
      "loss": 0.3716,
      "step": 640
    },
    {
      "epoch": 2.9279279279279278,
      "grad_norm": 0.6855165958404541,
      "learning_rate": 8.76102069561545e-05,
      "loss": 0.311,
      "step": 650
    },
    {
      "epoch": 2.972972972972973,
      "grad_norm": 0.6526620388031006,
      "learning_rate": 8.449634003358022e-05,
      "loss": 0.3488,
      "step": 660
    },
    {
      "epoch": 3.018018018018018,
      "grad_norm": 0.3698066174983978,
      "learning_rate": 8.13978039955308e-05,
      "loss": 0.2858,
      "step": 670
    },
    {
      "epoch": 3.063063063063063,
      "grad_norm": 0.8586738705635071,
      "learning_rate": 7.831766284742807e-05,
      "loss": 0.2565,
      "step": 680
    },
    {
      "epoch": 3.108108108108108,
      "grad_norm": 0.8718597292900085,
      "learning_rate": 7.525896240479976e-05,
      "loss": 0.2173,
      "step": 690
    },
    {
      "epoch": 3.153153153153153,
      "grad_norm": 0.7671772241592407,
      "learning_rate": 7.222472728140695e-05,
      "loss": 0.2548,
      "step": 700
    },
    {
      "epoch": 3.1981981981981984,
      "grad_norm": 1.2702572345733643,
      "learning_rate": 6.921795789833723e-05,
      "loss": 0.2638,
      "step": 710
    },
    {
      "epoch": 3.2432432432432434,
      "grad_norm": 1.4898873567581177,
      "learning_rate": 6.624162751702076e-05,
      "loss": 0.2623,
      "step": 720
    },
    {
      "epoch": 3.2882882882882885,
      "grad_norm": 1.0137726068496704,
      "learning_rate": 6.329867929910347e-05,
      "loss": 0.2938,
      "step": 730
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 0.9631416201591492,
      "learning_rate": 6.039202339608432e-05,
      "loss": 0.2443,
      "step": 740
    },
    {
      "epoch": 3.3783783783783785,
      "grad_norm": 0.8912140130996704,
      "learning_rate": 5.752453407159522e-05,
      "loss": 0.2359,
      "step": 750
    },
    {
      "epoch": 3.4234234234234235,
      "grad_norm": 0.9686083793640137,
      "learning_rate": 5.469904685916861e-05,
      "loss": 0.2465,
      "step": 760
    },
    {
      "epoch": 3.4684684684684686,
      "grad_norm": 1.942658543586731,
      "learning_rate": 5.191835575830352e-05,
      "loss": 0.3042,
      "step": 770
    },
    {
      "epoch": 3.5135135135135136,
      "grad_norm": 1.2755067348480225,
      "learning_rate": 4.918521047160308e-05,
      "loss": 0.2885,
      "step": 780
    },
    {
      "epoch": 3.5585585585585586,
      "grad_norm": 0.8992679715156555,
      "learning_rate": 4.650231368571486e-05,
      "loss": 0.2728,
      "step": 790
    },
    {
      "epoch": 3.6036036036036037,
      "grad_norm": 2.1154401302337646,
      "learning_rate": 4.387231839876349e-05,
      "loss": 0.258,
      "step": 800
    },
    {
      "epoch": 3.6486486486486487,
      "grad_norm": 0.6957826018333435,
      "learning_rate": 4.129782529691815e-05,
      "loss": 0.3219,
      "step": 810
    },
    {
      "epoch": 3.6936936936936937,
      "grad_norm": 0.8093072175979614,
      "learning_rate": 3.878138018268866e-05,
      "loss": 0.2318,
      "step": 820
    },
    {
      "epoch": 3.7387387387387387,
      "grad_norm": 0.900164008140564,
      "learning_rate": 3.632547145749395e-05,
      "loss": 0.3025,
      "step": 830
    },
    {
      "epoch": 3.7837837837837838,
      "grad_norm": 1.3732051849365234,
      "learning_rate": 3.393252766099187e-05,
      "loss": 0.2744,
      "step": 840
    },
    {
      "epoch": 3.828828828828829,
      "grad_norm": 1.3438997268676758,
      "learning_rate": 3.1604915069603436e-05,
      "loss": 0.2663,
      "step": 850
    },
    {
      "epoch": 3.873873873873874,
      "grad_norm": 0.7277224063873291,
      "learning_rate": 2.9344935356606773e-05,
      "loss": 0.2058,
      "step": 860
    },
    {
      "epoch": 3.918918918918919,
      "grad_norm": 0.9671199321746826,
      "learning_rate": 2.7154823316113932e-05,
      "loss": 0.2466,
      "step": 870
    },
    {
      "epoch": 3.963963963963964,
      "grad_norm": 1.0856068134307861,
      "learning_rate": 2.5036744653181753e-05,
      "loss": 0.2695,
      "step": 880
    },
    {
      "epoch": 4.009009009009009,
      "grad_norm": 0.7191686034202576,
      "learning_rate": 2.29927938422419e-05,
      "loss": 0.2252,
      "step": 890
    },
    {
      "epoch": 4.054054054054054,
      "grad_norm": 0.9094095826148987,
      "learning_rate": 2.102499205596743e-05,
      "loss": 0.2067,
      "step": 900
    },
    {
      "epoch": 4.099099099099099,
      "grad_norm": 1.2016669511795044,
      "learning_rate": 1.913528516662452e-05,
      "loss": 0.2165,
      "step": 910
    },
    {
      "epoch": 4.1441441441441444,
      "grad_norm": 1.6922552585601807,
      "learning_rate": 1.7325541821885384e-05,
      "loss": 0.2102,
      "step": 920
    },
    {
      "epoch": 4.1891891891891895,
      "grad_norm": 1.52359139919281,
      "learning_rate": 1.5597551597004966e-05,
      "loss": 0.1765,
      "step": 930
    },
    {
      "epoch": 4.2342342342342345,
      "grad_norm": 1.333765983581543,
      "learning_rate": 1.3953023225189243e-05,
      "loss": 0.2147,
      "step": 940
    },
    {
      "epoch": 4.2792792792792795,
      "grad_norm": 0.9832772016525269,
      "learning_rate": 1.23935829079042e-05,
      "loss": 0.2068,
      "step": 950
    },
    {
      "epoch": 4.324324324324325,
      "grad_norm": 0.7258216738700867,
      "learning_rate": 1.0920772706797167e-05,
      "loss": 0.1884,
      "step": 960
    },
    {
      "epoch": 4.36936936936937,
      "grad_norm": 1.0229756832122803,
      "learning_rate": 9.536049018820192e-06,
      "loss": 0.2135,
      "step": 970
    },
    {
      "epoch": 4.414414414414415,
      "grad_norm": 1.0085179805755615,
      "learning_rate": 8.240781136063346e-06,
      "loss": 0.1831,
      "step": 980
    },
    {
      "epoch": 4.45945945945946,
      "grad_norm": 0.7446288466453552,
      "learning_rate": 7.03624989172228e-06,
      "loss": 0.198,
      "step": 990
    },
    {
      "epoch": 4.504504504504505,
      "grad_norm": 0.8291650414466858,
      "learning_rate": 5.9236463935389065e-06,
      "loss": 0.2189,
      "step": 1000
    },
    {
      "epoch": 4.504504504504505,
      "eval_loss": 0.9809222221374512,
      "eval_runtime": 10.6739,
      "eval_samples_per_second": 35.039,
      "eval_steps_per_second": 4.403,
      "step": 1000
    },
    {
      "epoch": 4.54954954954955,
      "grad_norm": 1.1298563480377197,
      "learning_rate": 4.904070845967468e-06,
      "loss": 0.1889,
      "step": 1010
    },
    {
      "epoch": 4.594594594594595,
      "grad_norm": 1.0232703685760498,
      "learning_rate": 3.9785314622310495e-06,
      "loss": 0.1891,
      "step": 1020
    },
    {
      "epoch": 4.63963963963964,
      "grad_norm": 1.2712104320526123,
      "learning_rate": 3.1479434673440167e-06,
      "loss": 0.1879,
      "step": 1030
    },
    {
      "epoch": 4.684684684684685,
      "grad_norm": 1.564489722251892,
      "learning_rate": 2.4131281930864002e-06,
      "loss": 0.1972,
      "step": 1040
    },
    {
      "epoch": 4.72972972972973,
      "grad_norm": 1.4100459814071655,
      "learning_rate": 1.7748122658251876e-06,
      "loss": 0.201,
      "step": 1050
    },
    {
      "epoch": 4.774774774774775,
      "grad_norm": 1.3149417638778687,
      "learning_rate": 1.2336268879856727e-06,
      "loss": 0.1876,
      "step": 1060
    },
    {
      "epoch": 4.81981981981982,
      "grad_norm": 0.8505904674530029,
      "learning_rate": 7.901072138831511e-07,
      "loss": 0.1722,
      "step": 1070
    },
    {
      "epoch": 4.864864864864865,
      "grad_norm": 2.1957037448883057,
      "learning_rate": 4.44691820532539e-07,
      "loss": 0.1917,
      "step": 1080
    },
    {
      "epoch": 4.90990990990991,
      "grad_norm": 1.9867583513259888,
      "learning_rate": 1.977222739588891e-07,
      "loss": 0.2082,
      "step": 1090
    },
    {
      "epoch": 4.954954954954955,
      "grad_norm": 1.539480447769165,
      "learning_rate": 4.9442791437848136e-08,
      "loss": 0.2052,
      "step": 1100
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.6789027452468872,
      "learning_rate": 0.0,
      "loss": 0.197,
      "step": 1110
    },
    {
      "epoch": 5.0,
      "step": 1110,
      "total_flos": 1.01086802968209e+18,
      "train_loss": 0.36593411194311604,
      "train_runtime": 2755.6936,
      "train_samples_per_second": 12.872,
      "train_steps_per_second": 0.403
    }
  ],
  "logging_steps": 10,
  "max_steps": 1110,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.01086802968209e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}