{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.3169845594913716,
"eval_steps": 500,
"global_step": 4350,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009082652134423252,
"grad_norm": 2.5449585914611816,
"learning_rate": 4.9848622464426284e-05,
"loss": 8.7191,
"step": 30
},
{
"epoch": 0.018165304268846504,
"grad_norm": 2.3144371509552,
"learning_rate": 4.969724492885256e-05,
"loss": 7.4698,
"step": 60
},
{
"epoch": 0.027247956403269755,
"grad_norm": 2.304499626159668,
"learning_rate": 4.954586739327884e-05,
"loss": 6.5589,
"step": 90
},
{
"epoch": 0.03633060853769301,
"grad_norm": 2.272608757019043,
"learning_rate": 4.9394489857705115e-05,
"loss": 6.2425,
"step": 120
},
{
"epoch": 0.045413260672116255,
"grad_norm": 2.46329402923584,
"learning_rate": 4.9243112322131396e-05,
"loss": 6.1459,
"step": 150
},
{
"epoch": 0.05449591280653951,
"grad_norm": 1.8283530473709106,
"learning_rate": 4.909173478655768e-05,
"loss": 5.969,
"step": 180
},
{
"epoch": 0.06357856494096276,
"grad_norm": 2.1723110675811768,
"learning_rate": 4.894035725098395e-05,
"loss": 6.008,
"step": 210
},
{
"epoch": 0.07266121707538602,
"grad_norm": 2.5368807315826416,
"learning_rate": 4.878897971541024e-05,
"loss": 5.8783,
"step": 240
},
{
"epoch": 0.08174386920980926,
"grad_norm": 2.3222858905792236,
"learning_rate": 4.8637602179836515e-05,
"loss": 5.825,
"step": 270
},
{
"epoch": 0.09082652134423251,
"grad_norm": 2.557065010070801,
"learning_rate": 4.8486224644262796e-05,
"loss": 5.76,
"step": 300
},
{
"epoch": 0.09990917347865577,
"grad_norm": 2.4016597270965576,
"learning_rate": 4.833484710868907e-05,
"loss": 5.7039,
"step": 330
},
{
"epoch": 0.10899182561307902,
"grad_norm": 2.6895477771759033,
"learning_rate": 4.818346957311535e-05,
"loss": 5.5843,
"step": 360
},
{
"epoch": 0.11807447774750227,
"grad_norm": 2.741234064102173,
"learning_rate": 4.8032092037541634e-05,
"loss": 5.6376,
"step": 390
},
{
"epoch": 0.1271571298819255,
"grad_norm": 2.8266804218292236,
"learning_rate": 4.788071450196791e-05,
"loss": 5.5649,
"step": 420
},
{
"epoch": 0.1362397820163488,
"grad_norm": 2.792654275894165,
"learning_rate": 4.772933696639419e-05,
"loss": 5.3651,
"step": 450
},
{
"epoch": 0.14532243415077203,
"grad_norm": 2.7088894844055176,
"learning_rate": 4.757795943082047e-05,
"loss": 5.4921,
"step": 480
},
{
"epoch": 0.15440508628519528,
"grad_norm": 2.627201795578003,
"learning_rate": 4.7426581895246746e-05,
"loss": 5.461,
"step": 510
},
{
"epoch": 0.16348773841961853,
"grad_norm": 2.6373610496520996,
"learning_rate": 4.727520435967303e-05,
"loss": 5.3973,
"step": 540
},
{
"epoch": 0.17257039055404177,
"grad_norm": 2.772226095199585,
"learning_rate": 4.71238268240993e-05,
"loss": 5.3618,
"step": 570
},
{
"epoch": 0.18165304268846502,
"grad_norm": 2.6005172729492188,
"learning_rate": 4.6972449288525583e-05,
"loss": 5.4365,
"step": 600
},
{
"epoch": 0.1907356948228883,
"grad_norm": 4.7815260887146,
"learning_rate": 4.6821071752951865e-05,
"loss": 5.3225,
"step": 630
},
{
"epoch": 0.19981834695731154,
"grad_norm": 2.5871763229370117,
"learning_rate": 4.6669694217378146e-05,
"loss": 5.3615,
"step": 660
},
{
"epoch": 0.2089009990917348,
"grad_norm": 2.686840534210205,
"learning_rate": 4.651831668180443e-05,
"loss": 5.3201,
"step": 690
},
{
"epoch": 0.21798365122615804,
"grad_norm": 2.6963067054748535,
"learning_rate": 4.63669391462307e-05,
"loss": 5.1972,
"step": 720
},
{
"epoch": 0.22706630336058128,
"grad_norm": 2.9284744262695312,
"learning_rate": 4.6215561610656984e-05,
"loss": 5.3031,
"step": 750
},
{
"epoch": 0.23614895549500453,
"grad_norm": 2.7302122116088867,
"learning_rate": 4.606418407508326e-05,
"loss": 5.2057,
"step": 780
},
{
"epoch": 0.2452316076294278,
"grad_norm": 2.5760107040405273,
"learning_rate": 4.591280653950954e-05,
"loss": 5.1767,
"step": 810
},
{
"epoch": 0.254314259763851,
"grad_norm": 2.9804234504699707,
"learning_rate": 4.576142900393582e-05,
"loss": 5.1875,
"step": 840
},
{
"epoch": 0.2633969118982743,
"grad_norm": 3.311448812484741,
"learning_rate": 4.5610051468362096e-05,
"loss": 5.0712,
"step": 870
},
{
"epoch": 0.2724795640326976,
"grad_norm": 2.67448091506958,
"learning_rate": 4.545867393278838e-05,
"loss": 5.1241,
"step": 900
},
{
"epoch": 0.2815622161671208,
"grad_norm": 2.8352444171905518,
"learning_rate": 4.530729639721465e-05,
"loss": 5.1732,
"step": 930
},
{
"epoch": 0.29064486830154407,
"grad_norm": 2.5969910621643066,
"learning_rate": 4.515591886164093e-05,
"loss": 5.0828,
"step": 960
},
{
"epoch": 0.2997275204359673,
"grad_norm": 2.8792121410369873,
"learning_rate": 4.5004541326067215e-05,
"loss": 5.0844,
"step": 990
},
{
"epoch": 0.30881017257039056,
"grad_norm": 2.9506993293762207,
"learning_rate": 4.485316379049349e-05,
"loss": 5.1764,
"step": 1020
},
{
"epoch": 0.3178928247048138,
"grad_norm": 2.8818390369415283,
"learning_rate": 4.470178625491977e-05,
"loss": 5.0663,
"step": 1050
},
{
"epoch": 0.32697547683923706,
"grad_norm": 3.128511667251587,
"learning_rate": 4.4550408719346046e-05,
"loss": 5.1026,
"step": 1080
},
{
"epoch": 0.33605812897366033,
"grad_norm": 3.0155856609344482,
"learning_rate": 4.4399031183772334e-05,
"loss": 5.0686,
"step": 1110
},
{
"epoch": 0.34514078110808355,
"grad_norm": 2.811448097229004,
"learning_rate": 4.424765364819861e-05,
"loss": 5.0351,
"step": 1140
},
{
"epoch": 0.3542234332425068,
"grad_norm": 2.9916000366210938,
"learning_rate": 4.409627611262489e-05,
"loss": 5.1651,
"step": 1170
},
{
"epoch": 0.36330608537693004,
"grad_norm": 2.9689950942993164,
"learning_rate": 4.394489857705117e-05,
"loss": 5.1457,
"step": 1200
},
{
"epoch": 0.3723887375113533,
"grad_norm": 2.7896862030029297,
"learning_rate": 4.3793521041477446e-05,
"loss": 5.0049,
"step": 1230
},
{
"epoch": 0.3814713896457766,
"grad_norm": 2.790712833404541,
"learning_rate": 4.364214350590373e-05,
"loss": 4.9943,
"step": 1260
},
{
"epoch": 0.3905540417801998,
"grad_norm": 2.9977900981903076,
"learning_rate": 4.349076597033e-05,
"loss": 4.996,
"step": 1290
},
{
"epoch": 0.3996366939146231,
"grad_norm": 3.504183530807495,
"learning_rate": 4.333938843475628e-05,
"loss": 4.9611,
"step": 1320
},
{
"epoch": 0.4087193460490463,
"grad_norm": 2.737821578979492,
"learning_rate": 4.3188010899182565e-05,
"loss": 4.9541,
"step": 1350
},
{
"epoch": 0.4178019981834696,
"grad_norm": 3.0585217475891113,
"learning_rate": 4.303663336360884e-05,
"loss": 4.9014,
"step": 1380
},
{
"epoch": 0.4268846503178928,
"grad_norm": 3.004413604736328,
"learning_rate": 4.288525582803512e-05,
"loss": 4.9703,
"step": 1410
},
{
"epoch": 0.4359673024523161,
"grad_norm": 2.9328274726867676,
"learning_rate": 4.27338782924614e-05,
"loss": 4.9637,
"step": 1440
},
{
"epoch": 0.44504995458673935,
"grad_norm": 2.93721604347229,
"learning_rate": 4.258250075688768e-05,
"loss": 4.8024,
"step": 1470
},
{
"epoch": 0.45413260672116257,
"grad_norm": 3.0333001613616943,
"learning_rate": 4.243112322131396e-05,
"loss": 4.8555,
"step": 1500
},
{
"epoch": 0.46321525885558584,
"grad_norm": 3.3445775508880615,
"learning_rate": 4.227974568574024e-05,
"loss": 4.8035,
"step": 1530
},
{
"epoch": 0.47229791099000906,
"grad_norm": 2.9364359378814697,
"learning_rate": 4.212836815016652e-05,
"loss": 4.9296,
"step": 1560
},
{
"epoch": 0.48138056312443234,
"grad_norm": 2.755453586578369,
"learning_rate": 4.1976990614592796e-05,
"loss": 4.8051,
"step": 1590
},
{
"epoch": 0.4904632152588556,
"grad_norm": 3.0365066528320312,
"learning_rate": 4.182561307901908e-05,
"loss": 4.7833,
"step": 1620
},
{
"epoch": 0.49954586739327883,
"grad_norm": 3.2632575035095215,
"learning_rate": 4.167423554344536e-05,
"loss": 4.837,
"step": 1650
},
{
"epoch": 0.508628519527702,
"grad_norm": 3.310817003250122,
"learning_rate": 4.152285800787163e-05,
"loss": 4.7417,
"step": 1680
},
{
"epoch": 0.5177111716621253,
"grad_norm": 3.121156692504883,
"learning_rate": 4.1371480472297915e-05,
"loss": 4.7791,
"step": 1710
},
{
"epoch": 0.5267938237965486,
"grad_norm": 3.200591564178467,
"learning_rate": 4.122010293672419e-05,
"loss": 4.8619,
"step": 1740
},
{
"epoch": 0.5358764759309719,
"grad_norm": 3.1420202255249023,
"learning_rate": 4.106872540115047e-05,
"loss": 4.7576,
"step": 1770
},
{
"epoch": 0.5449591280653951,
"grad_norm": 3.2239160537719727,
"learning_rate": 4.091734786557675e-05,
"loss": 4.7767,
"step": 1800
},
{
"epoch": 0.5540417801998183,
"grad_norm": 2.9624414443969727,
"learning_rate": 4.076597033000303e-05,
"loss": 4.8608,
"step": 1830
},
{
"epoch": 0.5631244323342416,
"grad_norm": 3.14367938041687,
"learning_rate": 4.061459279442931e-05,
"loss": 4.7909,
"step": 1860
},
{
"epoch": 0.5722070844686649,
"grad_norm": 3.664564371109009,
"learning_rate": 4.046321525885558e-05,
"loss": 4.7325,
"step": 1890
},
{
"epoch": 0.5812897366030881,
"grad_norm": 2.9251296520233154,
"learning_rate": 4.0311837723281864e-05,
"loss": 4.8017,
"step": 1920
},
{
"epoch": 0.5903723887375113,
"grad_norm": 2.8796215057373047,
"learning_rate": 4.0160460187708146e-05,
"loss": 4.7124,
"step": 1950
},
{
"epoch": 0.5994550408719346,
"grad_norm": 3.0257513523101807,
"learning_rate": 4.000908265213443e-05,
"loss": 4.7311,
"step": 1980
},
{
"epoch": 0.6085376930063578,
"grad_norm": 3.096799612045288,
"learning_rate": 3.985770511656071e-05,
"loss": 4.6568,
"step": 2010
},
{
"epoch": 0.6176203451407811,
"grad_norm": 3.1430232524871826,
"learning_rate": 3.970632758098698e-05,
"loss": 4.6451,
"step": 2040
},
{
"epoch": 0.6267029972752044,
"grad_norm": 3.0216684341430664,
"learning_rate": 3.9554950045413265e-05,
"loss": 4.6565,
"step": 2070
},
{
"epoch": 0.6357856494096276,
"grad_norm": 3.0199525356292725,
"learning_rate": 3.940357250983954e-05,
"loss": 4.6988,
"step": 2100
},
{
"epoch": 0.6448683015440508,
"grad_norm": 2.9998953342437744,
"learning_rate": 3.925219497426582e-05,
"loss": 4.6654,
"step": 2130
},
{
"epoch": 0.6539509536784741,
"grad_norm": 3.15533447265625,
"learning_rate": 3.91008174386921e-05,
"loss": 4.616,
"step": 2160
},
{
"epoch": 0.6630336058128974,
"grad_norm": 2.8745930194854736,
"learning_rate": 3.894943990311838e-05,
"loss": 4.649,
"step": 2190
},
{
"epoch": 0.6721162579473207,
"grad_norm": 3.0759665966033936,
"learning_rate": 3.879806236754466e-05,
"loss": 4.6054,
"step": 2220
},
{
"epoch": 0.6811989100817438,
"grad_norm": 3.0508482456207275,
"learning_rate": 3.864668483197093e-05,
"loss": 4.4922,
"step": 2250
},
{
"epoch": 0.6902815622161671,
"grad_norm": 2.9260127544403076,
"learning_rate": 3.8495307296397214e-05,
"loss": 4.6469,
"step": 2280
},
{
"epoch": 0.6993642143505904,
"grad_norm": 2.924952268600464,
"learning_rate": 3.8343929760823496e-05,
"loss": 4.6164,
"step": 2310
},
{
"epoch": 0.7084468664850136,
"grad_norm": 3.056288480758667,
"learning_rate": 3.819255222524977e-05,
"loss": 4.5877,
"step": 2340
},
{
"epoch": 0.7175295186194369,
"grad_norm": 4.257227420806885,
"learning_rate": 3.804117468967605e-05,
"loss": 4.6301,
"step": 2370
},
{
"epoch": 0.7266121707538601,
"grad_norm": 3.282137155532837,
"learning_rate": 3.788979715410233e-05,
"loss": 4.4623,
"step": 2400
},
{
"epoch": 0.7356948228882834,
"grad_norm": 2.945059299468994,
"learning_rate": 3.7738419618528615e-05,
"loss": 4.6267,
"step": 2430
},
{
"epoch": 0.7447774750227066,
"grad_norm": 3.1374645233154297,
"learning_rate": 3.7587042082954896e-05,
"loss": 4.6835,
"step": 2460
},
{
"epoch": 0.7538601271571299,
"grad_norm": 3.21016001701355,
"learning_rate": 3.743566454738117e-05,
"loss": 4.5581,
"step": 2490
},
{
"epoch": 0.7629427792915532,
"grad_norm": 2.8072383403778076,
"learning_rate": 3.728428701180745e-05,
"loss": 4.571,
"step": 2520
},
{
"epoch": 0.7720254314259763,
"grad_norm": 2.9735002517700195,
"learning_rate": 3.713290947623373e-05,
"loss": 4.5013,
"step": 2550
},
{
"epoch": 0.7811080835603996,
"grad_norm": 3.182706832885742,
"learning_rate": 3.698153194066001e-05,
"loss": 4.534,
"step": 2580
},
{
"epoch": 0.7901907356948229,
"grad_norm": 2.958193778991699,
"learning_rate": 3.683015440508629e-05,
"loss": 4.5697,
"step": 2610
},
{
"epoch": 0.7992733878292462,
"grad_norm": 2.950946569442749,
"learning_rate": 3.6678776869512564e-05,
"loss": 4.6066,
"step": 2640
},
{
"epoch": 0.8083560399636693,
"grad_norm": 2.9701859951019287,
"learning_rate": 3.6527399333938846e-05,
"loss": 4.5934,
"step": 2670
},
{
"epoch": 0.8174386920980926,
"grad_norm": 3.2177681922912598,
"learning_rate": 3.637602179836512e-05,
"loss": 4.5418,
"step": 2700
},
{
"epoch": 0.8265213442325159,
"grad_norm": 2.7435505390167236,
"learning_rate": 3.62246442627914e-05,
"loss": 4.5485,
"step": 2730
},
{
"epoch": 0.8356039963669392,
"grad_norm": 3.4409849643707275,
"learning_rate": 3.607326672721768e-05,
"loss": 4.4268,
"step": 2760
},
{
"epoch": 0.8446866485013624,
"grad_norm": 3.803256034851074,
"learning_rate": 3.592188919164396e-05,
"loss": 4.5643,
"step": 2790
},
{
"epoch": 0.8537693006357856,
"grad_norm": 3.0399341583251953,
"learning_rate": 3.5770511656070246e-05,
"loss": 4.4783,
"step": 2820
},
{
"epoch": 0.8628519527702089,
"grad_norm": 2.9948980808258057,
"learning_rate": 3.561913412049652e-05,
"loss": 4.4929,
"step": 2850
},
{
"epoch": 0.8719346049046321,
"grad_norm": 3.400299549102783,
"learning_rate": 3.54677565849228e-05,
"loss": 4.4803,
"step": 2880
},
{
"epoch": 0.8810172570390554,
"grad_norm": 2.9282257556915283,
"learning_rate": 3.531637904934908e-05,
"loss": 4.4554,
"step": 2910
},
{
"epoch": 0.8900999091734787,
"grad_norm": 2.957598924636841,
"learning_rate": 3.516500151377536e-05,
"loss": 4.5324,
"step": 2940
},
{
"epoch": 0.8991825613079019,
"grad_norm": 2.9992153644561768,
"learning_rate": 3.501362397820164e-05,
"loss": 4.508,
"step": 2970
},
{
"epoch": 0.9082652134423251,
"grad_norm": 3.1509618759155273,
"learning_rate": 3.4862246442627914e-05,
"loss": 4.4265,
"step": 3000
},
{
"epoch": 0.9173478655767484,
"grad_norm": 3.027726888656616,
"learning_rate": 3.4710868907054196e-05,
"loss": 4.4979,
"step": 3030
},
{
"epoch": 0.9264305177111717,
"grad_norm": 3.0711803436279297,
"learning_rate": 3.455949137148047e-05,
"loss": 4.4946,
"step": 3060
},
{
"epoch": 0.935513169845595,
"grad_norm": 2.982269287109375,
"learning_rate": 3.440811383590675e-05,
"loss": 4.3433,
"step": 3090
},
{
"epoch": 0.9445958219800181,
"grad_norm": 2.9734480381011963,
"learning_rate": 3.425673630033303e-05,
"loss": 4.453,
"step": 3120
},
{
"epoch": 0.9536784741144414,
"grad_norm": 2.985030174255371,
"learning_rate": 3.410535876475931e-05,
"loss": 4.3705,
"step": 3150
},
{
"epoch": 0.9627611262488647,
"grad_norm": 3.1812829971313477,
"learning_rate": 3.395398122918559e-05,
"loss": 4.3414,
"step": 3180
},
{
"epoch": 0.971843778383288,
"grad_norm": 3.415923595428467,
"learning_rate": 3.380260369361187e-05,
"loss": 4.522,
"step": 3210
},
{
"epoch": 0.9809264305177112,
"grad_norm": 3.176737070083618,
"learning_rate": 3.3651226158038145e-05,
"loss": 4.4112,
"step": 3240
},
{
"epoch": 0.9900090826521344,
"grad_norm": 3.1306254863739014,
"learning_rate": 3.3499848622464433e-05,
"loss": 4.5104,
"step": 3270
},
{
"epoch": 0.9990917347865577,
"grad_norm": 3.216395616531372,
"learning_rate": 3.334847108689071e-05,
"loss": 4.3244,
"step": 3300
},
{
"epoch": 1.008174386920981,
"grad_norm": 3.1889307498931885,
"learning_rate": 3.319709355131699e-05,
"loss": 4.3521,
"step": 3330
},
{
"epoch": 1.017257039055404,
"grad_norm": 2.8001787662506104,
"learning_rate": 3.3045716015743264e-05,
"loss": 4.3047,
"step": 3360
},
{
"epoch": 1.0263396911898275,
"grad_norm": 3.5796685218811035,
"learning_rate": 3.2894338480169546e-05,
"loss": 4.1921,
"step": 3390
},
{
"epoch": 1.0354223433242506,
"grad_norm": 3.725538730621338,
"learning_rate": 3.274296094459583e-05,
"loss": 4.3203,
"step": 3420
},
{
"epoch": 1.044504995458674,
"grad_norm": 2.9058167934417725,
"learning_rate": 3.25915834090221e-05,
"loss": 4.385,
"step": 3450
},
{
"epoch": 1.0535876475930972,
"grad_norm": 3.120119333267212,
"learning_rate": 3.244020587344838e-05,
"loss": 4.2883,
"step": 3480
},
{
"epoch": 1.0626702997275204,
"grad_norm": 3.230036735534668,
"learning_rate": 3.228882833787466e-05,
"loss": 4.3602,
"step": 3510
},
{
"epoch": 1.0717529518619437,
"grad_norm": 3.482921600341797,
"learning_rate": 3.213745080230094e-05,
"loss": 4.3984,
"step": 3540
},
{
"epoch": 1.080835603996367,
"grad_norm": 3.0121572017669678,
"learning_rate": 3.198607326672722e-05,
"loss": 4.3864,
"step": 3570
},
{
"epoch": 1.0899182561307903,
"grad_norm": 3.277411460876465,
"learning_rate": 3.1834695731153495e-05,
"loss": 4.2294,
"step": 3600
},
{
"epoch": 1.0990009082652135,
"grad_norm": 3.0383167266845703,
"learning_rate": 3.168331819557978e-05,
"loss": 4.2759,
"step": 3630
},
{
"epoch": 1.1080835603996366,
"grad_norm": 3.3026745319366455,
"learning_rate": 3.153194066000605e-05,
"loss": 4.3093,
"step": 3660
},
{
"epoch": 1.11716621253406,
"grad_norm": 2.954747200012207,
"learning_rate": 3.138056312443234e-05,
"loss": 4.2476,
"step": 3690
},
{
"epoch": 1.1262488646684832,
"grad_norm": 3.2137765884399414,
"learning_rate": 3.1229185588858614e-05,
"loss": 4.2858,
"step": 3720
},
{
"epoch": 1.1353315168029066,
"grad_norm": 3.4028799533843994,
"learning_rate": 3.1077808053284896e-05,
"loss": 4.3652,
"step": 3750
},
{
"epoch": 1.1444141689373297,
"grad_norm": 3.0039563179016113,
"learning_rate": 3.092643051771118e-05,
"loss": 4.4106,
"step": 3780
},
{
"epoch": 1.1534968210717529,
"grad_norm": 2.973820209503174,
"learning_rate": 3.077505298213745e-05,
"loss": 4.1827,
"step": 3810
},
{
"epoch": 1.1625794732061763,
"grad_norm": 2.99037766456604,
"learning_rate": 3.062367544656373e-05,
"loss": 4.3092,
"step": 3840
},
{
"epoch": 1.1716621253405994,
"grad_norm": 3.181398391723633,
"learning_rate": 3.047229791099001e-05,
"loss": 4.417,
"step": 3870
},
{
"epoch": 1.1807447774750228,
"grad_norm": 3.1933484077453613,
"learning_rate": 3.032092037541629e-05,
"loss": 4.2361,
"step": 3900
},
{
"epoch": 1.189827429609446,
"grad_norm": 3.4427855014801025,
"learning_rate": 3.0169542839842567e-05,
"loss": 4.2687,
"step": 3930
},
{
"epoch": 1.1989100817438691,
"grad_norm": 3.0683298110961914,
"learning_rate": 3.001816530426885e-05,
"loss": 4.2748,
"step": 3960
},
{
"epoch": 1.2079927338782925,
"grad_norm": 3.044698715209961,
"learning_rate": 2.9866787768695127e-05,
"loss": 4.2671,
"step": 3990
},
{
"epoch": 1.2170753860127157,
"grad_norm": 3.1354904174804688,
"learning_rate": 2.9715410233121405e-05,
"loss": 4.2635,
"step": 4020
},
{
"epoch": 1.226158038147139,
"grad_norm": 3.282745361328125,
"learning_rate": 2.9564032697547683e-05,
"loss": 4.3544,
"step": 4050
},
{
"epoch": 1.2352406902815622,
"grad_norm": 3.369798183441162,
"learning_rate": 2.941265516197396e-05,
"loss": 4.1993,
"step": 4080
},
{
"epoch": 1.2443233424159854,
"grad_norm": 3.395785331726074,
"learning_rate": 2.9261277626400242e-05,
"loss": 4.1131,
"step": 4110
},
{
"epoch": 1.2534059945504088,
"grad_norm": 3.500697135925293,
"learning_rate": 2.9109900090826524e-05,
"loss": 4.192,
"step": 4140
},
{
"epoch": 1.262488646684832,
"grad_norm": 2.94278621673584,
"learning_rate": 2.8958522555252805e-05,
"loss": 4.2863,
"step": 4170
},
{
"epoch": 1.2715712988192553,
"grad_norm": 3.3217315673828125,
"learning_rate": 2.8807145019679083e-05,
"loss": 4.1763,
"step": 4200
},
{
"epoch": 1.2806539509536785,
"grad_norm": 3.232830762863159,
"learning_rate": 2.865576748410536e-05,
"loss": 4.2595,
"step": 4230
},
{
"epoch": 1.2897366030881017,
"grad_norm": 3.3042378425598145,
"learning_rate": 2.850438994853164e-05,
"loss": 4.2393,
"step": 4260
},
{
"epoch": 1.298819255222525,
"grad_norm": 3.83151912689209,
"learning_rate": 2.835301241295792e-05,
"loss": 4.3005,
"step": 4290
},
{
"epoch": 1.3079019073569482,
"grad_norm": 3.245086431503296,
"learning_rate": 2.82016348773842e-05,
"loss": 4.205,
"step": 4320
},
{
"epoch": 1.3169845594913716,
"grad_norm": 3.4392285346984863,
"learning_rate": 2.8050257341810477e-05,
"loss": 4.1964,
"step": 4350
}
],
"logging_steps": 30,
"max_steps": 9909,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 30,
"total_flos": 1136555016192000.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}