| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.997827661115134, |
| "eval_steps": 500, |
| "global_step": 1035, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.02896451846488052, |
| "grad_norm": 1.9546706041531023, |
| "learning_rate": 5e-06, |
| "loss": 1.0679, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.05792903692976104, |
| "grad_norm": 5.4683536223610085, |
| "learning_rate": 5e-06, |
| "loss": 0.9613, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.08689355539464157, |
| "grad_norm": 1.6939976541654087, |
| "learning_rate": 5e-06, |
| "loss": 0.9223, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.11585807385952208, |
| "grad_norm": 1.0339182103747349, |
| "learning_rate": 5e-06, |
| "loss": 0.8975, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.14482259232440262, |
| "grad_norm": 0.9491704034528055, |
| "learning_rate": 5e-06, |
| "loss": 0.88, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.17378711078928313, |
| "grad_norm": 1.2085381768876486, |
| "learning_rate": 5e-06, |
| "loss": 0.8621, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.20275162925416365, |
| "grad_norm": 1.4117640839833099, |
| "learning_rate": 5e-06, |
| "loss": 0.8519, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.23171614771904417, |
| "grad_norm": 1.1514550106019426, |
| "learning_rate": 5e-06, |
| "loss": 0.8438, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.2606806661839247, |
| "grad_norm": 0.9243756138786047, |
| "learning_rate": 5e-06, |
| "loss": 0.8335, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.28964518464880523, |
| "grad_norm": 0.846545720738904, |
| "learning_rate": 5e-06, |
| "loss": 0.8269, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.31860970311368575, |
| "grad_norm": 0.5825470716035409, |
| "learning_rate": 5e-06, |
| "loss": 0.8194, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.34757422157856627, |
| "grad_norm": 0.7173313925098385, |
| "learning_rate": 5e-06, |
| "loss": 0.8195, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.3765387400434468, |
| "grad_norm": 0.7287411570445328, |
| "learning_rate": 5e-06, |
| "loss": 0.8156, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.4055032585083273, |
| "grad_norm": 0.7081477098458706, |
| "learning_rate": 5e-06, |
| "loss": 0.8116, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.4344677769732078, |
| "grad_norm": 0.8598043962323773, |
| "learning_rate": 5e-06, |
| "loss": 0.8135, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.46343229543808834, |
| "grad_norm": 0.5450111798416236, |
| "learning_rate": 5e-06, |
| "loss": 0.8074, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.49239681390296886, |
| "grad_norm": 0.939010922813204, |
| "learning_rate": 5e-06, |
| "loss": 0.8062, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.5213613323678494, |
| "grad_norm": 0.847543033032073, |
| "learning_rate": 5e-06, |
| "loss": 0.8029, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.5503258508327299, |
| "grad_norm": 0.8787853615954785, |
| "learning_rate": 5e-06, |
| "loss": 0.8006, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.5792903692976105, |
| "grad_norm": 1.0063082389466802, |
| "learning_rate": 5e-06, |
| "loss": 0.8009, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.6082548877624909, |
| "grad_norm": 0.6359141777605319, |
| "learning_rate": 5e-06, |
| "loss": 0.7989, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.6372194062273715, |
| "grad_norm": 0.6914513445032724, |
| "learning_rate": 5e-06, |
| "loss": 0.7942, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.666183924692252, |
| "grad_norm": 0.6878895702782952, |
| "learning_rate": 5e-06, |
| "loss": 0.7916, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.6951484431571325, |
| "grad_norm": 0.5660450961011149, |
| "learning_rate": 5e-06, |
| "loss": 0.7874, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.724112961622013, |
| "grad_norm": 0.5744944170750963, |
| "learning_rate": 5e-06, |
| "loss": 0.7853, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.7530774800868936, |
| "grad_norm": 0.8500170302127678, |
| "learning_rate": 5e-06, |
| "loss": 0.7868, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.782041998551774, |
| "grad_norm": 0.9550149697236748, |
| "learning_rate": 5e-06, |
| "loss": 0.7901, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.8110065170166546, |
| "grad_norm": 0.5546890467469614, |
| "learning_rate": 5e-06, |
| "loss": 0.7864, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.8399710354815351, |
| "grad_norm": 0.6331265595090024, |
| "learning_rate": 5e-06, |
| "loss": 0.788, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.8689355539464156, |
| "grad_norm": 0.6196206182624663, |
| "learning_rate": 5e-06, |
| "loss": 0.782, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.8979000724112962, |
| "grad_norm": 0.7124233685042775, |
| "learning_rate": 5e-06, |
| "loss": 0.7814, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.9268645908761767, |
| "grad_norm": 0.6281001430065685, |
| "learning_rate": 5e-06, |
| "loss": 0.7821, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.9558291093410572, |
| "grad_norm": 0.6288445991938771, |
| "learning_rate": 5e-06, |
| "loss": 0.7835, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.9847936278059377, |
| "grad_norm": 0.5963707636678538, |
| "learning_rate": 5e-06, |
| "loss": 0.785, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.999275887038378, |
| "eval_loss": 0.7745929956436157, |
| "eval_runtime": 243.4419, |
| "eval_samples_per_second": 38.21, |
| "eval_steps_per_second": 0.6, |
| "step": 345 |
| }, |
| { |
| "epoch": 1.0137581462708183, |
| "grad_norm": 0.887536909694853, |
| "learning_rate": 5e-06, |
| "loss": 0.8189, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.0427226647356989, |
| "grad_norm": 0.9746717296866173, |
| "learning_rate": 5e-06, |
| "loss": 0.7355, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.0716871832005792, |
| "grad_norm": 0.6469650977983997, |
| "learning_rate": 5e-06, |
| "loss": 0.734, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.1006517016654598, |
| "grad_norm": 0.7891671650419445, |
| "learning_rate": 5e-06, |
| "loss": 0.7328, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.1296162201303404, |
| "grad_norm": 0.6208155398032233, |
| "learning_rate": 5e-06, |
| "loss": 0.732, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.158580738595221, |
| "grad_norm": 0.6207300417186483, |
| "learning_rate": 5e-06, |
| "loss": 0.7297, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.1875452570601013, |
| "grad_norm": 0.6785729802747962, |
| "learning_rate": 5e-06, |
| "loss": 0.7341, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.2165097755249819, |
| "grad_norm": 0.8266342046421713, |
| "learning_rate": 5e-06, |
| "loss": 0.7323, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.2454742939898624, |
| "grad_norm": 0.9660234843621154, |
| "learning_rate": 5e-06, |
| "loss": 0.7306, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.274438812454743, |
| "grad_norm": 0.6935390944005595, |
| "learning_rate": 5e-06, |
| "loss": 0.7319, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.3034033309196236, |
| "grad_norm": 0.5941658732015132, |
| "learning_rate": 5e-06, |
| "loss": 0.7363, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.332367849384504, |
| "grad_norm": 0.590155807756852, |
| "learning_rate": 5e-06, |
| "loss": 0.7324, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.3613323678493845, |
| "grad_norm": 0.6352796068632335, |
| "learning_rate": 5e-06, |
| "loss": 0.7305, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.390296886314265, |
| "grad_norm": 0.6933255949901775, |
| "learning_rate": 5e-06, |
| "loss": 0.7335, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.4192614047791454, |
| "grad_norm": 0.5969978905166637, |
| "learning_rate": 5e-06, |
| "loss": 0.7331, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.448225923244026, |
| "grad_norm": 0.5822244996224603, |
| "learning_rate": 5e-06, |
| "loss": 0.7316, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.4771904417089066, |
| "grad_norm": 0.6473397313720373, |
| "learning_rate": 5e-06, |
| "loss": 0.7325, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.5061549601737871, |
| "grad_norm": 0.7379810742604224, |
| "learning_rate": 5e-06, |
| "loss": 0.7322, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.5351194786386677, |
| "grad_norm": 0.877777055252363, |
| "learning_rate": 5e-06, |
| "loss": 0.7267, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.5640839971035483, |
| "grad_norm": 0.702500798809481, |
| "learning_rate": 5e-06, |
| "loss": 0.7278, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.5930485155684286, |
| "grad_norm": 0.7082980837667808, |
| "learning_rate": 5e-06, |
| "loss": 0.7282, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.6220130340333092, |
| "grad_norm": 0.6193169422318507, |
| "learning_rate": 5e-06, |
| "loss": 0.7322, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.6509775524981896, |
| "grad_norm": 0.6349696489382688, |
| "learning_rate": 5e-06, |
| "loss": 0.7272, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.6799420709630701, |
| "grad_norm": 0.5808865809115293, |
| "learning_rate": 5e-06, |
| "loss": 0.7256, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.7089065894279507, |
| "grad_norm": 0.5965976052156886, |
| "learning_rate": 5e-06, |
| "loss": 0.7222, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.7378711078928313, |
| "grad_norm": 0.5710627722704887, |
| "learning_rate": 5e-06, |
| "loss": 0.7297, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.7668356263577119, |
| "grad_norm": 0.608816224452555, |
| "learning_rate": 5e-06, |
| "loss": 0.7262, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.7958001448225924, |
| "grad_norm": 0.6371768739478637, |
| "learning_rate": 5e-06, |
| "loss": 0.7303, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.824764663287473, |
| "grad_norm": 0.5358050463696646, |
| "learning_rate": 5e-06, |
| "loss": 0.7287, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.8537291817523533, |
| "grad_norm": 0.7036505719286567, |
| "learning_rate": 5e-06, |
| "loss": 0.7247, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.882693700217234, |
| "grad_norm": 0.6805629942635829, |
| "learning_rate": 5e-06, |
| "loss": 0.7294, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.9116582186821143, |
| "grad_norm": 0.6498078585112878, |
| "learning_rate": 5e-06, |
| "loss": 0.7289, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.9406227371469948, |
| "grad_norm": 0.5784132159085213, |
| "learning_rate": 5e-06, |
| "loss": 0.7296, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.9695872556118754, |
| "grad_norm": 0.5796673295424948, |
| "learning_rate": 5e-06, |
| "loss": 0.7277, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.998551774076756, |
| "grad_norm": 0.5936807614505102, |
| "learning_rate": 5e-06, |
| "loss": 0.72, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.998551774076756, |
| "eval_loss": 0.7591201663017273, |
| "eval_runtime": 239.3101, |
| "eval_samples_per_second": 38.87, |
| "eval_steps_per_second": 0.61, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.0275162925416366, |
| "grad_norm": 0.715729072917497, |
| "learning_rate": 5e-06, |
| "loss": 0.7321, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.056480811006517, |
| "grad_norm": 0.6144874116006872, |
| "learning_rate": 5e-06, |
| "loss": 0.6723, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.0854453294713977, |
| "grad_norm": 0.7303697849008477, |
| "learning_rate": 5e-06, |
| "loss": 0.6804, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.114409847936278, |
| "grad_norm": 0.68311827138117, |
| "learning_rate": 5e-06, |
| "loss": 0.6797, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.1433743664011584, |
| "grad_norm": 0.5545659750688543, |
| "learning_rate": 5e-06, |
| "loss": 0.6733, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.172338884866039, |
| "grad_norm": 0.5612265877265745, |
| "learning_rate": 5e-06, |
| "loss": 0.6807, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.2013034033309196, |
| "grad_norm": 0.6652869497470263, |
| "learning_rate": 5e-06, |
| "loss": 0.6836, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.2302679217958, |
| "grad_norm": 0.5387482606960555, |
| "learning_rate": 5e-06, |
| "loss": 0.6782, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.2592324402606807, |
| "grad_norm": 0.9270487907700882, |
| "learning_rate": 5e-06, |
| "loss": 0.6792, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.2881969587255613, |
| "grad_norm": 0.5940033865543676, |
| "learning_rate": 5e-06, |
| "loss": 0.6839, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.317161477190442, |
| "grad_norm": 0.6205717881980902, |
| "learning_rate": 5e-06, |
| "loss": 0.6793, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.3461259956553224, |
| "grad_norm": 0.7863817807933237, |
| "learning_rate": 5e-06, |
| "loss": 0.6771, |
| "step": 810 |
| }, |
| { |
| "epoch": 2.3750905141202026, |
| "grad_norm": 0.6565632670788291, |
| "learning_rate": 5e-06, |
| "loss": 0.6834, |
| "step": 820 |
| }, |
| { |
| "epoch": 2.404055032585083, |
| "grad_norm": 0.6373382549439205, |
| "learning_rate": 5e-06, |
| "loss": 0.6828, |
| "step": 830 |
| }, |
| { |
| "epoch": 2.4330195510499637, |
| "grad_norm": 0.6613391820459834, |
| "learning_rate": 5e-06, |
| "loss": 0.6835, |
| "step": 840 |
| }, |
| { |
| "epoch": 2.4619840695148443, |
| "grad_norm": 0.5400957321029153, |
| "learning_rate": 5e-06, |
| "loss": 0.6852, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.490948587979725, |
| "grad_norm": 0.6001602197216074, |
| "learning_rate": 5e-06, |
| "loss": 0.6813, |
| "step": 860 |
| }, |
| { |
| "epoch": 2.5199131064446054, |
| "grad_norm": 0.6758144590327261, |
| "learning_rate": 5e-06, |
| "loss": 0.6846, |
| "step": 870 |
| }, |
| { |
| "epoch": 2.548877624909486, |
| "grad_norm": 0.7279056638638517, |
| "learning_rate": 5e-06, |
| "loss": 0.6806, |
| "step": 880 |
| }, |
| { |
| "epoch": 2.577842143374366, |
| "grad_norm": 0.6530599852985071, |
| "learning_rate": 5e-06, |
| "loss": 0.6818, |
| "step": 890 |
| }, |
| { |
| "epoch": 2.606806661839247, |
| "grad_norm": 0.6780623323564912, |
| "learning_rate": 5e-06, |
| "loss": 0.6825, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.6357711803041273, |
| "grad_norm": 0.5837946750071207, |
| "learning_rate": 5e-06, |
| "loss": 0.6825, |
| "step": 910 |
| }, |
| { |
| "epoch": 2.664735698769008, |
| "grad_norm": 0.5602013963713363, |
| "learning_rate": 5e-06, |
| "loss": 0.6834, |
| "step": 920 |
| }, |
| { |
| "epoch": 2.6937002172338884, |
| "grad_norm": 0.7041458005241192, |
| "learning_rate": 5e-06, |
| "loss": 0.6808, |
| "step": 930 |
| }, |
| { |
| "epoch": 2.722664735698769, |
| "grad_norm": 0.6490957944655403, |
| "learning_rate": 5e-06, |
| "loss": 0.6844, |
| "step": 940 |
| }, |
| { |
| "epoch": 2.7516292541636496, |
| "grad_norm": 0.6599285105597698, |
| "learning_rate": 5e-06, |
| "loss": 0.6841, |
| "step": 950 |
| }, |
| { |
| "epoch": 2.78059377262853, |
| "grad_norm": 0.8783093936780623, |
| "learning_rate": 5e-06, |
| "loss": 0.6805, |
| "step": 960 |
| }, |
| { |
| "epoch": 2.8095582910934107, |
| "grad_norm": 0.6029880285970106, |
| "learning_rate": 5e-06, |
| "loss": 0.6836, |
| "step": 970 |
| }, |
| { |
| "epoch": 2.838522809558291, |
| "grad_norm": 0.6433012731687247, |
| "learning_rate": 5e-06, |
| "loss": 0.6802, |
| "step": 980 |
| }, |
| { |
| "epoch": 2.867487328023172, |
| "grad_norm": 0.6070387953228661, |
| "learning_rate": 5e-06, |
| "loss": 0.6779, |
| "step": 990 |
| }, |
| { |
| "epoch": 2.896451846488052, |
| "grad_norm": 0.5805918805506912, |
| "learning_rate": 5e-06, |
| "loss": 0.6815, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.9254163649529326, |
| "grad_norm": 0.6364227056673066, |
| "learning_rate": 5e-06, |
| "loss": 0.6828, |
| "step": 1010 |
| }, |
| { |
| "epoch": 2.954380883417813, |
| "grad_norm": 0.7905206793090467, |
| "learning_rate": 5e-06, |
| "loss": 0.6884, |
| "step": 1020 |
| }, |
| { |
| "epoch": 2.9833454018826937, |
| "grad_norm": 0.6878648180858722, |
| "learning_rate": 5e-06, |
| "loss": 0.685, |
| "step": 1030 |
| }, |
| { |
| "epoch": 2.997827661115134, |
| "eval_loss": 0.7575626373291016, |
| "eval_runtime": 234.038, |
| "eval_samples_per_second": 39.746, |
| "eval_steps_per_second": 0.624, |
| "step": 1035 |
| }, |
| { |
| "epoch": 2.997827661115134, |
| "step": 1035, |
| "total_flos": 1733454169374720.0, |
| "train_loss": 0.7466064549874568, |
| "train_runtime": 34327.0468, |
| "train_samples_per_second": 15.444, |
| "train_steps_per_second": 0.03 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1035, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1733454169374720.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|