{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.995634549423137, "eval_steps": 100, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012472715933894606, "grad_norm": 6.119478225708008, "learning_rate": 3.75e-05, "loss": 36.8721, "step": 5 }, { "epoch": 0.024945431867789213, "grad_norm": 2.9732778072357178, "learning_rate": 7.5e-05, "loss": 33.0439, "step": 10 }, { "epoch": 0.037418147801683815, "grad_norm": 1.5332609415054321, "learning_rate": 0.0001125, "loss": 30.1165, "step": 15 }, { "epoch": 0.049890863735578425, "grad_norm": 1.2270578145980835, "learning_rate": 0.00015, "loss": 28.647, "step": 20 }, { "epoch": 0.06236357966947303, "grad_norm": 1.053142786026001, "learning_rate": 0.00018749999999999998, "loss": 26.0629, "step": 25 }, { "epoch": 0.07483629560336763, "grad_norm": 1.0131248235702515, "learning_rate": 0.000225, "loss": 23.8703, "step": 30 }, { "epoch": 0.08730901153726224, "grad_norm": 0.9197985529899597, "learning_rate": 0.0002625, "loss": 21.521, "step": 35 }, { "epoch": 0.09978172747115685, "grad_norm": 1.0926002264022827, "learning_rate": 0.0003, "loss": 19.8433, "step": 40 }, { "epoch": 0.11225444340505145, "grad_norm": 0.7152827382087708, "learning_rate": 0.0003, "loss": 18.618, "step": 45 }, { "epoch": 0.12472715933894606, "grad_norm": 0.6178381443023682, "learning_rate": 0.0003, "loss": 17.3644, "step": 50 }, { "epoch": 0.13719987527284067, "grad_norm": 0.48063215613365173, "learning_rate": 0.0003, "loss": 16.6105, "step": 55 }, { "epoch": 0.14967259120673526, "grad_norm": 0.46090102195739746, "learning_rate": 0.0003, "loss": 16.2326, "step": 60 }, { "epoch": 0.16214530714062989, "grad_norm": 0.4266461730003357, "learning_rate": 0.0003, "loss": 15.8385, "step": 65 }, { "epoch": 0.17461802307452448, "grad_norm": 0.3876805901527405, "learning_rate": 0.0003, "loss": 15.3119, "step": 70 }, { "epoch": 0.18709073900841908, "grad_norm": 0.3796117603778839, "learning_rate": 0.0003, "loss": 15.2481, "step": 75 }, { "epoch": 0.1995634549423137, "grad_norm": 0.37646082043647766, "learning_rate": 0.0003, "loss": 14.7319, "step": 80 }, { "epoch": 0.2120361708762083, "grad_norm": 0.3688748776912689, "learning_rate": 0.0003, "loss": 14.6364, "step": 85 }, { "epoch": 0.2245088868101029, "grad_norm": 0.37435677647590637, "learning_rate": 0.0003, "loss": 14.2134, "step": 90 }, { "epoch": 0.23698160274399752, "grad_norm": 0.36440223455429077, "learning_rate": 0.0003, "loss": 13.9198, "step": 95 }, { "epoch": 0.2494543186778921, "grad_norm": 0.33530500531196594, "learning_rate": 0.0003, "loss": 13.6044, "step": 100 }, { "epoch": 0.2494543186778921, "eval_accuracy": 0.007913978494623657, "eval_loss": 12.544113159179688, "eval_runtime": 18.5829, "eval_samples_per_second": 13.453, "eval_steps_per_second": 3.39, "step": 100 }, { "epoch": 0.26192703461178674, "grad_norm": 0.3251523971557617, "learning_rate": 0.0003, "loss": 13.3181, "step": 105 }, { "epoch": 0.27439975054568133, "grad_norm": 0.3473041355609894, "learning_rate": 0.0003, "loss": 12.9976, "step": 110 }, { "epoch": 0.2868724664795759, "grad_norm": 0.3266255557537079, "learning_rate": 0.0003, "loss": 12.7667, "step": 115 }, { "epoch": 0.2993451824134705, "grad_norm": 0.35194671154022217, "learning_rate": 0.0003, "loss": 12.7544, "step": 120 }, { "epoch": 0.3118178983473651, "grad_norm": 0.34635770320892334, "learning_rate": 0.0003, "loss": 12.2756, "step": 125 }, { "epoch": 0.32429061428125977, "grad_norm": 0.3480404019355774, "learning_rate": 0.0003, "loss": 12.1192, "step": 130 }, { "epoch": 0.33676333021515437, "grad_norm": 0.3309994339942932, "learning_rate": 0.0003, "loss": 11.8339, "step": 135 }, { "epoch": 0.34923604614904896, "grad_norm": 0.33558282256126404, "learning_rate": 0.0003, "loss": 11.6745, "step": 140 }, { "epoch": 0.36170876208294356, "grad_norm": 0.3359847664833069, "learning_rate": 0.0003, "loss": 11.3363, "step": 145 }, { "epoch": 0.37418147801683815, "grad_norm": 0.33947232365608215, "learning_rate": 0.0003, "loss": 11.0303, "step": 150 }, { "epoch": 0.38665419395073275, "grad_norm": 0.32984089851379395, "learning_rate": 0.0003, "loss": 10.9271, "step": 155 }, { "epoch": 0.3991269098846274, "grad_norm": 0.3498048782348633, "learning_rate": 0.0003, "loss": 10.6215, "step": 160 }, { "epoch": 0.411599625818522, "grad_norm": 0.354889839887619, "learning_rate": 0.0003, "loss": 10.5165, "step": 165 }, { "epoch": 0.4240723417524166, "grad_norm": 0.34426406025886536, "learning_rate": 0.0003, "loss": 10.0716, "step": 170 }, { "epoch": 0.4365450576863112, "grad_norm": 0.34653356671333313, "learning_rate": 0.0003, "loss": 10.0709, "step": 175 }, { "epoch": 0.4490177736202058, "grad_norm": 0.3454643189907074, "learning_rate": 0.0003, "loss": 9.7226, "step": 180 }, { "epoch": 0.4614904895541004, "grad_norm": 0.3724479377269745, "learning_rate": 0.0003, "loss": 9.5827, "step": 185 }, { "epoch": 0.47396320548799503, "grad_norm": 0.37687671184539795, "learning_rate": 0.0003, "loss": 9.3702, "step": 190 }, { "epoch": 0.4864359214218896, "grad_norm": 0.3670942187309265, "learning_rate": 0.0003, "loss": 9.2377, "step": 195 }, { "epoch": 0.4989086373557842, "grad_norm": 0.3864516019821167, "learning_rate": 0.0003, "loss": 8.9524, "step": 200 }, { "epoch": 0.4989086373557842, "eval_accuracy": 0.04734701857282502, "eval_loss": 8.425415992736816, "eval_runtime": 17.9427, "eval_samples_per_second": 13.933, "eval_steps_per_second": 3.511, "step": 200 }, { "epoch": 0.5113813532896788, "grad_norm": 0.3540992736816406, "learning_rate": 0.0003, "loss": 8.9811, "step": 205 }, { "epoch": 0.5238540692235735, "grad_norm": 0.35756129026412964, "learning_rate": 0.0003, "loss": 8.6522, "step": 210 }, { "epoch": 0.536326785157468, "grad_norm": 0.38473081588745117, "learning_rate": 0.0003, "loss": 8.6516, "step": 215 }, { "epoch": 0.5487995010913627, "grad_norm": 0.3616325259208679, "learning_rate": 0.0003, "loss": 8.5213, "step": 220 }, { "epoch": 0.5612722170252572, "grad_norm": 0.375959187746048, "learning_rate": 0.0003, "loss": 8.3109, "step": 225 }, { "epoch": 0.5737449329591519, "grad_norm": 0.38421833515167236, "learning_rate": 0.0003, "loss": 8.2747, "step": 230 }, { "epoch": 0.5862176488930465, "grad_norm": 0.379168301820755, "learning_rate": 0.0003, "loss": 8.197, "step": 235 }, { "epoch": 0.598690364826941, "grad_norm": 0.39803043007850647, "learning_rate": 0.0003, "loss": 8.0836, "step": 240 }, { "epoch": 0.6111630807608357, "grad_norm": 0.41287195682525635, "learning_rate": 0.0003, "loss": 7.9406, "step": 245 }, { "epoch": 0.6236357966947302, "grad_norm": 0.3857806324958801, "learning_rate": 0.0003, "loss": 7.9488, "step": 250 }, { "epoch": 0.6361085126286249, "grad_norm": 0.3808286488056183, "learning_rate": 0.0003, "loss": 7.7673, "step": 255 }, { "epoch": 0.6485812285625195, "grad_norm": 0.4393250048160553, "learning_rate": 0.0003, "loss": 7.707, "step": 260 }, { "epoch": 0.6610539444964141, "grad_norm": 0.4232034981250763, "learning_rate": 0.0003, "loss": 7.7852, "step": 265 }, { "epoch": 0.6735266604303087, "grad_norm": 0.42222586274147034, "learning_rate": 0.0003, "loss": 7.6145, "step": 270 }, { "epoch": 0.6859993763642033, "grad_norm": 0.35792261362075806, "learning_rate": 0.0003, "loss": 7.5498, "step": 275 }, { "epoch": 0.6984720922980979, "grad_norm": 0.343427449464798, "learning_rate": 0.0003, "loss": 7.4698, "step": 280 }, { "epoch": 0.7109448082319925, "grad_norm": 0.4176105856895447, "learning_rate": 0.0003, "loss": 7.3752, "step": 285 }, { "epoch": 0.7234175241658871, "grad_norm": 0.40987178683280945, "learning_rate": 0.0003, "loss": 7.342, "step": 290 }, { "epoch": 0.7358902400997818, "grad_norm": 0.4014261066913605, "learning_rate": 0.0003, "loss": 7.1609, "step": 295 }, { "epoch": 0.7483629560336763, "grad_norm": 0.4236806035041809, "learning_rate": 0.0003, "loss": 7.1721, "step": 300 }, { "epoch": 0.7483629560336763, "eval_accuracy": 0.03885043988269795, "eval_loss": 6.619859218597412, "eval_runtime": 18.2015, "eval_samples_per_second": 13.735, "eval_steps_per_second": 3.461, "step": 300 }, { "epoch": 0.760835671967571, "grad_norm": 0.4133549630641937, "learning_rate": 0.0003, "loss": 7.1892, "step": 305 }, { "epoch": 0.7733083879014655, "grad_norm": 0.44653546810150146, "learning_rate": 0.0003, "loss": 7.0446, "step": 310 }, { "epoch": 0.7857811038353602, "grad_norm": 0.41286739706993103, "learning_rate": 0.0003, "loss": 6.9656, "step": 315 }, { "epoch": 0.7982538197692548, "grad_norm": 0.3720580041408539, "learning_rate": 0.0003, "loss": 6.907, "step": 320 }, { "epoch": 0.8107265357031493, "grad_norm": 0.39917078614234924, "learning_rate": 0.0003, "loss": 6.9853, "step": 325 }, { "epoch": 0.823199251637044, "grad_norm": 0.4373719096183777, "learning_rate": 0.0003, "loss": 6.8592, "step": 330 }, { "epoch": 0.8356719675709385, "grad_norm": 0.4183291792869568, "learning_rate": 0.0003, "loss": 6.7432, "step": 335 }, { "epoch": 0.8481446835048332, "grad_norm": 0.40696659684181213, "learning_rate": 0.0003, "loss": 6.7505, "step": 340 }, { "epoch": 0.8606173994387278, "grad_norm": 0.36887314915657043, "learning_rate": 0.0003, "loss": 6.7657, "step": 345 }, { "epoch": 0.8730901153726224, "grad_norm": 0.4768717885017395, "learning_rate": 0.0003, "loss": 6.7173, "step": 350 }, { "epoch": 0.885562831306517, "grad_norm": 0.43819448351860046, "learning_rate": 0.0003, "loss": 6.5465, "step": 355 }, { "epoch": 0.8980355472404116, "grad_norm": 0.40145763754844666, "learning_rate": 0.0003, "loss": 6.512, "step": 360 }, { "epoch": 0.9105082631743062, "grad_norm": 0.49852269887924194, "learning_rate": 0.0003, "loss": 6.5335, "step": 365 }, { "epoch": 0.9229809791082008, "grad_norm": 0.454698771238327, "learning_rate": 0.0003, "loss": 6.4527, "step": 370 }, { "epoch": 0.9354536950420954, "grad_norm": 0.4860341250896454, "learning_rate": 0.0003, "loss": 6.4102, "step": 375 }, { "epoch": 0.9479264109759901, "grad_norm": 0.39718613028526306, "learning_rate": 0.0003, "loss": 6.4694, "step": 380 }, { "epoch": 0.9603991269098846, "grad_norm": 0.4210009276866913, "learning_rate": 0.0003, "loss": 6.4807, "step": 385 }, { "epoch": 0.9728718428437793, "grad_norm": 0.4482674300670624, "learning_rate": 0.0003, "loss": 6.414, "step": 390 }, { "epoch": 0.9853445587776738, "grad_norm": 0.42889419198036194, "learning_rate": 0.0003, "loss": 6.3543, "step": 395 }, { "epoch": 0.9978172747115684, "grad_norm": 0.5144391059875488, "learning_rate": 0.0003, "loss": 6.2087, "step": 400 }, { "epoch": 0.9978172747115684, "eval_accuracy": 0.22513000977517106, "eval_loss": 5.719752311706543, "eval_runtime": 17.8865, "eval_samples_per_second": 13.977, "eval_steps_per_second": 3.522, "step": 400 }, { "epoch": 1.010289990645463, "grad_norm": 0.6417849063873291, "learning_rate": 0.0003, "loss": 6.048, "step": 405 }, { "epoch": 1.0227627065793576, "grad_norm": 0.5739749073982239, "learning_rate": 0.0003, "loss": 5.9866, "step": 410 }, { "epoch": 1.0352354225132523, "grad_norm": 0.49603304266929626, "learning_rate": 0.0003, "loss": 5.9419, "step": 415 }, { "epoch": 1.047708138447147, "grad_norm": 0.5403385162353516, "learning_rate": 0.0003, "loss": 5.8366, "step": 420 }, { "epoch": 1.0601808543810414, "grad_norm": 0.6306777000427246, "learning_rate": 0.0003, "loss": 5.7657, "step": 425 }, { "epoch": 1.072653570314936, "grad_norm": 0.7016925811767578, "learning_rate": 0.0003, "loss": 5.6619, "step": 430 }, { "epoch": 1.0851262862488307, "grad_norm": 0.6606624722480774, "learning_rate": 0.0003, "loss": 5.6094, "step": 435 }, { "epoch": 1.0975990021827253, "grad_norm": 0.7023086547851562, "learning_rate": 0.0003, "loss": 5.6074, "step": 440 }, { "epoch": 1.11007171811662, "grad_norm": 0.8505487442016602, "learning_rate": 0.0003, "loss": 5.6959, "step": 445 }, { "epoch": 1.1225444340505144, "grad_norm": 0.6713190674781799, "learning_rate": 0.0003, "loss": 5.6344, "step": 450 }, { "epoch": 1.135017149984409, "grad_norm": 0.5908814668655396, "learning_rate": 0.0003, "loss": 5.4591, "step": 455 }, { "epoch": 1.1474898659183037, "grad_norm": 0.7601476311683655, "learning_rate": 0.0003, "loss": 5.5622, "step": 460 }, { "epoch": 1.1599625818521984, "grad_norm": 0.5737589001655579, "learning_rate": 0.0003, "loss": 5.4541, "step": 465 }, { "epoch": 1.172435297786093, "grad_norm": 0.8831024169921875, "learning_rate": 0.0003, "loss": 5.4784, "step": 470 }, { "epoch": 1.1849080137199874, "grad_norm": 0.8297187089920044, "learning_rate": 0.0003, "loss": 5.4252, "step": 475 }, { "epoch": 1.197380729653882, "grad_norm": 0.857667863368988, "learning_rate": 0.0003, "loss": 5.3268, "step": 480 }, { "epoch": 1.2098534455877767, "grad_norm": 0.8937066793441772, "learning_rate": 0.0003, "loss": 5.279, "step": 485 }, { "epoch": 1.2223261615216714, "grad_norm": 0.784275472164154, "learning_rate": 0.0003, "loss": 5.3079, "step": 490 }, { "epoch": 1.234798877455566, "grad_norm": 0.7549949884414673, "learning_rate": 0.0003, "loss": 5.3977, "step": 495 }, { "epoch": 1.2472715933894605, "grad_norm": 0.7452312111854553, "learning_rate": 0.0003, "loss": 5.4917, "step": 500 }, { "epoch": 1.2472715933894605, "eval_accuracy": 0.32684261974584555, "eval_loss": 4.947990894317627, "eval_runtime": 19.5683, "eval_samples_per_second": 12.776, "eval_steps_per_second": 3.219, "step": 500 }, { "epoch": 1.2597443093233551, "grad_norm": 0.6744974255561829, "learning_rate": 0.0003, "loss": 5.1679, "step": 505 }, { "epoch": 1.2722170252572498, "grad_norm": 1.0095832347869873, "learning_rate": 0.0003, "loss": 5.3918, "step": 510 }, { "epoch": 1.2846897411911444, "grad_norm": 0.7461665272712708, "learning_rate": 0.0003, "loss": 5.2346, "step": 515 }, { "epoch": 1.2971624571250389, "grad_norm": 0.88801109790802, "learning_rate": 0.0003, "loss": 5.2033, "step": 520 }, { "epoch": 1.3096351730589335, "grad_norm": 0.7549375891685486, "learning_rate": 0.0003, "loss": 5.098, "step": 525 }, { "epoch": 1.3221078889928282, "grad_norm": 1.1236454248428345, "learning_rate": 0.0003, "loss": 5.2069, "step": 530 }, { "epoch": 1.3345806049267228, "grad_norm": 0.9261302947998047, "learning_rate": 0.0003, "loss": 5.1925, "step": 535 }, { "epoch": 1.3470533208606175, "grad_norm": 0.7248057126998901, "learning_rate": 0.0003, "loss": 5.109, "step": 540 }, { "epoch": 1.3595260367945121, "grad_norm": 0.941017210483551, "learning_rate": 0.0003, "loss": 5.0975, "step": 545 }, { "epoch": 1.3719987527284065, "grad_norm": 0.9451349973678589, "learning_rate": 0.0003, "loss": 5.1825, "step": 550 }, { "epoch": 1.3844714686623012, "grad_norm": 0.9956802725791931, "learning_rate": 0.0003, "loss": 5.1017, "step": 555 }, { "epoch": 1.3969441845961958, "grad_norm": 1.0484583377838135, "learning_rate": 0.0003, "loss": 5.1371, "step": 560 }, { "epoch": 1.4094169005300905, "grad_norm": 1.1080021858215332, "learning_rate": 0.0003, "loss": 5.0146, "step": 565 }, { "epoch": 1.421889616463985, "grad_norm": 0.9495016932487488, "learning_rate": 0.0003, "loss": 5.0971, "step": 570 }, { "epoch": 1.4343623323978796, "grad_norm": 0.7586097717285156, "learning_rate": 0.0003, "loss": 5.0336, "step": 575 }, { "epoch": 1.4468350483317742, "grad_norm": 0.647396981716156, "learning_rate": 0.0003, "loss": 5.0119, "step": 580 }, { "epoch": 1.4593077642656689, "grad_norm": 0.7189023494720459, "learning_rate": 0.0003, "loss": 5.0908, "step": 585 }, { "epoch": 1.4717804801995635, "grad_norm": 0.9973328113555908, "learning_rate": 0.0003, "loss": 4.7903, "step": 590 }, { "epoch": 1.4842531961334582, "grad_norm": 0.8094688057899475, "learning_rate": 0.0003, "loss": 5.0103, "step": 595 }, { "epoch": 1.4967259120673526, "grad_norm": 1.0308438539505005, "learning_rate": 0.0003, "loss": 4.9408, "step": 600 }, { "epoch": 1.4967259120673526, "eval_accuracy": 0.35667253176930597, "eval_loss": 4.673036575317383, "eval_runtime": 19.5514, "eval_samples_per_second": 12.787, "eval_steps_per_second": 3.222, "step": 600 }, { "epoch": 1.5091986280012473, "grad_norm": 0.7587366104125977, "learning_rate": 0.0003, "loss": 4.9818, "step": 605 }, { "epoch": 1.521671343935142, "grad_norm": 1.0271868705749512, "learning_rate": 0.0003, "loss": 4.9614, "step": 610 }, { "epoch": 1.5341440598690363, "grad_norm": 1.061369776725769, "learning_rate": 0.0003, "loss": 4.8608, "step": 615 }, { "epoch": 1.546616775802931, "grad_norm": 0.9442321062088013, "learning_rate": 0.0003, "loss": 4.9478, "step": 620 }, { "epoch": 1.5590894917368257, "grad_norm": 0.8110609650611877, "learning_rate": 0.0003, "loss": 5.0979, "step": 625 }, { "epoch": 1.5715622076707203, "grad_norm": 0.6862745881080627, "learning_rate": 0.0003, "loss": 4.8345, "step": 630 }, { "epoch": 1.584034923604615, "grad_norm": 0.8737391233444214, "learning_rate": 0.0003, "loss": 4.8572, "step": 635 }, { "epoch": 1.5965076395385096, "grad_norm": 0.8002131581306458, "learning_rate": 0.0003, "loss": 4.8072, "step": 640 }, { "epoch": 1.6089803554724043, "grad_norm": 0.7860103845596313, "learning_rate": 0.0003, "loss": 4.8922, "step": 645 }, { "epoch": 1.6214530714062987, "grad_norm": 0.9875708222389221, "learning_rate": 0.0003, "loss": 4.9247, "step": 650 }, { "epoch": 1.6339257873401933, "grad_norm": 0.8873936533927917, "learning_rate": 0.0003, "loss": 4.8795, "step": 655 }, { "epoch": 1.646398503274088, "grad_norm": 0.7963967323303223, "learning_rate": 0.0003, "loss": 4.835, "step": 660 }, { "epoch": 1.6588712192079824, "grad_norm": 0.8068607449531555, "learning_rate": 0.0003, "loss": 4.8713, "step": 665 }, { "epoch": 1.671343935141877, "grad_norm": 0.9093911647796631, "learning_rate": 0.0003, "loss": 4.7725, "step": 670 }, { "epoch": 1.6838166510757717, "grad_norm": 0.7699265480041504, "learning_rate": 0.0003, "loss": 4.7502, "step": 675 }, { "epoch": 1.6962893670096664, "grad_norm": 0.7545697689056396, "learning_rate": 0.0003, "loss": 4.9555, "step": 680 }, { "epoch": 1.708762082943561, "grad_norm": 0.7571801543235779, "learning_rate": 0.0003, "loss": 4.7616, "step": 685 }, { "epoch": 1.7212347988774557, "grad_norm": 0.7757474184036255, "learning_rate": 0.0003, "loss": 4.6462, "step": 690 }, { "epoch": 1.7337075148113503, "grad_norm": 0.7473092079162598, "learning_rate": 0.0003, "loss": 4.6699, "step": 695 }, { "epoch": 1.7461802307452448, "grad_norm": 1.2531319856643677, "learning_rate": 0.0003, "loss": 4.8347, "step": 700 }, { "epoch": 1.7461802307452448, "eval_accuracy": 0.37069794721407623, "eval_loss": 4.498379707336426, "eval_runtime": 20.0355, "eval_samples_per_second": 12.478, "eval_steps_per_second": 3.144, "step": 700 }, { "epoch": 1.7586529466791394, "grad_norm": 1.3069407939910889, "learning_rate": 0.0003, "loss": 4.7338, "step": 705 }, { "epoch": 1.7711256626130338, "grad_norm": 1.1146960258483887, "learning_rate": 0.0003, "loss": 4.8758, "step": 710 }, { "epoch": 1.7835983785469285, "grad_norm": 1.0376973152160645, "learning_rate": 0.0003, "loss": 4.7604, "step": 715 }, { "epoch": 1.7960710944808231, "grad_norm": 1.2044090032577515, "learning_rate": 0.0003, "loss": 4.7472, "step": 720 }, { "epoch": 1.8085438104147178, "grad_norm": 1.0660207271575928, "learning_rate": 0.0003, "loss": 4.79, "step": 725 }, { "epoch": 1.8210165263486124, "grad_norm": 0.7932606935501099, "learning_rate": 0.0003, "loss": 4.7476, "step": 730 }, { "epoch": 1.833489242282507, "grad_norm": 0.8554738759994507, "learning_rate": 0.0003, "loss": 4.7839, "step": 735 }, { "epoch": 1.8459619582164017, "grad_norm": 1.015703797340393, "learning_rate": 0.0003, "loss": 4.7935, "step": 740 }, { "epoch": 1.8584346741502962, "grad_norm": 1.1005243062973022, "learning_rate": 0.0003, "loss": 4.7913, "step": 745 }, { "epoch": 1.8709073900841908, "grad_norm": 0.8775972127914429, "learning_rate": 0.0003, "loss": 4.5128, "step": 750 }, { "epoch": 1.8833801060180855, "grad_norm": 0.8116542100906372, "learning_rate": 0.0003, "loss": 4.6496, "step": 755 }, { "epoch": 1.89585282195198, "grad_norm": 0.7614642381668091, "learning_rate": 0.0003, "loss": 4.7695, "step": 760 }, { "epoch": 1.9083255378858746, "grad_norm": 1.0064287185668945, "learning_rate": 0.0003, "loss": 4.7929, "step": 765 }, { "epoch": 1.9207982538197692, "grad_norm": 0.7342740297317505, "learning_rate": 0.0003, "loss": 4.6711, "step": 770 }, { "epoch": 1.9332709697536639, "grad_norm": 0.9723834991455078, "learning_rate": 0.0003, "loss": 4.6212, "step": 775 }, { "epoch": 1.9457436856875585, "grad_norm": 1.20729398727417, "learning_rate": 0.0003, "loss": 4.6513, "step": 780 }, { "epoch": 1.9582164016214532, "grad_norm": 0.7920907735824585, "learning_rate": 0.0003, "loss": 4.6264, "step": 785 }, { "epoch": 1.9706891175553478, "grad_norm": 0.6307650804519653, "learning_rate": 0.0003, "loss": 4.6481, "step": 790 }, { "epoch": 1.9831618334892422, "grad_norm": 0.8942980766296387, "learning_rate": 0.0003, "loss": 4.6598, "step": 795 }, { "epoch": 1.995634549423137, "grad_norm": 0.7046281099319458, "learning_rate": 0.0003, "loss": 4.7023, "step": 800 }, { "epoch": 1.995634549423137, "eval_accuracy": 0.3789325513196481, "eval_loss": 4.358436584472656, "eval_runtime": 20.1663, "eval_samples_per_second": 12.397, "eval_steps_per_second": 3.124, "step": 800 }, { "epoch": 1.995634549423137, "step": 800, "total_flos": 6.441101073108173e+16, "train_loss": 8.280340445041656, "train_runtime": 18888.2342, "train_samples_per_second": 5.433, "train_steps_per_second": 0.042 } ], "logging_steps": 5, "max_steps": 800, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "total_flos": 6.441101073108173e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }