{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994869163673679, "eval_steps": 500, "global_step": 974, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.2869611310957953, "learning_rate": 2.040816326530612e-06, "loss": 0.8612, "step": 1 }, { "epoch": 0.01, "grad_norm": 0.3194590499756201, "learning_rate": 1.0204081632653061e-05, "loss": 0.7892, "step": 5 }, { "epoch": 0.01, "grad_norm": 0.2900590043808106, "learning_rate": 2.0408163265306123e-05, "loss": 0.8123, "step": 10 }, { "epoch": 0.02, "grad_norm": 0.32206451305877987, "learning_rate": 3.061224489795919e-05, "loss": 0.8338, "step": 15 }, { "epoch": 0.02, "grad_norm": 0.412779025757005, "learning_rate": 4.0816326530612245e-05, "loss": 0.7852, "step": 20 }, { "epoch": 0.03, "grad_norm": 0.2440444128385405, "learning_rate": 5.102040816326531e-05, "loss": 0.7278, "step": 25 }, { "epoch": 0.03, "grad_norm": 0.23078040218918436, "learning_rate": 6.122448979591838e-05, "loss": 0.7749, "step": 30 }, { "epoch": 0.04, "grad_norm": 0.2649545106643593, "learning_rate": 7.142857142857143e-05, "loss": 0.7369, "step": 35 }, { "epoch": 0.04, "grad_norm": 0.21567429110623856, "learning_rate": 8.163265306122449e-05, "loss": 0.7412, "step": 40 }, { "epoch": 0.05, "grad_norm": 0.21372272882412666, "learning_rate": 9.183673469387756e-05, "loss": 0.7375, "step": 45 }, { "epoch": 0.05, "grad_norm": 0.2379221759632066, "learning_rate": 0.00010204081632653062, "loss": 0.7803, "step": 50 }, { "epoch": 0.06, "grad_norm": 0.2382749273594966, "learning_rate": 0.00011224489795918367, "loss": 0.7585, "step": 55 }, { "epoch": 0.06, "grad_norm": 0.20160910918787228, "learning_rate": 0.00012244897959183676, "loss": 0.7394, "step": 60 }, { "epoch": 0.07, "grad_norm": 0.21140946440246422, "learning_rate": 0.0001326530612244898, "loss": 0.73, "step": 65 }, { "epoch": 0.07, "grad_norm": 0.24165285545749063, "learning_rate": 0.00014285714285714287, "loss": 0.7034, "step": 70 }, { "epoch": 0.08, "grad_norm": 0.26276753886817406, "learning_rate": 0.0001530612244897959, "loss": 0.7225, "step": 75 }, { "epoch": 0.08, "grad_norm": 0.2278406577066701, "learning_rate": 0.00016326530612244898, "loss": 0.6887, "step": 80 }, { "epoch": 0.09, "grad_norm": 0.20166487628632143, "learning_rate": 0.00017346938775510205, "loss": 0.7636, "step": 85 }, { "epoch": 0.09, "grad_norm": 0.25404574731403684, "learning_rate": 0.00018367346938775512, "loss": 0.7536, "step": 90 }, { "epoch": 0.1, "grad_norm": 0.20421230133266374, "learning_rate": 0.00019387755102040816, "loss": 0.7142, "step": 95 }, { "epoch": 0.1, "grad_norm": 0.2106105346682244, "learning_rate": 0.0001999974277115551, "loss": 0.746, "step": 100 }, { "epoch": 0.11, "grad_norm": 0.21445953241413981, "learning_rate": 0.00019996849098629418, "loss": 0.6897, "step": 105 }, { "epoch": 0.11, "grad_norm": 0.21242787148744666, "learning_rate": 0.00019990741151022301, "loss": 0.7043, "step": 110 }, { "epoch": 0.12, "grad_norm": 0.2897266739044473, "learning_rate": 0.0001998142089221534, "loss": 0.7799, "step": 115 }, { "epoch": 0.12, "grad_norm": 0.1968138822308632, "learning_rate": 0.0001996889131894033, "loss": 0.7395, "step": 120 }, { "epoch": 0.13, "grad_norm": 0.21497851174958627, "learning_rate": 0.00019953156459816179, "loss": 0.7059, "step": 125 }, { "epoch": 0.13, "grad_norm": 0.21451890274852173, "learning_rate": 0.0001993422137405354, "loss": 0.7397, "step": 130 }, { "epoch": 0.14, "grad_norm": 0.19014360167506802, 
"learning_rate": 0.00019912092149828174, "loss": 0.7686, "step": 135 }, { "epoch": 0.14, "grad_norm": 0.22120779709504043, "learning_rate": 0.00019886775902323405, "loss": 0.7509, "step": 140 }, { "epoch": 0.15, "grad_norm": 0.29083330661638745, "learning_rate": 0.00019858280771442385, "loss": 0.7443, "step": 145 }, { "epoch": 0.15, "grad_norm": 0.27668275095195644, "learning_rate": 0.00019826615919190887, "loss": 0.7294, "step": 150 }, { "epoch": 0.16, "grad_norm": 0.19750932915933125, "learning_rate": 0.00019791791526731445, "loss": 0.7338, "step": 155 }, { "epoch": 0.16, "grad_norm": 0.22944621034884685, "learning_rate": 0.00019753818791109828, "loss": 0.7166, "step": 160 }, { "epoch": 0.17, "grad_norm": 0.2109443098333667, "learning_rate": 0.0001971270992165486, "loss": 0.674, "step": 165 }, { "epoch": 0.17, "grad_norm": 0.20676211281872686, "learning_rate": 0.00019668478136052774, "loss": 0.6949, "step": 170 }, { "epoch": 0.18, "grad_norm": 0.20575242743242247, "learning_rate": 0.0001962113765609735, "loss": 0.6906, "step": 175 }, { "epoch": 0.18, "grad_norm": 0.1951531711108696, "learning_rate": 0.0001957070370311717, "loss": 0.7289, "step": 180 }, { "epoch": 0.19, "grad_norm": 0.22414853264275936, "learning_rate": 0.00019517192493081565, "loss": 0.7208, "step": 185 }, { "epoch": 0.19, "grad_norm": 0.20668124787776296, "learning_rate": 0.00019460621231386676, "loss": 0.7671, "step": 190 }, { "epoch": 0.2, "grad_norm": 0.19009017619221066, "learning_rate": 0.00019401008107323455, "loss": 0.7351, "step": 195 }, { "epoch": 0.21, "grad_norm": 0.2189256627415435, "learning_rate": 0.0001933837228822925, "loss": 0.7093, "step": 200 }, { "epoch": 0.21, "grad_norm": 0.2145651973089389, "learning_rate": 0.0001927273391332499, "loss": 0.7386, "step": 205 }, { "epoch": 0.22, "grad_norm": 0.20465953071673096, "learning_rate": 0.00019204114087239806, "loss": 0.7456, "step": 210 }, { "epoch": 0.22, "grad_norm": 0.20371247448101287, "learning_rate": 0.00019132534873225323, "loss": 0.7316, "step": 215 }, { "epoch": 0.23, "grad_norm": 0.21926104230516402, "learning_rate": 0.00019058019286061665, "loss": 0.7153, "step": 220 }, { "epoch": 0.23, "grad_norm": 0.21308006677872718, "learning_rate": 0.00018980591284657535, "loss": 0.7363, "step": 225 }, { "epoch": 0.24, "grad_norm": 0.21830112014464587, "learning_rate": 0.00018900275764346768, "loss": 0.7188, "step": 230 }, { "epoch": 0.24, "grad_norm": 0.22594321207444476, "learning_rate": 0.0001881709854888372, "loss": 0.7107, "step": 235 }, { "epoch": 0.25, "grad_norm": 0.17764984572247824, "learning_rate": 0.00018731086382140226, "loss": 0.7247, "step": 240 }, { "epoch": 0.25, "grad_norm": 0.21624903539690288, "learning_rate": 0.00018642266919506644, "loss": 0.7276, "step": 245 }, { "epoch": 0.26, "grad_norm": 0.2171918383945808, "learning_rate": 0.00018550668718999872, "loss": 0.6937, "step": 250 }, { "epoch": 0.26, "grad_norm": 0.17788572350886178, "learning_rate": 0.0001845632123208111, "loss": 0.7511, "step": 255 }, { "epoch": 0.27, "grad_norm": 0.20199815612602928, "learning_rate": 0.0001835925479418637, "loss": 0.7522, "step": 260 }, { "epoch": 0.27, "grad_norm": 0.173119607094577, "learning_rate": 0.0001825950061497276, "loss": 0.684, "step": 265 }, { "epoch": 0.28, "grad_norm": 0.21342763081754199, "learning_rate": 0.00018157090768283678, "loss": 0.7026, "step": 270 }, { "epoch": 0.28, "grad_norm": 0.19749144239575178, "learning_rate": 0.00018052058181836151, "loss": 0.7153, "step": 275 }, { "epoch": 0.29, "grad_norm": 0.23666019704239125, 
"learning_rate": 0.00017944436626633623, "loss": 0.7337, "step": 280 }, { "epoch": 0.29, "grad_norm": 0.22317738087512298, "learning_rate": 0.00017834260706107595, "loss": 0.7044, "step": 285 }, { "epoch": 0.3, "grad_norm": 0.19219658466716574, "learning_rate": 0.00017721565844991643, "loss": 0.6799, "step": 290 }, { "epoch": 0.3, "grad_norm": 0.23381913650905206, "learning_rate": 0.00017606388277931328, "loss": 0.7463, "step": 295 }, { "epoch": 0.31, "grad_norm": 0.20207571127846147, "learning_rate": 0.0001748876503783373, "loss": 0.7305, "step": 300 }, { "epoch": 0.31, "grad_norm": 0.20537100306933584, "learning_rate": 0.00017368733943960276, "loss": 0.7274, "step": 305 }, { "epoch": 0.32, "grad_norm": 0.20177943742791943, "learning_rate": 0.00017246333589766787, "loss": 0.745, "step": 310 }, { "epoch": 0.32, "grad_norm": 0.20364782308656443, "learning_rate": 0.00017121603330494544, "loss": 0.7519, "step": 315 }, { "epoch": 0.33, "grad_norm": 0.21280082529891567, "learning_rate": 0.0001699458327051647, "loss": 0.736, "step": 320 }, { "epoch": 0.33, "grad_norm": 0.2195869989993284, "learning_rate": 0.00016865314250442398, "loss": 0.7058, "step": 325 }, { "epoch": 0.34, "grad_norm": 0.2179326990487266, "learning_rate": 0.00016733837833987633, "loss": 0.7033, "step": 330 }, { "epoch": 0.34, "grad_norm": 0.22163995265105102, "learning_rate": 0.00016600196294609045, "loss": 0.7062, "step": 335 }, { "epoch": 0.35, "grad_norm": 0.20563507571414916, "learning_rate": 0.00016464432601912912, "loss": 0.7177, "step": 340 }, { "epoch": 0.35, "grad_norm": 0.1872905006878245, "learning_rate": 0.0001632659040783897, "loss": 0.7286, "step": 345 }, { "epoch": 0.36, "grad_norm": 0.20516717514655824, "learning_rate": 0.00016186714032625035, "loss": 0.7409, "step": 350 }, { "epoch": 0.36, "grad_norm": 0.22889695695378193, "learning_rate": 0.00016044848450556787, "loss": 0.7167, "step": 355 }, { "epoch": 0.37, "grad_norm": 0.21338607450176578, "learning_rate": 0.00015901039275507245, "loss": 0.7118, "step": 360 }, { "epoch": 0.37, "grad_norm": 0.20932529207974, "learning_rate": 0.00015755332746270572, "loss": 0.7026, "step": 365 }, { "epoch": 0.38, "grad_norm": 0.21841220303292935, "learning_rate": 0.00015607775711694977, "loss": 0.7144, "step": 370 }, { "epoch": 0.38, "grad_norm": 0.2249926395810304, "learning_rate": 0.00015458415615619484, "loss": 0.7036, "step": 375 }, { "epoch": 0.39, "grad_norm": 0.18656241986804054, "learning_rate": 0.00015307300481619333, "loss": 0.7417, "step": 380 }, { "epoch": 0.4, "grad_norm": 0.2157891411015049, "learning_rate": 0.00015154478897565045, "loss": 0.7602, "step": 385 }, { "epoch": 0.4, "grad_norm": 0.18528004635067152, "learning_rate": 0.00015000000000000001, "loss": 0.7143, "step": 390 }, { "epoch": 0.41, "grad_norm": 0.20614234354483332, "learning_rate": 0.00014843913458341645, "loss": 0.693, "step": 395 }, { "epoch": 0.41, "grad_norm": 0.23913126448103222, "learning_rate": 0.00014686269458911332, "loss": 0.7057, "step": 400 }, { "epoch": 0.42, "grad_norm": 0.20395051777151454, "learning_rate": 0.00014527118688797963, "loss": 0.7512, "step": 405 }, { "epoch": 0.42, "grad_norm": 0.24309987168907193, "learning_rate": 0.0001436651231956064, "loss": 0.7164, "step": 410 }, { "epoch": 0.43, "grad_norm": 0.22288836478106547, "learning_rate": 0.00014204501990775533, "loss": 0.7333, "step": 415 }, { "epoch": 0.43, "grad_norm": 0.19997689565338814, "learning_rate": 0.00014041139793432274, "loss": 0.716, "step": 420 }, { "epoch": 0.44, "grad_norm": 0.21410827873496419, 
"learning_rate": 0.00013876478253185183, "loss": 0.7334, "step": 425 }, { "epoch": 0.44, "grad_norm": 0.22446127836607146, "learning_rate": 0.00013710570313464778, "loss": 0.7042, "step": 430 }, { "epoch": 0.45, "grad_norm": 0.23222688211452414, "learning_rate": 0.0001354346931845492, "loss": 0.7499, "step": 435 }, { "epoch": 0.45, "grad_norm": 0.19295374405986063, "learning_rate": 0.00013375228995941133, "loss": 0.7357, "step": 440 }, { "epoch": 0.46, "grad_norm": 0.19798750394457384, "learning_rate": 0.0001320590344003557, "loss": 0.7334, "step": 445 }, { "epoch": 0.46, "grad_norm": 0.21722730155618872, "learning_rate": 0.00013035547093784186, "loss": 0.7193, "step": 450 }, { "epoch": 0.47, "grad_norm": 0.184647192839392, "learning_rate": 0.00012864214731661742, "loss": 0.7062, "step": 455 }, { "epoch": 0.47, "grad_norm": 0.2286779200568456, "learning_rate": 0.00012691961441960238, "loss": 0.6945, "step": 460 }, { "epoch": 0.48, "grad_norm": 0.19896616453784802, "learning_rate": 0.00012518842609076413, "loss": 0.7188, "step": 465 }, { "epoch": 0.48, "grad_norm": 0.21786128458289136, "learning_rate": 0.00012344913895704097, "loss": 0.7296, "step": 470 }, { "epoch": 0.49, "grad_norm": 0.2070395610707047, "learning_rate": 0.00012170231224937032, "loss": 0.709, "step": 475 }, { "epoch": 0.49, "grad_norm": 0.2267979628686904, "learning_rate": 0.00011994850762287989, "loss": 0.7283, "step": 480 }, { "epoch": 0.5, "grad_norm": 0.20326820203968982, "learning_rate": 0.0001181882889762994, "loss": 0.739, "step": 485 }, { "epoch": 0.5, "grad_norm": 0.19127608343670294, "learning_rate": 0.00011642222227065089, "loss": 0.7072, "step": 490 }, { "epoch": 0.51, "grad_norm": 0.23023462990706073, "learning_rate": 0.00011465087534727587, "loss": 0.7792, "step": 495 }, { "epoch": 0.51, "grad_norm": 0.2299774733193733, "learning_rate": 0.0001128748177452581, "loss": 0.7127, "step": 500 }, { "epoch": 0.52, "grad_norm": 0.1933553970068975, "learning_rate": 0.00011109462051830017, "loss": 0.7673, "step": 505 }, { "epoch": 0.52, "grad_norm": 0.19419056271814522, "learning_rate": 0.00010931085605111354, "loss": 0.6975, "step": 510 }, { "epoch": 0.53, "grad_norm": 0.2142797681525271, "learning_rate": 0.00010752409787538, "loss": 0.7268, "step": 515 }, { "epoch": 0.53, "grad_norm": 0.19406849444774402, "learning_rate": 0.00010573492048534515, "loss": 0.6817, "step": 520 }, { "epoch": 0.54, "grad_norm": 0.19984584827497825, "learning_rate": 0.00010394389915310149, "loss": 0.6799, "step": 525 }, { "epoch": 0.54, "grad_norm": 0.23853144293131298, "learning_rate": 0.00010215160974362223, "loss": 0.7705, "step": 530 }, { "epoch": 0.55, "grad_norm": 0.20647647184488557, "learning_rate": 0.00010035862852960387, "loss": 0.7491, "step": 535 }, { "epoch": 0.55, "grad_norm": 0.24615456413858594, "learning_rate": 9.856553200617805e-05, "loss": 0.6883, "step": 540 }, { "epoch": 0.56, "grad_norm": 0.21240245143643857, "learning_rate": 9.677289670555169e-05, "loss": 0.7247, "step": 545 }, { "epoch": 0.56, "grad_norm": 0.2000823201510655, "learning_rate": 9.49812990116353e-05, "loss": 0.7413, "step": 550 }, { "epoch": 0.57, "grad_norm": 0.20666367113731635, "learning_rate": 9.319131497471894e-05, "loss": 0.75, "step": 555 }, { "epoch": 0.57, "grad_norm": 0.18469259616208825, "learning_rate": 9.140352012625537e-05, "loss": 0.7214, "step": 560 }, { "epoch": 0.58, "grad_norm": 0.21445560674239975, "learning_rate": 8.961848929381026e-05, "loss": 0.7376, "step": 565 }, { "epoch": 0.58, "grad_norm": 0.2059315178687912, "learning_rate": 
8.783679641623845e-05, "loss": 0.733, "step": 570 }, { "epoch": 0.59, "grad_norm": 0.23365143820305953, "learning_rate": 8.605901435914607e-05, "loss": 0.7006, "step": 575 }, { "epoch": 0.6, "grad_norm": 0.20038518152384194, "learning_rate": 8.428571473069775e-05, "loss": 0.7281, "step": 580 }, { "epoch": 0.6, "grad_norm": 0.2270448364095405, "learning_rate": 8.25174676978282e-05, "loss": 0.6751, "step": 585 }, { "epoch": 0.61, "grad_norm": 0.22128387907048663, "learning_rate": 8.075484180291701e-05, "loss": 0.694, "step": 590 }, { "epoch": 0.61, "grad_norm": 0.1891101152811574, "learning_rate": 7.899840378098588e-05, "loss": 0.7384, "step": 595 }, { "epoch": 0.62, "grad_norm": 0.21027626346886683, "learning_rate": 7.724871837747707e-05, "loss": 0.7372, "step": 600 }, { "epoch": 0.62, "grad_norm": 0.22335920093565995, "learning_rate": 7.550634816667142e-05, "loss": 0.6979, "step": 605 }, { "epoch": 0.63, "grad_norm": 0.23911224721224508, "learning_rate": 7.377185337080442e-05, "loss": 0.7079, "step": 610 }, { "epoch": 0.63, "grad_norm": 0.22619894149171657, "learning_rate": 7.204579167993881e-05, "loss": 0.7306, "step": 615 }, { "epoch": 0.64, "grad_norm": 0.18690561718768828, "learning_rate": 7.032871807265096e-05, "loss": 0.7476, "step": 620 }, { "epoch": 0.64, "grad_norm": 0.19388179713391346, "learning_rate": 6.862118463758943e-05, "loss": 0.7121, "step": 625 }, { "epoch": 0.65, "grad_norm": 0.19589622259075404, "learning_rate": 6.69237403959624e-05, "loss": 0.7156, "step": 630 }, { "epoch": 0.65, "grad_norm": 0.22348889381112783, "learning_rate": 6.52369311250116e-05, "loss": 0.6988, "step": 635 }, { "epoch": 0.66, "grad_norm": 0.19199107569507434, "learning_rate": 6.356129918252927e-05, "loss": 0.7195, "step": 640 }, { "epoch": 0.66, "grad_norm": 0.21174035880989528, "learning_rate": 6.189738333247432e-05, "loss": 0.6889, "step": 645 }, { "epoch": 0.67, "grad_norm": 0.21951618555807303, "learning_rate": 6.024571857174443e-05, "loss": 0.7401, "step": 650 }, { "epoch": 0.67, "grad_norm": 0.21610930567187886, "learning_rate": 5.860683595815893e-05, "loss": 0.6946, "step": 655 }, { "epoch": 0.68, "grad_norm": 0.1911939965176069, "learning_rate": 5.698126243970845e-05, "loss": 0.7026, "step": 660 }, { "epoch": 0.68, "grad_norm": 0.20270357487417293, "learning_rate": 5.536952068512608e-05, "loss": 0.7083, "step": 665 }, { "epoch": 0.69, "grad_norm": 0.19360058915764913, "learning_rate": 5.3772128915834184e-05, "loss": 0.7945, "step": 670 }, { "epoch": 0.69, "grad_norm": 0.1977928757278191, "learning_rate": 5.218960073932122e-05, "loss": 0.7518, "step": 675 }, { "epoch": 0.7, "grad_norm": 0.20960016035080886, "learning_rate": 5.062244498400228e-05, "loss": 0.7402, "step": 680 }, { "epoch": 0.7, "grad_norm": 0.20026009581864332, "learning_rate": 4.907116553561607e-05, "loss": 0.7033, "step": 685 }, { "epoch": 0.71, "grad_norm": 0.19145735043892229, "learning_rate": 4.753626117521103e-05, "loss": 0.6859, "step": 690 }, { "epoch": 0.71, "grad_norm": 0.19584240345601217, "learning_rate": 4.601822541877291e-05, "loss": 0.6968, "step": 695 }, { "epoch": 0.72, "grad_norm": 0.23270920498649578, "learning_rate": 4.451754635854517e-05, "loss": 0.7173, "step": 700 }, { "epoch": 0.72, "grad_norm": 0.20792189933390842, "learning_rate": 4.303470650609325e-05, "loss": 0.6988, "step": 705 }, { "epoch": 0.73, "grad_norm": 0.2063753862224208, "learning_rate": 4.1570182637163155e-05, "loss": 0.7576, "step": 710 }, { "epoch": 0.73, "grad_norm": 0.21322675282501108, "learning_rate": 4.0124445638384366e-05, 
"loss": 0.6568, "step": 715 }, { "epoch": 0.74, "grad_norm": 0.21703806382908236, "learning_rate": 3.869796035586625e-05, "loss": 0.711, "step": 720 }, { "epoch": 0.74, "grad_norm": 0.1903818399050557, "learning_rate": 3.7291185445736444e-05, "loss": 0.7013, "step": 725 }, { "epoch": 0.75, "grad_norm": 0.20064983895338837, "learning_rate": 3.590457322666997e-05, "loss": 0.741, "step": 730 }, { "epoch": 0.75, "grad_norm": 0.21130061077032508, "learning_rate": 3.453856953445557e-05, "loss": 0.7383, "step": 735 }, { "epoch": 0.76, "grad_norm": 0.17684598502717375, "learning_rate": 3.319361357864663e-05, "loss": 0.7096, "step": 740 }, { "epoch": 0.76, "grad_norm": 0.17869261243212542, "learning_rate": 3.187013780134291e-05, "loss": 0.7285, "step": 745 }, { "epoch": 0.77, "grad_norm": 0.18740684635500252, "learning_rate": 3.05685677381475e-05, "loss": 0.7475, "step": 750 }, { "epoch": 0.77, "grad_norm": 0.21153099311303938, "learning_rate": 2.9289321881345254e-05, "loss": 0.7122, "step": 755 }, { "epoch": 0.78, "grad_norm": 0.26879324731183346, "learning_rate": 2.8032811545345294e-05, "loss": 0.7629, "step": 760 }, { "epoch": 0.79, "grad_norm": 0.2477379977951793, "learning_rate": 2.679944073443158e-05, "loss": 0.7172, "step": 765 }, { "epoch": 0.79, "grad_norm": 0.21621790135506005, "learning_rate": 2.5589606012863963e-05, "loss": 0.7397, "step": 770 }, { "epoch": 0.8, "grad_norm": 0.22042903614192005, "learning_rate": 2.4403696377371142e-05, "loss": 0.7446, "step": 775 }, { "epoch": 0.8, "grad_norm": 0.2046470002590343, "learning_rate": 2.324209313207736e-05, "loss": 0.7584, "step": 780 }, { "epoch": 0.81, "grad_norm": 0.19842023146287918, "learning_rate": 2.210516976590179e-05, "loss": 0.7306, "step": 785 }, { "epoch": 0.81, "grad_norm": 0.2271680029618541, "learning_rate": 2.099329183247126e-05, "loss": 0.7173, "step": 790 }, { "epoch": 0.82, "grad_norm": 0.19284567420576632, "learning_rate": 1.9906816832584253e-05, "loss": 0.6972, "step": 795 }, { "epoch": 0.82, "grad_norm": 0.18901685190843248, "learning_rate": 1.8846094099263912e-05, "loss": 0.7259, "step": 800 }, { "epoch": 0.83, "grad_norm": 0.20493697097697364, "learning_rate": 1.781146468543765e-05, "loss": 0.6665, "step": 805 }, { "epoch": 0.83, "grad_norm": 0.1892205656840172, "learning_rate": 1.6803261254278636e-05, "loss": 0.6993, "step": 810 }, { "epoch": 0.84, "grad_norm": 0.24736999850815686, "learning_rate": 1.582180797224507e-05, "loss": 0.7075, "step": 815 }, { "epoch": 0.84, "grad_norm": 0.21265134189137622, "learning_rate": 1.4867420404851307e-05, "loss": 0.7257, "step": 820 }, { "epoch": 0.85, "grad_norm": 0.16998077993496108, "learning_rate": 1.3940405415204416e-05, "loss": 0.7293, "step": 825 }, { "epoch": 0.85, "grad_norm": 0.19227746701204856, "learning_rate": 1.30410610653389e-05, "loss": 0.7578, "step": 830 }, { "epoch": 0.86, "grad_norm": 0.2163980137011639, "learning_rate": 1.2169676520381168e-05, "loss": 0.7201, "step": 835 }, { "epoch": 0.86, "grad_norm": 0.2174614380474134, "learning_rate": 1.1326531955574526e-05, "loss": 0.7393, "step": 840 }, { "epoch": 0.87, "grad_norm": 0.19753139078286383, "learning_rate": 1.0511898466194903e-05, "loss": 0.7164, "step": 845 }, { "epoch": 0.87, "grad_norm": 0.19652538273563885, "learning_rate": 9.726037980385738e-06, "loss": 0.687, "step": 850 }, { "epoch": 0.88, "grad_norm": 0.20725040623745117, "learning_rate": 8.969203174940654e-06, "loss": 0.7006, "step": 855 }, { "epoch": 0.88, "grad_norm": 0.1853922408804101, "learning_rate": 8.24163739406062e-06, "loss": 0.7334, 
"step": 860 }, { "epoch": 0.89, "grad_norm": 0.18542913579514714, "learning_rate": 7.543574571111655e-06, "loss": 0.7387, "step": 865 }, { "epoch": 0.89, "grad_norm": 0.2139733859772643, "learning_rate": 6.875239153408542e-06, "loss": 0.6985, "step": 870 }, { "epoch": 0.9, "grad_norm": 0.22236312870883582, "learning_rate": 6.236846030048604e-06, "loss": 0.6998, "step": 875 }, { "epoch": 0.9, "grad_norm": 0.1849060503719074, "learning_rate": 5.6286004628186675e-06, "loss": 0.7294, "step": 880 }, { "epoch": 0.91, "grad_norm": 0.2090390382331485, "learning_rate": 5.0506980201973974e-06, "loss": 0.7304, "step": 885 }, { "epoch": 0.91, "grad_norm": 0.2030892649120482, "learning_rate": 4.503324514474483e-06, "loss": 0.7548, "step": 890 }, { "epoch": 0.92, "grad_norm": 0.20299856345700878, "learning_rate": 3.986655942006579e-06, "loss": 0.7247, "step": 895 }, { "epoch": 0.92, "grad_norm": 0.2324930931474562, "learning_rate": 3.5008584266294386e-06, "loss": 0.7, "step": 900 }, { "epoch": 0.93, "grad_norm": 0.22219104051624397, "learning_rate": 3.0460881662442763e-06, "loss": 0.7508, "step": 905 }, { "epoch": 0.93, "grad_norm": 0.21326333904242326, "learning_rate": 2.622491382595693e-06, "loss": 0.7225, "step": 910 }, { "epoch": 0.94, "grad_norm": 0.2098358511349563, "learning_rate": 2.2302042742571193e-06, "loss": 0.7132, "step": 915 }, { "epoch": 0.94, "grad_norm": 0.1966565250666515, "learning_rate": 1.869352972839067e-06, "loss": 0.7059, "step": 920 }, { "epoch": 0.95, "grad_norm": 0.2367264112088522, "learning_rate": 1.5400535024342022e-06, "loss": 0.6868, "step": 925 }, { "epoch": 0.95, "grad_norm": 0.23055881813199589, "learning_rate": 1.2424117423122328e-06, "loss": 0.7367, "step": 930 }, { "epoch": 0.96, "grad_norm": 0.1884312004939245, "learning_rate": 9.765233928766493e-07, "loss": 0.7055, "step": 935 }, { "epoch": 0.96, "grad_norm": 0.21767998383781462, "learning_rate": 7.42473944894384e-07, "loss": 0.6955, "step": 940 }, { "epoch": 0.97, "grad_norm": 0.18573736516667685, "learning_rate": 5.403386520079323e-07, "loss": 0.6811, "step": 945 }, { "epoch": 0.97, "grad_norm": 0.1915354126552316, "learning_rate": 3.701825065392184e-07, "loss": 0.7429, "step": 950 }, { "epoch": 0.98, "grad_norm": 0.1852158639659614, "learning_rate": 2.320602185927001e-07, "loss": 0.6799, "step": 955 }, { "epoch": 0.99, "grad_norm": 0.21043187777240224, "learning_rate": 1.2601619846444035e-07, "loss": 0.7613, "step": 960 }, { "epoch": 0.99, "grad_norm": 0.20739430949561452, "learning_rate": 5.208454236296234e-08, "loss": 0.7183, "step": 965 }, { "epoch": 1.0, "grad_norm": 0.22836562458172804, "learning_rate": 1.0289021446308056e-08, "loss": 0.7469, "step": 970 }, { "epoch": 1.0, "eval_loss": 0.7437335252761841, "eval_runtime": 726.8692, "eval_samples_per_second": 9.538, "eval_steps_per_second": 0.597, "step": 974 }, { "epoch": 1.0, "step": 974, "total_flos": 1.1274999981146112e+16, "train_loss": 0.7250236611836255, "train_runtime": 16320.0317, "train_samples_per_second": 3.821, "train_steps_per_second": 0.06 } ], "logging_steps": 5, "max_steps": 974, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 1.1274999981146112e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }