{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9972932091393998, "eval_steps": 100, "global_step": 784, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012737839344001274, "grad_norm": 7.091875076293945, "learning_rate": 3.75e-05, "loss": 37.744, "step": 5 }, { "epoch": 0.02547567868800255, "grad_norm": 2.930401563644409, "learning_rate": 7.5e-05, "loss": 34.0864, "step": 10 }, { "epoch": 0.03821351803200382, "grad_norm": 1.8477588891983032, "learning_rate": 0.0001125, "loss": 31.2726, "step": 15 }, { "epoch": 0.0509513573760051, "grad_norm": 1.3455390930175781, "learning_rate": 0.00015, "loss": 28.3763, "step": 20 }, { "epoch": 0.06368919672000636, "grad_norm": 1.138717532157898, "learning_rate": 0.00018749999999999998, "loss": 26.957, "step": 25 }, { "epoch": 0.07642703606400764, "grad_norm": 0.9747544527053833, "learning_rate": 0.000225, "loss": 24.4616, "step": 30 }, { "epoch": 0.08916487540800892, "grad_norm": 0.9035225510597229, "learning_rate": 0.0002625, "loss": 22.5748, "step": 35 }, { "epoch": 0.1019027147520102, "grad_norm": 0.7786006927490234, "learning_rate": 0.0003, "loss": 20.6574, "step": 40 }, { "epoch": 0.11464055409601147, "grad_norm": 0.7649045586585999, "learning_rate": 0.0003, "loss": 18.9346, "step": 45 }, { "epoch": 0.12737839344001273, "grad_norm": 0.6415356993675232, "learning_rate": 0.0003, "loss": 17.8129, "step": 50 }, { "epoch": 0.140116232784014, "grad_norm": 0.5701594948768616, "learning_rate": 0.0003, "loss": 16.881, "step": 55 }, { "epoch": 0.15285407212801527, "grad_norm": 0.49638187885284424, "learning_rate": 0.0003, "loss": 16.2049, "step": 60 }, { "epoch": 0.16559191147201657, "grad_norm": 0.44346606731414795, "learning_rate": 0.0003, "loss": 15.9336, "step": 65 }, { "epoch": 0.17832975081601785, "grad_norm": 0.4194740355014801, "learning_rate": 0.0003, "loss": 15.2473, "step": 70 }, { "epoch": 0.19106759016001912, "grad_norm": 0.4130041301250458, "learning_rate": 0.0003, "loss": 15.1218, "step": 75 }, { "epoch": 0.2038054295040204, "grad_norm": 0.40480196475982666, "learning_rate": 0.0003, "loss": 14.7839, "step": 80 }, { "epoch": 0.21654326884802166, "grad_norm": 0.394378662109375, "learning_rate": 0.0003, "loss": 14.2312, "step": 85 }, { "epoch": 0.22928110819202294, "grad_norm": 0.39825204014778137, "learning_rate": 0.0003, "loss": 13.9441, "step": 90 }, { "epoch": 0.2420189475360242, "grad_norm": 0.38816991448402405, "learning_rate": 0.0003, "loss": 13.4799, "step": 95 }, { "epoch": 0.25475678688002545, "grad_norm": 0.36586159467697144, "learning_rate": 0.0003, "loss": 13.3276, "step": 100 }, { "epoch": 0.25475678688002545, "eval_accuracy": 0.013133919843597262, "eval_loss": 12.040165901184082, "eval_runtime": 14.4617, "eval_samples_per_second": 17.287, "eval_steps_per_second": 4.356, "step": 100 }, { "epoch": 0.26749462622402675, "grad_norm": 0.40571218729019165, "learning_rate": 0.0003, "loss": 13.1015, "step": 105 }, { "epoch": 0.280232465568028, "grad_norm": 0.3502795696258545, "learning_rate": 0.0003, "loss": 12.614, "step": 110 }, { "epoch": 0.2929703049120293, "grad_norm": 0.33776018023490906, "learning_rate": 0.0003, "loss": 12.488, "step": 115 }, { "epoch": 0.30570814425603055, "grad_norm": 0.3277961015701294, "learning_rate": 0.0003, "loss": 12.2282, "step": 120 }, { "epoch": 0.31844598360003185, "grad_norm": 0.3399854898452759, "learning_rate": 0.0003, "loss": 12.0168, "step": 125 }, { "epoch": 0.33118382294403315, "grad_norm": 0.31557145714759827, "learning_rate": 0.0003, "loss": 11.832, "step": 130 }, { "epoch": 0.3439216622880344, "grad_norm": 0.32902857661247253, "learning_rate": 0.0003, "loss": 11.4818, "step": 135 }, { "epoch": 0.3566595016320357, "grad_norm": 0.34518980979919434, "learning_rate": 0.0003, "loss": 11.3197, "step": 140 }, { "epoch": 0.36939734097603694, "grad_norm": 0.32530176639556885, "learning_rate": 0.0003, "loss": 11.0346, "step": 145 }, { "epoch": 0.38213518032003824, "grad_norm": 0.3253624141216278, "learning_rate": 0.0003, "loss": 10.6717, "step": 150 }, { "epoch": 0.3948730196640395, "grad_norm": 0.33527347445487976, "learning_rate": 0.0003, "loss": 10.5302, "step": 155 }, { "epoch": 0.4076108590080408, "grad_norm": 0.3164774477481842, "learning_rate": 0.0003, "loss": 10.2009, "step": 160 }, { "epoch": 0.420348698352042, "grad_norm": 0.3047502934932709, "learning_rate": 0.0003, "loss": 10.1689, "step": 165 }, { "epoch": 0.4330865376960433, "grad_norm": 0.31613191962242126, "learning_rate": 0.0003, "loss": 9.85, "step": 170 }, { "epoch": 0.4458243770400446, "grad_norm": 0.3114412724971771, "learning_rate": 0.0003, "loss": 9.6662, "step": 175 }, { "epoch": 0.4585622163840459, "grad_norm": 0.31863468885421753, "learning_rate": 0.0003, "loss": 9.4857, "step": 180 }, { "epoch": 0.4713000557280471, "grad_norm": 0.3024883568286896, "learning_rate": 0.0003, "loss": 9.2409, "step": 185 }, { "epoch": 0.4840378950720484, "grad_norm": 0.3118532598018646, "learning_rate": 0.0003, "loss": 9.156, "step": 190 }, { "epoch": 0.49677573441604966, "grad_norm": 0.3026701807975769, "learning_rate": 0.0003, "loss": 9.0273, "step": 195 }, { "epoch": 0.5095135737600509, "grad_norm": 0.3058376908302307, "learning_rate": 0.0003, "loss": 8.9207, "step": 200 }, { "epoch": 0.5095135737600509, "eval_accuracy": 0.03601173020527859, "eval_loss": 8.031224250793457, "eval_runtime": 14.6886, "eval_samples_per_second": 17.02, "eval_steps_per_second": 4.289, "step": 200 }, { "epoch": 0.5222514131040522, "grad_norm": 0.31776145100593567, "learning_rate": 0.0003, "loss": 8.819, "step": 205 }, { "epoch": 0.5349892524480535, "grad_norm": 0.3050650656223297, "learning_rate": 0.0003, "loss": 8.7563, "step": 210 }, { "epoch": 0.5477270917920548, "grad_norm": 0.31346216797828674, "learning_rate": 0.0003, "loss": 8.4781, "step": 215 }, { "epoch": 0.560464931136056, "grad_norm": 0.3162192404270172, "learning_rate": 0.0003, "loss": 8.49, "step": 220 }, { "epoch": 0.5732027704800573, "grad_norm": 0.2908290922641754, "learning_rate": 0.0003, "loss": 8.1487, "step": 225 }, { "epoch": 0.5859406098240586, "grad_norm": 0.29553738236427307, "learning_rate": 0.0003, "loss": 8.2668, "step": 230 }, { "epoch": 0.5986784491680599, "grad_norm": 0.288335919380188, "learning_rate": 0.0003, "loss": 8.1061, "step": 235 }, { "epoch": 0.6114162885120611, "grad_norm": 0.30966615676879883, "learning_rate": 0.0003, "loss": 8.1297, "step": 240 }, { "epoch": 0.6241541278560624, "grad_norm": 0.29941117763519287, "learning_rate": 0.0003, "loss": 7.8082, "step": 245 }, { "epoch": 0.6368919672000637, "grad_norm": 0.29136765003204346, "learning_rate": 0.0003, "loss": 7.937, "step": 250 }, { "epoch": 0.649629806544065, "grad_norm": 0.30150941014289856, "learning_rate": 0.0003, "loss": 7.7454, "step": 255 }, { "epoch": 0.6623676458880663, "grad_norm": 0.28709036111831665, "learning_rate": 0.0003, "loss": 7.8069, "step": 260 }, { "epoch": 0.6751054852320675, "grad_norm": 0.31939393281936646, "learning_rate": 0.0003, "loss": 7.631, "step": 265 }, { "epoch": 0.6878433245760688, "grad_norm": 0.29692211747169495, "learning_rate": 0.0003, "loss": 7.6632, "step": 270 }, { "epoch": 0.7005811639200701, "grad_norm": 0.3304164409637451, "learning_rate": 0.0003, "loss": 7.4727, "step": 275 }, { "epoch": 0.7133190032640714, "grad_norm": 0.28332462906837463, "learning_rate": 0.0003, "loss": 7.4796, "step": 280 }, { "epoch": 0.7260568426080726, "grad_norm": 0.2897827625274658, "learning_rate": 0.0003, "loss": 7.5389, "step": 285 }, { "epoch": 0.7387946819520739, "grad_norm": 0.2887686491012573, "learning_rate": 0.0003, "loss": 7.382, "step": 290 }, { "epoch": 0.7515325212960752, "grad_norm": 0.3093564212322235, "learning_rate": 0.0003, "loss": 7.2586, "step": 295 }, { "epoch": 0.7642703606400765, "grad_norm": 0.2902717590332031, "learning_rate": 0.0003, "loss": 7.2681, "step": 300 }, { "epoch": 0.7642703606400765, "eval_accuracy": 0.050643206256109484, "eval_loss": 6.477533340454102, "eval_runtime": 14.6327, "eval_samples_per_second": 17.085, "eval_steps_per_second": 4.305, "step": 300 }, { "epoch": 0.7770081999840777, "grad_norm": 0.2867899239063263, "learning_rate": 0.0003, "loss": 7.0712, "step": 305 }, { "epoch": 0.789746039328079, "grad_norm": 0.27321040630340576, "learning_rate": 0.0003, "loss": 7.0524, "step": 310 }, { "epoch": 0.8024838786720803, "grad_norm": 0.3487064242362976, "learning_rate": 0.0003, "loss": 7.0939, "step": 315 }, { "epoch": 0.8152217180160816, "grad_norm": 0.329608291387558, "learning_rate": 0.0003, "loss": 6.9997, "step": 320 }, { "epoch": 0.8279595573600828, "grad_norm": 0.3154338300228119, "learning_rate": 0.0003, "loss": 6.9663, "step": 325 }, { "epoch": 0.840697396704084, "grad_norm": 0.31021803617477417, "learning_rate": 0.0003, "loss": 6.7821, "step": 330 }, { "epoch": 0.8534352360480854, "grad_norm": 0.388336181640625, "learning_rate": 0.0003, "loss": 6.7751, "step": 335 }, { "epoch": 0.8661730753920867, "grad_norm": 0.31887954473495483, "learning_rate": 0.0003, "loss": 6.702, "step": 340 }, { "epoch": 0.8789109147360878, "grad_norm": 0.31558957695961, "learning_rate": 0.0003, "loss": 6.6206, "step": 345 }, { "epoch": 0.8916487540800891, "grad_norm": 0.30751529335975647, "learning_rate": 0.0003, "loss": 6.7077, "step": 350 }, { "epoch": 0.9043865934240904, "grad_norm": 0.33058232069015503, "learning_rate": 0.0003, "loss": 6.557, "step": 355 }, { "epoch": 0.9171244327680917, "grad_norm": 0.3375111222267151, "learning_rate": 0.0003, "loss": 6.6369, "step": 360 }, { "epoch": 0.9298622721120929, "grad_norm": 0.3047392964363098, "learning_rate": 0.0003, "loss": 6.5796, "step": 365 }, { "epoch": 0.9426001114560942, "grad_norm": 0.430053174495697, "learning_rate": 0.0003, "loss": 6.5548, "step": 370 }, { "epoch": 0.9553379508000955, "grad_norm": 0.3610515296459198, "learning_rate": 0.0003, "loss": 6.4576, "step": 375 }, { "epoch": 0.9680757901440968, "grad_norm": 0.32095110416412354, "learning_rate": 0.0003, "loss": 6.4266, "step": 380 }, { "epoch": 0.980813629488098, "grad_norm": 0.32170969247817993, "learning_rate": 0.0003, "loss": 6.5597, "step": 385 }, { "epoch": 0.9935514688320993, "grad_norm": 0.29942792654037476, "learning_rate": 0.0003, "loss": 6.3873, "step": 390 }, { "epoch": 1.0062893081761006, "grad_norm": 0.2971299886703491, "learning_rate": 0.0003, "loss": 6.3915, "step": 395 }, { "epoch": 1.0190271475201018, "grad_norm": 0.2800815999507904, "learning_rate": 0.0003, "loss": 6.3187, "step": 400 }, { "epoch": 1.0190271475201018, "eval_accuracy": 0.0433822091886608, "eval_loss": 5.622740268707275, "eval_runtime": 14.4103, "eval_samples_per_second": 17.349, "eval_steps_per_second": 4.372, "step": 400 }, { "epoch": 1.0317649868641032, "grad_norm": 0.28819501399993896, "learning_rate": 0.0003, "loss": 6.328, "step": 405 }, { "epoch": 1.0445028262081044, "grad_norm": 0.3983236849308014, "learning_rate": 0.0003, "loss": 6.3988, "step": 410 }, { "epoch": 1.0572406655521058, "grad_norm": 0.2969406545162201, "learning_rate": 0.0003, "loss": 6.2509, "step": 415 }, { "epoch": 1.069978504896107, "grad_norm": 0.2973212003707886, "learning_rate": 0.0003, "loss": 6.1234, "step": 420 }, { "epoch": 1.0827163442401082, "grad_norm": 0.3298945426940918, "learning_rate": 0.0003, "loss": 6.3219, "step": 425 }, { "epoch": 1.0954541835841096, "grad_norm": 0.3493943214416504, "learning_rate": 0.0003, "loss": 6.0888, "step": 430 }, { "epoch": 1.1081920229281108, "grad_norm": 0.3639209270477295, "learning_rate": 0.0003, "loss": 6.2226, "step": 435 }, { "epoch": 1.120929862272112, "grad_norm": 0.43913957476615906, "learning_rate": 0.0003, "loss": 6.0308, "step": 440 }, { "epoch": 1.1336677016161134, "grad_norm": 0.43267834186553955, "learning_rate": 0.0003, "loss": 6.0806, "step": 445 }, { "epoch": 1.1464055409601146, "grad_norm": 0.4563148021697998, "learning_rate": 0.0003, "loss": 5.9703, "step": 450 }, { "epoch": 1.159143380304116, "grad_norm": 0.4002761244773865, "learning_rate": 0.0003, "loss": 5.9163, "step": 455 }, { "epoch": 1.1718812196481172, "grad_norm": 0.4359826147556305, "learning_rate": 0.0003, "loss": 5.8285, "step": 460 }, { "epoch": 1.1846190589921184, "grad_norm": 0.5450247526168823, "learning_rate": 0.0003, "loss": 5.8063, "step": 465 }, { "epoch": 1.1973568983361198, "grad_norm": 0.3597274422645569, "learning_rate": 0.0003, "loss": 5.6978, "step": 470 }, { "epoch": 1.210094737680121, "grad_norm": 0.4141215980052948, "learning_rate": 0.0003, "loss": 5.6078, "step": 475 }, { "epoch": 1.2228325770241222, "grad_norm": 0.3695543110370636, "learning_rate": 0.0003, "loss": 5.6728, "step": 480 }, { "epoch": 1.2355704163681236, "grad_norm": 0.5060051083564758, "learning_rate": 0.0003, "loss": 5.6049, "step": 485 }, { "epoch": 1.2483082557121248, "grad_norm": 0.5355808138847351, "learning_rate": 0.0003, "loss": 5.6564, "step": 490 }, { "epoch": 1.261046095056126, "grad_norm": 0.4578459858894348, "learning_rate": 0.0003, "loss": 5.5758, "step": 495 }, { "epoch": 1.2737839344001274, "grad_norm": 0.4868403673171997, "learning_rate": 0.0003, "loss": 5.5695, "step": 500 }, { "epoch": 1.2737839344001274, "eval_accuracy": 0.36348778103616813, "eval_loss": 4.77961540222168, "eval_runtime": 14.5581, "eval_samples_per_second": 17.173, "eval_steps_per_second": 4.328, "step": 500 }, { "epoch": 1.2865217737441286, "grad_norm": 0.550255298614502, "learning_rate": 0.0003, "loss": 5.5591, "step": 505 }, { "epoch": 1.29925961308813, "grad_norm": 0.5515110492706299, "learning_rate": 0.0003, "loss": 5.4588, "step": 510 }, { "epoch": 1.3119974524321312, "grad_norm": 0.44656914472579956, "learning_rate": 0.0003, "loss": 5.4336, "step": 515 }, { "epoch": 1.3247352917761326, "grad_norm": 0.5925999283790588, "learning_rate": 0.0003, "loss": 5.5185, "step": 520 }, { "epoch": 1.3374731311201338, "grad_norm": 0.632453203201294, "learning_rate": 0.0003, "loss": 5.325, "step": 525 }, { "epoch": 1.350210970464135, "grad_norm": 0.5380024909973145, "learning_rate": 0.0003, "loss": 5.4005, "step": 530 }, { "epoch": 1.3629488098081364, "grad_norm": 0.5659191012382507, "learning_rate": 0.0003, "loss": 5.3564, "step": 535 }, { "epoch": 1.3756866491521376, "grad_norm": 0.8913821578025818, "learning_rate": 0.0003, "loss": 5.2763, "step": 540 }, { "epoch": 1.3884244884961388, "grad_norm": 0.9271002411842346, "learning_rate": 0.0003, "loss": 5.4129, "step": 545 }, { "epoch": 1.4011623278401402, "grad_norm": 0.7141408324241638, "learning_rate": 0.0003, "loss": 5.4437, "step": 550 }, { "epoch": 1.4139001671841414, "grad_norm": 0.5360827445983887, "learning_rate": 0.0003, "loss": 5.3523, "step": 555 }, { "epoch": 1.4266380065281425, "grad_norm": 0.6563194990158081, "learning_rate": 0.0003, "loss": 5.1103, "step": 560 }, { "epoch": 1.439375845872144, "grad_norm": 0.6325790882110596, "learning_rate": 0.0003, "loss": 5.4026, "step": 565 }, { "epoch": 1.4521136852161451, "grad_norm": 0.8463213443756104, "learning_rate": 0.0003, "loss": 5.3129, "step": 570 }, { "epoch": 1.4648515245601466, "grad_norm": 0.8394812345504761, "learning_rate": 0.0003, "loss": 5.3415, "step": 575 }, { "epoch": 1.4775893639041477, "grad_norm": 0.692244291305542, "learning_rate": 0.0003, "loss": 5.2649, "step": 580 }, { "epoch": 1.4903272032481492, "grad_norm": 0.6197806000709534, "learning_rate": 0.0003, "loss": 5.112, "step": 585 }, { "epoch": 1.5030650425921503, "grad_norm": 0.6573797464370728, "learning_rate": 0.0003, "loss": 5.1669, "step": 590 }, { "epoch": 1.5158028819361515, "grad_norm": 0.795892059803009, "learning_rate": 0.0003, "loss": 5.1693, "step": 595 }, { "epoch": 1.528540721280153, "grad_norm": 0.6279253363609314, "learning_rate": 0.0003, "loss": 5.2926, "step": 600 }, { "epoch": 1.528540721280153, "eval_accuracy": 0.3952492668621701, "eval_loss": 4.392324447631836, "eval_runtime": 14.409, "eval_samples_per_second": 17.35, "eval_steps_per_second": 4.372, "step": 600 }, { "epoch": 1.5412785606241541, "grad_norm": 0.5762287378311157, "learning_rate": 0.0003, "loss": 5.0475, "step": 605 }, { "epoch": 1.5540163999681553, "grad_norm": 0.5149503350257874, "learning_rate": 0.0003, "loss": 5.1185, "step": 610 }, { "epoch": 1.5667542393121567, "grad_norm": 0.581633985042572, "learning_rate": 0.0003, "loss": 5.1166, "step": 615 }, { "epoch": 1.579492078656158, "grad_norm": 0.5910624861717224, "learning_rate": 0.0003, "loss": 4.9907, "step": 620 }, { "epoch": 1.5922299180001591, "grad_norm": 0.8280585408210754, "learning_rate": 0.0003, "loss": 5.0748, "step": 625 }, { "epoch": 1.6049677573441605, "grad_norm": 0.5128599405288696, "learning_rate": 0.0003, "loss": 4.9768, "step": 630 }, { "epoch": 1.6177055966881617, "grad_norm": 0.7540919184684753, "learning_rate": 0.0003, "loss": 5.0806, "step": 635 }, { "epoch": 1.630443436032163, "grad_norm": 0.6239334940910339, "learning_rate": 0.0003, "loss": 5.1277, "step": 640 }, { "epoch": 1.6431812753761643, "grad_norm": 0.7787991166114807, "learning_rate": 0.0003, "loss": 5.0778, "step": 645 }, { "epoch": 1.6559191147201657, "grad_norm": 0.6328299641609192, "learning_rate": 0.0003, "loss": 4.9763, "step": 650 }, { "epoch": 1.668656954064167, "grad_norm": 0.5455794334411621, "learning_rate": 0.0003, "loss": 5.0049, "step": 655 }, { "epoch": 1.681394793408168, "grad_norm": 0.7078703045845032, "learning_rate": 0.0003, "loss": 5.0258, "step": 660 }, { "epoch": 1.6941326327521695, "grad_norm": 0.6339858770370483, "learning_rate": 0.0003, "loss": 5.1028, "step": 665 }, { "epoch": 1.7068704720961707, "grad_norm": 0.6060242652893066, "learning_rate": 0.0003, "loss": 5.0428, "step": 670 }, { "epoch": 1.719608311440172, "grad_norm": 0.9218889474868774, "learning_rate": 0.0003, "loss": 4.9891, "step": 675 }, { "epoch": 1.7323461507841733, "grad_norm": 0.6890697479248047, "learning_rate": 0.0003, "loss": 4.8921, "step": 680 }, { "epoch": 1.7450839901281745, "grad_norm": 0.9093934297561646, "learning_rate": 0.0003, "loss": 4.9385, "step": 685 }, { "epoch": 1.7578218294721757, "grad_norm": 0.5929202437400818, "learning_rate": 0.0003, "loss": 4.9376, "step": 690 }, { "epoch": 1.770559668816177, "grad_norm": 0.6317362189292908, "learning_rate": 0.0003, "loss": 4.9681, "step": 695 }, { "epoch": 1.7832975081601783, "grad_norm": 0.5537763237953186, "learning_rate": 0.0003, "loss": 4.878, "step": 700 }, { "epoch": 1.7832975081601783, "eval_accuracy": 0.40849266862170086, "eval_loss": 4.187656402587891, "eval_runtime": 14.4745, "eval_samples_per_second": 17.272, "eval_steps_per_second": 4.352, "step": 700 }, { "epoch": 1.7960353475041795, "grad_norm": 0.5984592437744141, "learning_rate": 0.0003, "loss": 4.9092, "step": 705 }, { "epoch": 1.808773186848181, "grad_norm": 0.5060558915138245, "learning_rate": 0.0003, "loss": 4.989, "step": 710 }, { "epoch": 1.8215110261921823, "grad_norm": 0.8713288903236389, "learning_rate": 0.0003, "loss": 4.8114, "step": 715 }, { "epoch": 1.8342488655361833, "grad_norm": 0.8011664748191833, "learning_rate": 0.0003, "loss": 4.8468, "step": 720 }, { "epoch": 1.8469867048801847, "grad_norm": 0.6774628758430481, "learning_rate": 0.0003, "loss": 4.8899, "step": 725 }, { "epoch": 1.859724544224186, "grad_norm": 1.05668044090271, "learning_rate": 0.0003, "loss": 4.8676, "step": 730 }, { "epoch": 1.8724623835681873, "grad_norm": 0.8638430237770081, "learning_rate": 0.0003, "loss": 4.8515, "step": 735 }, { "epoch": 1.8852002229121885, "grad_norm": 0.8210180997848511, "learning_rate": 0.0003, "loss": 4.9094, "step": 740 }, { "epoch": 1.8979380622561899, "grad_norm": 0.6894564032554626, "learning_rate": 0.0003, "loss": 4.8406, "step": 745 }, { "epoch": 1.910675901600191, "grad_norm": 0.7244303822517395, "learning_rate": 0.0003, "loss": 4.8299, "step": 750 }, { "epoch": 1.9234137409441923, "grad_norm": 0.5788025856018066, "learning_rate": 0.0003, "loss": 4.8843, "step": 755 }, { "epoch": 1.9361515802881937, "grad_norm": 0.5082942843437195, "learning_rate": 0.0003, "loss": 4.7624, "step": 760 }, { "epoch": 1.9488894196321949, "grad_norm": 0.6290297508239746, "learning_rate": 0.0003, "loss": 4.7709, "step": 765 }, { "epoch": 1.961627258976196, "grad_norm": 0.5582670569419861, "learning_rate": 0.0003, "loss": 4.7169, "step": 770 }, { "epoch": 1.9743650983201975, "grad_norm": 0.6051950454711914, "learning_rate": 0.0003, "loss": 4.7701, "step": 775 }, { "epoch": 1.9871029376641989, "grad_norm": 0.6427810788154602, "learning_rate": 0.0003, "loss": 4.7729, "step": 780 }, { "epoch": 1.9972932091393998, "step": 784, "total_flos": 6.247688798679859e+16, "train_loss": 8.457253451250038, "train_runtime": 65851.1603, "train_samples_per_second": 1.526, "train_steps_per_second": 0.012 } ], "logging_steps": 5, "max_steps": 784, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "total_flos": 6.247688798679859e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }