{ "best_metric": null, "best_model_checkpoint": null, "epoch": 19.995409685563462, "eval_steps": 500, "global_step": 21780, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09180628873077806, "grad_norm": 0.3286115229129791, "learning_rate": 0.0003, "loss": 1.8271, "step": 100 }, { "epoch": 0.18361257746155613, "grad_norm": 0.401665598154068, "learning_rate": 0.0003, "loss": 1.7773, "step": 200 }, { "epoch": 0.2754188661923342, "grad_norm": 0.27676922082901, "learning_rate": 0.0003, "loss": 1.7687, "step": 300 }, { "epoch": 0.36722515492311225, "grad_norm": 0.3495130240917206, "learning_rate": 0.0003, "loss": 1.7648, "step": 400 }, { "epoch": 0.4590314436538903, "grad_norm": 0.3620341718196869, "learning_rate": 0.0003, "loss": 1.7494, "step": 500 }, { "epoch": 0.5508377323846684, "grad_norm": 0.3991380035877228, "learning_rate": 0.0003, "loss": 1.7729, "step": 600 }, { "epoch": 0.6426440211154464, "grad_norm": 0.4441532492637634, "learning_rate": 0.0003, "loss": 1.7407, "step": 700 }, { "epoch": 0.7344503098462245, "grad_norm": 0.38743725419044495, "learning_rate": 0.0003, "loss": 1.7537, "step": 800 }, { "epoch": 0.8262565985770025, "grad_norm": 0.42725738883018494, "learning_rate": 0.0003, "loss": 1.7377, "step": 900 }, { "epoch": 0.9180628873077806, "grad_norm": 0.43618279695510864, "learning_rate": 0.0003, "loss": 1.7567, "step": 1000 }, { "epoch": 0.999770484278173, "eval_accuracy": 0.5190476190476191, "eval_loss": 2.262606382369995, "eval_runtime": 5.8101, "eval_samples_per_second": 86.057, "eval_steps_per_second": 10.843, "step": 1089 }, { "epoch": 1.0098691760385587, "grad_norm": 0.4314129650592804, "learning_rate": 0.0003, "loss": 1.7239, "step": 1100 }, { "epoch": 1.1016754647693368, "grad_norm": 0.5108153820037842, "learning_rate": 0.0003, "loss": 1.5348, "step": 1200 }, { "epoch": 1.1934817535001148, "grad_norm": 0.4964566230773926, "learning_rate": 0.0003, "loss": 1.5236, "step": 1300 }, { "epoch": 1.2852880422308928, "grad_norm": 0.4912070035934448, "learning_rate": 0.0003, "loss": 1.5342, "step": 1400 }, { "epoch": 1.377094330961671, "grad_norm": 0.5223585367202759, "learning_rate": 0.0003, "loss": 1.5405, "step": 1500 }, { "epoch": 1.468900619692449, "grad_norm": 0.4836246371269226, "learning_rate": 0.0003, "loss": 1.546, "step": 1600 }, { "epoch": 1.560706908423227, "grad_norm": 0.6014620661735535, "learning_rate": 0.0003, "loss": 1.5504, "step": 1700 }, { "epoch": 1.652513197154005, "grad_norm": 0.47756990790367126, "learning_rate": 0.0003, "loss": 1.5461, "step": 1800 }, { "epoch": 1.744319485884783, "grad_norm": 0.5226470828056335, "learning_rate": 0.0003, "loss": 1.5502, "step": 1900 }, { "epoch": 1.836125774615561, "grad_norm": 0.5086384415626526, "learning_rate": 0.0003, "loss": 1.5474, "step": 2000 }, { "epoch": 1.9279320633463393, "grad_norm": 0.5471687912940979, "learning_rate": 0.0003, "loss": 1.5617, "step": 2100 }, { "epoch": 1.999540968556346, "eval_accuracy": 0.5246031746031746, "eval_loss": 2.2435803413391113, "eval_runtime": 5.3712, "eval_samples_per_second": 93.09, "eval_steps_per_second": 11.729, "step": 2178 }, { "epoch": 2.0197383520771175, "grad_norm": 0.5887908935546875, "learning_rate": 0.0003, "loss": 1.4881, "step": 2200 }, { "epoch": 2.1115446408078955, "grad_norm": 0.6036558151245117, "learning_rate": 0.0003, "loss": 1.267, "step": 2300 }, { "epoch": 2.2033509295386735, "grad_norm": 0.724539041519165, "learning_rate": 0.0003, "loss": 1.2882, "step": 2400 }, { "epoch": 2.2951572182694515, "grad_norm": 0.6767842173576355, "learning_rate": 0.0003, "loss": 1.2945, "step": 2500 }, { "epoch": 2.3869635070002295, "grad_norm": 0.5520135760307312, "learning_rate": 0.0003, "loss": 1.3038, "step": 2600 }, { "epoch": 2.4787697957310075, "grad_norm": 0.6388158798217773, "learning_rate": 0.0003, "loss": 1.3132, "step": 2700 }, { "epoch": 2.5705760844617855, "grad_norm": 0.6712493896484375, "learning_rate": 0.0003, "loss": 1.3205, "step": 2800 }, { "epoch": 2.6623823731925635, "grad_norm": 0.6082379221916199, "learning_rate": 0.0003, "loss": 1.3272, "step": 2900 }, { "epoch": 2.754188661923342, "grad_norm": 0.6824610233306885, "learning_rate": 0.0003, "loss": 1.3514, "step": 3000 }, { "epoch": 2.84599495065412, "grad_norm": 0.6347182989120483, "learning_rate": 0.0003, "loss": 1.3454, "step": 3100 }, { "epoch": 2.937801239384898, "grad_norm": 0.6638208627700806, "learning_rate": 0.0003, "loss": 1.343, "step": 3200 }, { "epoch": 2.9993114528345193, "eval_accuracy": 0.5236825396825396, "eval_loss": 2.3385279178619385, "eval_runtime": 5.417, "eval_samples_per_second": 92.301, "eval_steps_per_second": 11.63, "step": 3267 }, { "epoch": 3.029607528115676, "grad_norm": 0.8752885460853577, "learning_rate": 0.0003, "loss": 1.264, "step": 3300 }, { "epoch": 3.121413816846454, "grad_norm": 0.8712926506996155, "learning_rate": 0.0003, "loss": 1.0576, "step": 3400 }, { "epoch": 3.213220105577232, "grad_norm": 0.9298152327537537, "learning_rate": 0.0003, "loss": 1.0825, "step": 3500 }, { "epoch": 3.30502639430801, "grad_norm": 0.9302284717559814, "learning_rate": 0.0003, "loss": 1.0984, "step": 3600 }, { "epoch": 3.396832683038788, "grad_norm": 0.9003168940544128, "learning_rate": 0.0003, "loss": 1.112, "step": 3700 }, { "epoch": 3.488638971769566, "grad_norm": 0.8025063276290894, "learning_rate": 0.0003, "loss": 1.1348, "step": 3800 }, { "epoch": 3.580445260500344, "grad_norm": 0.9571109414100647, "learning_rate": 0.0003, "loss": 1.1117, "step": 3900 }, { "epoch": 3.672251549231122, "grad_norm": 0.8698576092720032, "learning_rate": 0.0003, "loss": 1.1431, "step": 4000 }, { "epoch": 3.7640578379619005, "grad_norm": 0.9028270244598389, "learning_rate": 0.0003, "loss": 1.1455, "step": 4100 }, { "epoch": 3.8558641266926785, "grad_norm": 0.851496696472168, "learning_rate": 0.0003, "loss": 1.1769, "step": 4200 }, { "epoch": 3.9476704154234565, "grad_norm": 0.8378119468688965, "learning_rate": 0.0003, "loss": 1.1682, "step": 4300 }, { "epoch": 4.0, "eval_accuracy": 0.5215238095238095, "eval_loss": 2.499518394470215, "eval_runtime": 5.6608, "eval_samples_per_second": 88.327, "eval_steps_per_second": 11.129, "step": 4357 }, { "epoch": 4.039476704154235, "grad_norm": 0.7991979718208313, "learning_rate": 0.0003, "loss": 1.0485, "step": 4400 }, { "epoch": 4.131282992885013, "grad_norm": 0.7637009024620056, "learning_rate": 0.0003, "loss": 0.8982, "step": 4500 }, { "epoch": 4.223089281615791, "grad_norm": 0.8598212003707886, "learning_rate": 0.0003, "loss": 0.9252, "step": 4600 }, { "epoch": 4.314895570346569, "grad_norm": 0.8812803030014038, "learning_rate": 0.0003, "loss": 0.9372, "step": 4700 }, { "epoch": 4.406701859077347, "grad_norm": 0.8750812411308289, "learning_rate": 0.0003, "loss": 0.9422, "step": 4800 }, { "epoch": 4.498508147808125, "grad_norm": 1.0362316370010376, "learning_rate": 0.0003, "loss": 0.9617, "step": 4900 }, { "epoch": 4.590314436538903, "grad_norm": 1.1548477411270142, "learning_rate": 0.0003, "loss": 0.9721, "step": 5000 }, { "epoch": 4.682120725269681, "grad_norm": 1.0806621313095093, "learning_rate": 0.0003, "loss": 0.9884, "step": 5100 }, { "epoch": 4.773927014000459, "grad_norm": 0.8893236517906189, "learning_rate": 0.0003, "loss": 0.9934, "step": 5200 }, { "epoch": 4.865733302731237, "grad_norm": 0.9162046313285828, "learning_rate": 0.0003, "loss": 1.0058, "step": 5300 }, { "epoch": 4.957539591462015, "grad_norm": 0.8382129073143005, "learning_rate": 0.0003, "loss": 1.0141, "step": 5400 }, { "epoch": 4.999770484278173, "eval_accuracy": 0.5181587301587302, "eval_loss": 2.6397223472595215, "eval_runtime": 5.7176, "eval_samples_per_second": 87.45, "eval_steps_per_second": 11.019, "step": 5446 }, { "epoch": 5.049345880192793, "grad_norm": 0.8659687042236328, "learning_rate": 0.0003, "loss": 0.8863, "step": 5500 }, { "epoch": 5.141152168923571, "grad_norm": 0.9213656187057495, "learning_rate": 0.0003, "loss": 0.7697, "step": 5600 }, { "epoch": 5.232958457654349, "grad_norm": 1.1257935762405396, "learning_rate": 0.0003, "loss": 0.7966, "step": 5700 }, { "epoch": 5.324764746385127, "grad_norm": 0.9488617777824402, "learning_rate": 0.0003, "loss": 0.7945, "step": 5800 }, { "epoch": 5.416571035115905, "grad_norm": 1.0306527614593506, "learning_rate": 0.0003, "loss": 0.8215, "step": 5900 }, { "epoch": 5.508377323846684, "grad_norm": 1.0242127180099487, "learning_rate": 0.0003, "loss": 0.8387, "step": 6000 }, { "epoch": 5.600183612577462, "grad_norm": 1.0761654376983643, "learning_rate": 0.0003, "loss": 0.8484, "step": 6100 }, { "epoch": 5.69198990130824, "grad_norm": 1.1436479091644287, "learning_rate": 0.0003, "loss": 0.8564, "step": 6200 }, { "epoch": 5.783796190039018, "grad_norm": 0.9349524974822998, "learning_rate": 0.0003, "loss": 0.8736, "step": 6300 }, { "epoch": 5.875602478769796, "grad_norm": 0.9073782563209534, "learning_rate": 0.0003, "loss": 0.8956, "step": 6400 }, { "epoch": 5.967408767500574, "grad_norm": 1.085017442703247, "learning_rate": 0.0003, "loss": 0.9023, "step": 6500 }, { "epoch": 5.999540968556346, "eval_accuracy": 0.516984126984127, "eval_loss": 2.7929389476776123, "eval_runtime": 6.1646, "eval_samples_per_second": 81.108, "eval_steps_per_second": 10.22, "step": 6535 }, { "epoch": 6.059215056231352, "grad_norm": 0.881386935710907, "learning_rate": 0.0003, "loss": 0.7462, "step": 6600 }, { "epoch": 6.15102134496213, "grad_norm": 1.0695873498916626, "learning_rate": 0.0003, "loss": 0.6751, "step": 6700 }, { "epoch": 6.242827633692908, "grad_norm": 0.9458519816398621, "learning_rate": 0.0003, "loss": 0.7007, "step": 6800 }, { "epoch": 6.334633922423686, "grad_norm": 1.045708179473877, "learning_rate": 0.0003, "loss": 0.7105, "step": 6900 }, { "epoch": 6.426440211154464, "grad_norm": 0.9866082668304443, "learning_rate": 0.0003, "loss": 0.7335, "step": 7000 }, { "epoch": 6.518246499885242, "grad_norm": 0.9309425950050354, "learning_rate": 0.0003, "loss": 0.7354, "step": 7100 }, { "epoch": 6.61005278861602, "grad_norm": 0.921173632144928, "learning_rate": 0.0003, "loss": 0.7593, "step": 7200 }, { "epoch": 6.701859077346798, "grad_norm": 1.2775295972824097, "learning_rate": 0.0003, "loss": 0.7603, "step": 7300 }, { "epoch": 6.793665366077576, "grad_norm": 1.1141977310180664, "learning_rate": 0.0003, "loss": 0.7792, "step": 7400 }, { "epoch": 6.885471654808354, "grad_norm": 1.1323295831680298, "learning_rate": 0.0003, "loss": 0.7966, "step": 7500 }, { "epoch": 6.977277943539132, "grad_norm": 1.2780287265777588, "learning_rate": 0.0003, "loss": 0.8008, "step": 7600 }, { "epoch": 6.999311452834519, "eval_accuracy": 0.5162222222222222, "eval_loss": 2.8233399391174316, "eval_runtime": 5.472, "eval_samples_per_second": 91.374, "eval_steps_per_second": 11.513, "step": 7624 }, { "epoch": 7.06908423226991, "grad_norm": 1.0170750617980957, "learning_rate": 0.0003, "loss": 0.6451, "step": 7700 }, { "epoch": 7.160890521000688, "grad_norm": 0.8980923891067505, "learning_rate": 0.0003, "loss": 0.6104, "step": 7800 }, { "epoch": 7.252696809731467, "grad_norm": 1.0722894668579102, "learning_rate": 0.0003, "loss": 0.6217, "step": 7900 }, { "epoch": 7.344503098462245, "grad_norm": 1.0610452890396118, "learning_rate": 0.0003, "loss": 0.6405, "step": 8000 }, { "epoch": 7.436309387193023, "grad_norm": 0.9819157123565674, "learning_rate": 0.0003, "loss": 0.6599, "step": 8100 }, { "epoch": 7.528115675923801, "grad_norm": 1.0208162069320679, "learning_rate": 0.0003, "loss": 0.6663, "step": 8200 }, { "epoch": 7.619921964654579, "grad_norm": 1.1422516107559204, "learning_rate": 0.0003, "loss": 0.6826, "step": 8300 }, { "epoch": 7.711728253385357, "grad_norm": 1.179495930671692, "learning_rate": 0.0003, "loss": 0.7054, "step": 8400 }, { "epoch": 7.803534542116135, "grad_norm": 1.155306100845337, "learning_rate": 0.0003, "loss": 0.7126, "step": 8500 }, { "epoch": 7.895340830846913, "grad_norm": 1.1133177280426025, "learning_rate": 0.0003, "loss": 0.7246, "step": 8600 }, { "epoch": 7.987147119577691, "grad_norm": 1.1150102615356445, "learning_rate": 0.0003, "loss": 0.7377, "step": 8700 }, { "epoch": 8.0, "eval_accuracy": 0.5180317460317461, "eval_loss": 2.883272886276245, "eval_runtime": 5.7501, "eval_samples_per_second": 86.955, "eval_steps_per_second": 10.956, "step": 8714 }, { "epoch": 8.07895340830847, "grad_norm": 0.9108123779296875, "learning_rate": 0.0003, "loss": 0.5637, "step": 8800 }, { "epoch": 8.170759697039248, "grad_norm": 1.1592975854873657, "learning_rate": 0.0003, "loss": 0.5671, "step": 8900 }, { "epoch": 8.262565985770026, "grad_norm": 1.1172692775726318, "learning_rate": 0.0003, "loss": 0.5708, "step": 9000 }, { "epoch": 8.354372274500804, "grad_norm": 1.0453649759292603, "learning_rate": 0.0003, "loss": 0.587, "step": 9100 }, { "epoch": 8.446178563231582, "grad_norm": 1.0275832414627075, "learning_rate": 0.0003, "loss": 0.6085, "step": 9200 }, { "epoch": 8.53798485196236, "grad_norm": 1.0241683721542358, "learning_rate": 0.0003, "loss": 0.6214, "step": 9300 }, { "epoch": 8.629791140693138, "grad_norm": 1.1279902458190918, "learning_rate": 0.0003, "loss": 0.6232, "step": 9400 }, { "epoch": 8.721597429423916, "grad_norm": 1.143254041671753, "learning_rate": 0.0003, "loss": 0.6446, "step": 9500 }, { "epoch": 8.813403718154694, "grad_norm": 1.138592004776001, "learning_rate": 0.0003, "loss": 0.6472, "step": 9600 }, { "epoch": 8.905210006885472, "grad_norm": 1.1288646459579468, "learning_rate": 0.0003, "loss": 0.6801, "step": 9700 }, { "epoch": 8.99701629561625, "grad_norm": 1.1128137111663818, "learning_rate": 0.0003, "loss": 0.6732, "step": 9800 }, { "epoch": 8.999770484278173, "eval_accuracy": 0.5165079365079365, "eval_loss": 2.9550092220306396, "eval_runtime": 5.5706, "eval_samples_per_second": 89.756, "eval_steps_per_second": 11.309, "step": 9803 }, { "epoch": 9.088822584347028, "grad_norm": 1.0105735063552856, "learning_rate": 0.0003, "loss": 0.5071, "step": 9900 }, { "epoch": 9.180628873077806, "grad_norm": 1.0888274908065796, "learning_rate": 0.0003, "loss": 0.5164, "step": 10000 }, { "epoch": 9.272435161808584, "grad_norm": 1.067857027053833, "learning_rate": 0.0003, "loss": 0.5311, "step": 10100 }, { "epoch": 9.364241450539362, "grad_norm": 1.0484533309936523, "learning_rate": 0.0003, "loss": 0.5501, "step": 10200 }, { "epoch": 9.45604773927014, "grad_norm": 1.0611076354980469, "learning_rate": 0.0003, "loss": 0.5635, "step": 10300 }, { "epoch": 9.547854028000918, "grad_norm": 1.1622737646102905, "learning_rate": 0.0003, "loss": 0.5713, "step": 10400 }, { "epoch": 9.639660316731696, "grad_norm": 1.272035837173462, "learning_rate": 0.0003, "loss": 0.5933, "step": 10500 }, { "epoch": 9.731466605462474, "grad_norm": 1.2226234674453735, "learning_rate": 0.0003, "loss": 0.5939, "step": 10600 }, { "epoch": 9.823272894193252, "grad_norm": 1.197426199913025, "learning_rate": 0.0003, "loss": 0.6289, "step": 10700 }, { "epoch": 9.91507918292403, "grad_norm": 1.2565838098526, "learning_rate": 0.0003, "loss": 0.6225, "step": 10800 }, { "epoch": 9.999540968556346, "eval_accuracy": 0.5165079365079365, "eval_loss": 2.9767260551452637, "eval_runtime": 5.7303, "eval_samples_per_second": 87.255, "eval_steps_per_second": 10.994, "step": 10892 }, { "epoch": 10.006885471654808, "grad_norm": 1.0739877223968506, "learning_rate": 0.0003, "loss": 0.6213, "step": 10900 }, { "epoch": 10.098691760385586, "grad_norm": 1.0812422037124634, "learning_rate": 0.0003, "loss": 0.4637, "step": 11000 }, { "epoch": 10.190498049116364, "grad_norm": 1.0868052244186401, "learning_rate": 0.0003, "loss": 0.485, "step": 11100 }, { "epoch": 10.282304337847142, "grad_norm": 1.1750460863113403, "learning_rate": 0.0003, "loss": 0.5093, "step": 11200 }, { "epoch": 10.37411062657792, "grad_norm": 1.2779016494750977, "learning_rate": 0.0003, "loss": 0.5184, "step": 11300 }, { "epoch": 10.465916915308698, "grad_norm": 1.2549773454666138, "learning_rate": 0.0003, "loss": 0.5342, "step": 11400 }, { "epoch": 10.557723204039476, "grad_norm": 1.008012294769287, "learning_rate": 0.0003, "loss": 0.5409, "step": 11500 }, { "epoch": 10.649529492770254, "grad_norm": 1.1311882734298706, "learning_rate": 0.0003, "loss": 0.5569, "step": 11600 }, { "epoch": 10.741335781501032, "grad_norm": 1.20146644115448, "learning_rate": 0.0003, "loss": 0.5644, "step": 11700 }, { "epoch": 10.83314207023181, "grad_norm": 1.1888364553451538, "learning_rate": 0.0003, "loss": 0.5914, "step": 11800 }, { "epoch": 10.924948358962588, "grad_norm": 1.1159058809280396, "learning_rate": 0.0003, "loss": 0.5858, "step": 11900 }, { "epoch": 10.999311452834519, "eval_accuracy": 0.5165396825396825, "eval_loss": 3.011744737625122, "eval_runtime": 5.5804, "eval_samples_per_second": 89.599, "eval_steps_per_second": 11.29, "step": 11981 }, { "epoch": 11.016754647693366, "grad_norm": 0.9989836812019348, "learning_rate": 0.0003, "loss": 0.5703, "step": 12000 }, { "epoch": 11.108560936424144, "grad_norm": 1.2236177921295166, "learning_rate": 0.0003, "loss": 0.4423, "step": 12100 }, { "epoch": 11.200367225154924, "grad_norm": 1.0680415630340576, "learning_rate": 0.0003, "loss": 0.4644, "step": 12200 }, { "epoch": 11.292173513885702, "grad_norm": 1.3059639930725098, "learning_rate": 0.0003, "loss": 0.4797, "step": 12300 }, { "epoch": 11.38397980261648, "grad_norm": 1.1068886518478394, "learning_rate": 0.0003, "loss": 0.4968, "step": 12400 }, { "epoch": 11.475786091347258, "grad_norm": 1.2058519124984741, "learning_rate": 0.0003, "loss": 0.5027, "step": 12500 }, { "epoch": 11.567592380078036, "grad_norm": 1.222372055053711, "learning_rate": 0.0003, "loss": 0.5157, "step": 12600 }, { "epoch": 11.659398668808814, "grad_norm": 1.2184628248214722, "learning_rate": 0.0003, "loss": 0.531, "step": 12700 }, { "epoch": 11.751204957539592, "grad_norm": 1.1435773372650146, "learning_rate": 0.0003, "loss": 0.5436, "step": 12800 }, { "epoch": 11.84301124627037, "grad_norm": 1.3586217164993286, "learning_rate": 0.0003, "loss": 0.5591, "step": 12900 }, { "epoch": 11.934817535001148, "grad_norm": 1.2372316122055054, "learning_rate": 0.0003, "loss": 0.5618, "step": 13000 }, { "epoch": 12.0, "eval_accuracy": 0.517015873015873, "eval_loss": 3.031684160232544, "eval_runtime": 5.7443, "eval_samples_per_second": 87.043, "eval_steps_per_second": 10.967, "step": 13071 }, { "epoch": 12.026623823731926, "grad_norm": 1.0793490409851074, "learning_rate": 0.0003, "loss": 0.531, "step": 13100 }, { "epoch": 12.118430112462704, "grad_norm": 1.0374959707260132, "learning_rate": 0.0003, "loss": 0.4188, "step": 13200 }, { "epoch": 12.210236401193482, "grad_norm": 1.1079918146133423, "learning_rate": 0.0003, "loss": 0.4472, "step": 13300 }, { "epoch": 12.30204268992426, "grad_norm": 1.2988286018371582, "learning_rate": 0.0003, "loss": 0.4579, "step": 13400 }, { "epoch": 12.393848978655038, "grad_norm": 1.110042691230774, "learning_rate": 0.0003, "loss": 0.4741, "step": 13500 }, { "epoch": 12.485655267385816, "grad_norm": 1.2625929117202759, "learning_rate": 0.0003, "loss": 0.4879, "step": 13600 }, { "epoch": 12.577461556116594, "grad_norm": 1.1651537418365479, "learning_rate": 0.0003, "loss": 0.4988, "step": 13700 }, { "epoch": 12.669267844847372, "grad_norm": 1.2762588262557983, "learning_rate": 0.0003, "loss": 0.5068, "step": 13800 }, { "epoch": 12.76107413357815, "grad_norm": 1.3166848421096802, "learning_rate": 0.0003, "loss": 0.5279, "step": 13900 }, { "epoch": 12.852880422308928, "grad_norm": 1.3245456218719482, "learning_rate": 0.0003, "loss": 0.5316, "step": 14000 }, { "epoch": 12.944686711039706, "grad_norm": 1.2448067665100098, "learning_rate": 0.0003, "loss": 0.5464, "step": 14100 }, { "epoch": 12.999770484278173, "eval_accuracy": 0.5166984126984127, "eval_loss": 3.0685949325561523, "eval_runtime": 5.5092, "eval_samples_per_second": 90.758, "eval_steps_per_second": 11.435, "step": 14160 }, { "epoch": 13.036492999770484, "grad_norm": 1.0070549249649048, "learning_rate": 0.0003, "loss": 0.4909, "step": 14200 }, { "epoch": 13.128299288501262, "grad_norm": 1.078823447227478, "learning_rate": 0.0003, "loss": 0.4126, "step": 14300 }, { "epoch": 13.22010557723204, "grad_norm": 1.2008830308914185, "learning_rate": 0.0003, "loss": 0.4279, "step": 14400 }, { "epoch": 13.311911865962818, "grad_norm": 1.258834958076477, "learning_rate": 0.0003, "loss": 0.445, "step": 14500 }, { "epoch": 13.403718154693596, "grad_norm": 1.2432317733764648, "learning_rate": 0.0003, "loss": 0.4563, "step": 14600 }, { "epoch": 13.495524443424374, "grad_norm": 1.2906520366668701, "learning_rate": 0.0003, "loss": 0.47, "step": 14700 }, { "epoch": 13.587330732155152, "grad_norm": 1.2167085409164429, "learning_rate": 0.0003, "loss": 0.481, "step": 14800 }, { "epoch": 13.67913702088593, "grad_norm": 1.3750929832458496, "learning_rate": 0.0003, "loss": 0.4949, "step": 14900 }, { "epoch": 13.770943309616708, "grad_norm": 1.3509438037872314, "learning_rate": 0.0003, "loss": 0.5036, "step": 15000 }, { "epoch": 13.862749598347486, "grad_norm": 1.2090297937393188, "learning_rate": 0.0003, "loss": 0.5101, "step": 15100 }, { "epoch": 13.954555887078264, "grad_norm": 1.3497878313064575, "learning_rate": 0.0003, "loss": 0.5243, "step": 15200 }, { "epoch": 13.999540968556346, "eval_accuracy": 0.5148571428571429, "eval_loss": 3.0829057693481445, "eval_runtime": 5.7223, "eval_samples_per_second": 87.377, "eval_steps_per_second": 11.01, "step": 15249 }, { "epoch": 14.046362175809042, "grad_norm": 1.0588971376419067, "learning_rate": 0.0003, "loss": 0.458, "step": 15300 }, { "epoch": 14.13816846453982, "grad_norm": 1.2202554941177368, "learning_rate": 0.0003, "loss": 0.3978, "step": 15400 }, { "epoch": 14.229974753270598, "grad_norm": 1.2965370416641235, "learning_rate": 0.0003, "loss": 0.4225, "step": 15500 }, { "epoch": 14.321781042001376, "grad_norm": 1.2187902927398682, "learning_rate": 0.0003, "loss": 0.4312, "step": 15600 }, { "epoch": 14.413587330732156, "grad_norm": 1.2526473999023438, "learning_rate": 0.0003, "loss": 0.4479, "step": 15700 }, { "epoch": 14.505393619462934, "grad_norm": 1.1553575992584229, "learning_rate": 0.0003, "loss": 0.4589, "step": 15800 }, { "epoch": 14.597199908193712, "grad_norm": 1.2674176692962646, "learning_rate": 0.0003, "loss": 0.4654, "step": 15900 }, { "epoch": 14.68900619692449, "grad_norm": 1.2161575555801392, "learning_rate": 0.0003, "loss": 0.475, "step": 16000 }, { "epoch": 14.780812485655268, "grad_norm": 1.078759789466858, "learning_rate": 0.0003, "loss": 0.4919, "step": 16100 }, { "epoch": 14.872618774386046, "grad_norm": 1.3057931661605835, "learning_rate": 0.0003, "loss": 0.4965, "step": 16200 }, { "epoch": 14.964425063116824, "grad_norm": 1.212488055229187, "learning_rate": 0.0003, "loss": 0.5066, "step": 16300 }, { "epoch": 14.999311452834519, "eval_accuracy": 0.5127301587301587, "eval_loss": 3.0957658290863037, "eval_runtime": 6.0214, "eval_samples_per_second": 83.037, "eval_steps_per_second": 10.463, "step": 16338 }, { "epoch": 15.056231351847602, "grad_norm": 0.9950692653656006, "learning_rate": 0.0003, "loss": 0.4266, "step": 16400 }, { "epoch": 15.14803764057838, "grad_norm": 1.0924224853515625, "learning_rate": 0.0003, "loss": 0.3937, "step": 16500 }, { "epoch": 15.239843929309158, "grad_norm": 1.1368046998977661, "learning_rate": 0.0003, "loss": 0.4018, "step": 16600 }, { "epoch": 15.331650218039936, "grad_norm": 1.1744548082351685, "learning_rate": 0.0003, "loss": 0.4195, "step": 16700 }, { "epoch": 15.423456506770714, "grad_norm": 1.1728862524032593, "learning_rate": 0.0003, "loss": 0.4324, "step": 16800 }, { "epoch": 15.515262795501492, "grad_norm": 1.2837214469909668, "learning_rate": 0.0003, "loss": 0.4444, "step": 16900 }, { "epoch": 15.60706908423227, "grad_norm": 1.2437090873718262, "learning_rate": 0.0003, "loss": 0.4551, "step": 17000 }, { "epoch": 15.698875372963048, "grad_norm": 1.2437779903411865, "learning_rate": 0.0003, "loss": 0.4676, "step": 17100 }, { "epoch": 15.790681661693826, "grad_norm": 1.182637095451355, "learning_rate": 0.0003, "loss": 0.478, "step": 17200 }, { "epoch": 15.882487950424604, "grad_norm": 1.199127435684204, "learning_rate": 0.0003, "loss": 0.4881, "step": 17300 }, { "epoch": 15.974294239155382, "grad_norm": 1.251943826675415, "learning_rate": 0.0003, "loss": 0.4947, "step": 17400 }, { "epoch": 16.0, "eval_accuracy": 0.5152698412698413, "eval_loss": 3.0920920372009277, "eval_runtime": 5.478, "eval_samples_per_second": 91.273, "eval_steps_per_second": 11.5, "step": 17428 }, { "epoch": 16.06610052788616, "grad_norm": 1.2216734886169434, "learning_rate": 0.0003, "loss": 0.4103, "step": 17500 }, { "epoch": 16.15790681661694, "grad_norm": 1.1105889081954956, "learning_rate": 0.0003, "loss": 0.3818, "step": 17600 }, { "epoch": 16.249713105347716, "grad_norm": 1.1674516201019287, "learning_rate": 0.0003, "loss": 0.3983, "step": 17700 }, { "epoch": 16.341519394078496, "grad_norm": 1.0942326784133911, "learning_rate": 0.0003, "loss": 0.4102, "step": 17800 }, { "epoch": 16.433325682809272, "grad_norm": 1.0696207284927368, "learning_rate": 0.0003, "loss": 0.4188, "step": 17900 }, { "epoch": 16.525131971540052, "grad_norm": 1.3964133262634277, "learning_rate": 0.0003, "loss": 0.434, "step": 18000 }, { "epoch": 16.616938260270828, "grad_norm": 1.2896054983139038, "learning_rate": 0.0003, "loss": 0.4465, "step": 18100 }, { "epoch": 16.708744549001608, "grad_norm": 1.2668951749801636, "learning_rate": 0.0003, "loss": 0.4544, "step": 18200 }, { "epoch": 16.800550837732384, "grad_norm": 1.2267131805419922, "learning_rate": 0.0003, "loss": 0.4647, "step": 18300 }, { "epoch": 16.892357126463164, "grad_norm": 1.5275602340698242, "learning_rate": 0.0003, "loss": 0.4756, "step": 18400 }, { "epoch": 16.98416341519394, "grad_norm": 1.2730522155761719, "learning_rate": 0.0003, "loss": 0.4841, "step": 18500 }, { "epoch": 16.999770484278173, "eval_accuracy": 0.5162222222222222, "eval_loss": 3.116957902908325, "eval_runtime": 5.4049, "eval_samples_per_second": 92.509, "eval_steps_per_second": 11.656, "step": 18517 }, { "epoch": 17.07596970392472, "grad_norm": 1.0755029916763306, "learning_rate": 0.0003, "loss": 0.382, "step": 18600 }, { "epoch": 17.167775992655496, "grad_norm": 1.0808229446411133, "learning_rate": 0.0003, "loss": 0.3769, "step": 18700 }, { "epoch": 17.259582281386276, "grad_norm": 1.1092764139175415, "learning_rate": 0.0003, "loss": 0.3884, "step": 18800 }, { "epoch": 17.351388570117052, "grad_norm": 1.169694423675537, "learning_rate": 0.0003, "loss": 0.4039, "step": 18900 }, { "epoch": 17.443194858847832, "grad_norm": 1.2209644317626953, "learning_rate": 0.0003, "loss": 0.419, "step": 19000 }, { "epoch": 17.53500114757861, "grad_norm": 1.3729647397994995, "learning_rate": 0.0003, "loss": 0.4292, "step": 19100 }, { "epoch": 17.626807436309388, "grad_norm": 1.1597981452941895, "learning_rate": 0.0003, "loss": 0.4326, "step": 19200 }, { "epoch": 17.718613725040164, "grad_norm": 1.608828067779541, "learning_rate": 0.0003, "loss": 0.4511, "step": 19300 }, { "epoch": 17.810420013770944, "grad_norm": 1.3395555019378662, "learning_rate": 0.0003, "loss": 0.4542, "step": 19400 }, { "epoch": 17.90222630250172, "grad_norm": 1.3384451866149902, "learning_rate": 0.0003, "loss": 0.4609, "step": 19500 }, { "epoch": 17.9940325912325, "grad_norm": 1.3556160926818848, "learning_rate": 0.0003, "loss": 0.4727, "step": 19600 }, { "epoch": 17.999540968556346, "eval_accuracy": 0.5172063492063492, "eval_loss": 3.137547492980957, "eval_runtime": 5.667, "eval_samples_per_second": 88.231, "eval_steps_per_second": 11.117, "step": 19606 }, { "epoch": 18.085838879963276, "grad_norm": 1.1512075662612915, "learning_rate": 0.0003, "loss": 0.3544, "step": 19700 }, { "epoch": 18.177645168694056, "grad_norm": 1.1157560348510742, "learning_rate": 0.0003, "loss": 0.3698, "step": 19800 }, { "epoch": 18.269451457424832, "grad_norm": 1.0202255249023438, "learning_rate": 0.0003, "loss": 0.3897, "step": 19900 }, { "epoch": 18.361257746155612, "grad_norm": 1.1312451362609863, "learning_rate": 0.0003, "loss": 0.3938, "step": 20000 }, { "epoch": 18.45306403488639, "grad_norm": 1.5055644512176514, "learning_rate": 0.0003, "loss": 0.4062, "step": 20100 }, { "epoch": 18.544870323617168, "grad_norm": 1.3119601011276245, "learning_rate": 0.0003, "loss": 0.4172, "step": 20200 }, { "epoch": 18.636676612347944, "grad_norm": 1.229153037071228, "learning_rate": 0.0003, "loss": 0.4294, "step": 20300 }, { "epoch": 18.728482901078724, "grad_norm": 1.1669813394546509, "learning_rate": 0.0003, "loss": 0.4396, "step": 20400 }, { "epoch": 18.8202891898095, "grad_norm": 1.4410171508789062, "learning_rate": 0.0003, "loss": 0.4459, "step": 20500 }, { "epoch": 18.91209547854028, "grad_norm": 1.4326797723770142, "learning_rate": 0.0003, "loss": 0.4634, "step": 20600 }, { "epoch": 18.99931145283452, "eval_accuracy": 0.514984126984127, "eval_loss": 3.132270097732544, "eval_runtime": 5.3564, "eval_samples_per_second": 93.346, "eval_steps_per_second": 11.762, "step": 20695 }, { "epoch": 19.003901767271056, "grad_norm": 1.1022387742996216, "learning_rate": 0.0003, "loss": 0.461, "step": 20700 }, { "epoch": 19.095708056001836, "grad_norm": 1.2258243560791016, "learning_rate": 0.0003, "loss": 0.3484, "step": 20800 }, { "epoch": 19.187514344732612, "grad_norm": 1.2331010103225708, "learning_rate": 0.0003, "loss": 0.3623, "step": 20900 }, { "epoch": 19.279320633463392, "grad_norm": 1.1330702304840088, "learning_rate": 0.0003, "loss": 0.3792, "step": 21000 }, { "epoch": 19.371126922194172, "grad_norm": 1.272649884223938, "learning_rate": 0.0003, "loss": 0.3929, "step": 21100 }, { "epoch": 19.462933210924948, "grad_norm": 1.1329351663589478, "learning_rate": 0.0003, "loss": 0.4027, "step": 21200 }, { "epoch": 19.554739499655728, "grad_norm": 1.2024905681610107, "learning_rate": 0.0003, "loss": 0.4081, "step": 21300 }, { "epoch": 19.646545788386504, "grad_norm": 1.2089053392410278, "learning_rate": 0.0003, "loss": 0.4156, "step": 21400 }, { "epoch": 19.738352077117284, "grad_norm": 1.2542715072631836, "learning_rate": 0.0003, "loss": 0.4377, "step": 21500 }, { "epoch": 19.83015836584806, "grad_norm": 1.5096327066421509, "learning_rate": 0.0003, "loss": 0.4446, "step": 21600 }, { "epoch": 19.92196465457884, "grad_norm": 1.2375317811965942, "learning_rate": 0.0003, "loss": 0.4468, "step": 21700 }, { "epoch": 19.995409685563462, "eval_accuracy": 0.5165079365079365, "eval_loss": 3.163097381591797, "eval_runtime": 5.9319, "eval_samples_per_second": 84.29, "eval_steps_per_second": 10.62, "step": 21780 }, { "epoch": 19.995409685563462, "step": 21780, "total_flos": 1.5026894410916823e+18, "train_loss": 0.737595841904317, "train_runtime": 47608.6465, "train_samples_per_second": 14.642, "train_steps_per_second": 0.457 } ], "logging_steps": 100, "max_steps": 21780, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "total_flos": 1.5026894410916823e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }