|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 19.99770484278173, |
|
"eval_steps": 500, |
|
"global_step": 21780, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.09180628873077806, |
|
"grad_norm": 0.26081690192222595, |
|
"learning_rate": 0.0001, |
|
"loss": 1.6526, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18361257746155613, |
|
"grad_norm": 0.25705486536026, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5908, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2754188661923342, |
|
"grad_norm": 0.2516353726387024, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5712, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.36722515492311225, |
|
"grad_norm": 0.22883176803588867, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5768, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4590314436538903, |
|
"grad_norm": 0.236283078789711, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5744, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5508377323846684, |
|
"grad_norm": 0.231370747089386, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5722, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6426440211154464, |
|
"grad_norm": 0.24125300347805023, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5736, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7344503098462245, |
|
"grad_norm": 0.2657057046890259, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5831, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.8262565985770025, |
|
"grad_norm": 0.27335691452026367, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5521, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.9180628873077806, |
|
"grad_norm": 0.2826825976371765, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5635, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.999770484278173, |
|
"eval_accuracy": 0.6795836972343523, |
|
"eval_loss": 1.4614675045013428, |
|
"eval_runtime": 9.1641, |
|
"eval_samples_per_second": 54.561, |
|
"eval_steps_per_second": 6.875, |
|
"step": 1089 |
|
}, |
|
{ |
|
"epoch": 1.0098691760385587, |
|
"grad_norm": 0.24073347449302673, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5475, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.1016754647693368, |
|
"grad_norm": 0.3140055239200592, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4788, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.1934817535001148, |
|
"grad_norm": 0.3724195659160614, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4698, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.2852880422308928, |
|
"grad_norm": 0.34302350878715515, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4629, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.377094330961671, |
|
"grad_norm": 0.35881391167640686, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4596, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.468900619692449, |
|
"grad_norm": 0.3676307797431946, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4718, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.560706908423227, |
|
"grad_norm": 0.3709953725337982, |
|
"learning_rate": 0.0001, |
|
"loss": 1.435, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.652513197154005, |
|
"grad_norm": 0.38531753420829773, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4553, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.744319485884783, |
|
"grad_norm": 0.40058839321136475, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4444, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.836125774615561, |
|
"grad_norm": 0.4059107303619385, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4381, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.9279320633463393, |
|
"grad_norm": 0.4201526939868927, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4521, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.999540968556346, |
|
"eval_accuracy": 0.6873740902474527, |
|
"eval_loss": 1.362550973892212, |
|
"eval_runtime": 9.1565, |
|
"eval_samples_per_second": 54.606, |
|
"eval_steps_per_second": 6.88, |
|
"step": 2178 |
|
}, |
|
{ |
|
"epoch": 2.0197383520771175, |
|
"grad_norm": 0.397616446018219, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4163, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.1115446408078955, |
|
"grad_norm": 0.4666147232055664, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3015, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.2033509295386735, |
|
"grad_norm": 0.5091608762741089, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3117, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.2951572182694515, |
|
"grad_norm": 0.44425851106643677, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3042, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.3869635070002295, |
|
"grad_norm": 0.4947376251220703, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3091, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.4787697957310075, |
|
"grad_norm": 0.49756625294685364, |
|
"learning_rate": 0.0001, |
|
"loss": 1.2878, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.5705760844617855, |
|
"grad_norm": 0.48819100856781006, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3036, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.6623823731925635, |
|
"grad_norm": 0.49992629885673523, |
|
"learning_rate": 0.0001, |
|
"loss": 1.29, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.754188661923342, |
|
"grad_norm": 0.5537226796150208, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3049, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.84599495065412, |
|
"grad_norm": 0.5161275267601013, |
|
"learning_rate": 0.0001, |
|
"loss": 1.2796, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.937801239384898, |
|
"grad_norm": 0.5615408420562744, |
|
"learning_rate": 0.0001, |
|
"loss": 1.2848, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.9993114528345193, |
|
"eval_accuracy": 0.6957729257641921, |
|
"eval_loss": 1.257521390914917, |
|
"eval_runtime": 9.264, |
|
"eval_samples_per_second": 53.972, |
|
"eval_steps_per_second": 6.801, |
|
"step": 3267 |
|
}, |
|
{ |
|
"epoch": 3.029607528115676, |
|
"grad_norm": 0.6211069226264954, |
|
"learning_rate": 0.0001, |
|
"loss": 1.2349, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 3.121413816846454, |
|
"grad_norm": 0.6274811029434204, |
|
"learning_rate": 0.0001, |
|
"loss": 1.1362, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 3.213220105577232, |
|
"grad_norm": 0.7168062925338745, |
|
"learning_rate": 0.0001, |
|
"loss": 1.1299, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 3.30502639430801, |
|
"grad_norm": 0.6573987603187561, |
|
"learning_rate": 0.0001, |
|
"loss": 1.153, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 3.396832683038788, |
|
"grad_norm": 0.702870786190033, |
|
"learning_rate": 0.0001, |
|
"loss": 1.1402, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 3.488638971769566, |
|
"grad_norm": 0.6937388181686401, |
|
"learning_rate": 0.0001, |
|
"loss": 1.1344, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 3.580445260500344, |
|
"grad_norm": 0.705838680267334, |
|
"learning_rate": 0.0001, |
|
"loss": 1.125, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 3.672251549231122, |
|
"grad_norm": 0.8442272543907166, |
|
"learning_rate": 0.0001, |
|
"loss": 1.1423, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.7640578379619005, |
|
"grad_norm": 0.9211050868034363, |
|
"learning_rate": 0.0001, |
|
"loss": 1.1227, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 3.8558641266926785, |
|
"grad_norm": 0.6930621862411499, |
|
"learning_rate": 0.0001, |
|
"loss": 1.1286, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 3.9476704154234565, |
|
"grad_norm": 0.6763383746147156, |
|
"learning_rate": 0.0001, |
|
"loss": 1.1197, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.7054410480349345, |
|
"eval_loss": 1.1526687145233154, |
|
"eval_runtime": 8.1013, |
|
"eval_samples_per_second": 61.718, |
|
"eval_steps_per_second": 7.777, |
|
"step": 4357 |
|
}, |
|
{ |
|
"epoch": 4.039476704154235, |
|
"grad_norm": 0.6891958713531494, |
|
"learning_rate": 0.0001, |
|
"loss": 1.0562, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 4.131282992885013, |
|
"grad_norm": 0.7408663630485535, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9585, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 4.223089281615791, |
|
"grad_norm": 0.8520354628562927, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9652, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 4.314895570346569, |
|
"grad_norm": 0.8522772789001465, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9819, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 4.406701859077347, |
|
"grad_norm": 0.8211854696273804, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9777, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 4.498508147808125, |
|
"grad_norm": 0.8455579280853271, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9748, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 4.590314436538903, |
|
"grad_norm": 0.9336457848548889, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9806, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 4.682120725269681, |
|
"grad_norm": 0.8030388355255127, |
|
"learning_rate": 0.0001, |
|
"loss": 0.977, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 4.773927014000459, |
|
"grad_norm": 0.8392836451530457, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9773, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 4.865733302731237, |
|
"grad_norm": 0.823242723941803, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9776, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 4.957539591462015, |
|
"grad_norm": 0.9073436260223389, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9756, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 4.999770484278173, |
|
"eval_accuracy": 0.7142823871906842, |
|
"eval_loss": 1.0531576871871948, |
|
"eval_runtime": 9.2013, |
|
"eval_samples_per_second": 54.34, |
|
"eval_steps_per_second": 6.847, |
|
"step": 5446 |
|
}, |
|
{ |
|
"epoch": 5.049345880192793, |
|
"grad_norm": 1.1068471670150757, |
|
"learning_rate": 0.0001, |
|
"loss": 0.887, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 5.141152168923571, |
|
"grad_norm": 0.9577746987342834, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8052, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 5.232958457654349, |
|
"grad_norm": 0.9153982996940613, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8118, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 5.324764746385127, |
|
"grad_norm": 1.0308676958084106, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8332, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 5.416571035115905, |
|
"grad_norm": 0.9150503873825073, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8242, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 5.508377323846684, |
|
"grad_norm": 1.0191842317581177, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8323, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 5.600183612577462, |
|
"grad_norm": 1.1198716163635254, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8301, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 5.69198990130824, |
|
"grad_norm": 0.996842622756958, |
|
"learning_rate": 0.0001, |
|
"loss": 0.84, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 5.783796190039018, |
|
"grad_norm": 1.086377739906311, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8414, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 5.875602478769796, |
|
"grad_norm": 0.9792770147323608, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8277, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 5.967408767500574, |
|
"grad_norm": 1.0763967037200928, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8393, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 5.999540968556346, |
|
"eval_accuracy": 0.7241455604075692, |
|
"eval_loss": 0.9538469314575195, |
|
"eval_runtime": 8.1157, |
|
"eval_samples_per_second": 61.609, |
|
"eval_steps_per_second": 7.763, |
|
"step": 6535 |
|
}, |
|
{ |
|
"epoch": 6.059215056231352, |
|
"grad_norm": 1.08652663230896, |
|
"learning_rate": 0.0001, |
|
"loss": 0.729, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 6.15102134496213, |
|
"grad_norm": 0.9392278790473938, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6772, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 6.242827633692908, |
|
"grad_norm": 1.126567006111145, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6904, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 6.334633922423686, |
|
"grad_norm": 1.0995755195617676, |
|
"learning_rate": 0.0001, |
|
"loss": 0.69, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 6.426440211154464, |
|
"grad_norm": 0.983116090297699, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6962, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 6.518246499885242, |
|
"grad_norm": 1.2054848670959473, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7061, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 6.61005278861602, |
|
"grad_norm": 1.2262558937072754, |
|
"learning_rate": 0.0001, |
|
"loss": 0.71, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 6.701859077346798, |
|
"grad_norm": 1.069161295890808, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7003, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 6.793665366077576, |
|
"grad_norm": 1.2181183099746704, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7024, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 6.885471654808354, |
|
"grad_norm": 1.0989437103271484, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7002, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 6.977277943539132, |
|
"grad_norm": 1.1168180704116821, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7125, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 6.999311452834519, |
|
"eval_accuracy": 0.7324425036390102, |
|
"eval_loss": 0.8674135208129883, |
|
"eval_runtime": 9.2169, |
|
"eval_samples_per_second": 54.248, |
|
"eval_steps_per_second": 6.835, |
|
"step": 7624 |
|
}, |
|
{ |
|
"epoch": 7.06908423226991, |
|
"grad_norm": 1.0699574947357178, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6029, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 7.160890521000688, |
|
"grad_norm": 0.9937657713890076, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5805, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 7.252696809731467, |
|
"grad_norm": 1.178791880607605, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5752, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 7.344503098462245, |
|
"grad_norm": 1.2159409523010254, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5912, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 7.436309387193023, |
|
"grad_norm": 1.0133622884750366, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5932, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 7.528115675923801, |
|
"grad_norm": 1.0923631191253662, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6071, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 7.619921964654579, |
|
"grad_norm": 1.3819491863250732, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5956, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 7.711728253385357, |
|
"grad_norm": 1.182358980178833, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5922, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 7.803534542116135, |
|
"grad_norm": 1.1674267053604126, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5912, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 7.895340830846913, |
|
"grad_norm": 1.1732617616653442, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5969, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 7.987147119577691, |
|
"grad_norm": 1.2167391777038574, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6144, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.7404046579330422, |
|
"eval_loss": 0.7907233834266663, |
|
"eval_runtime": 9.1451, |
|
"eval_samples_per_second": 54.674, |
|
"eval_steps_per_second": 6.889, |
|
"step": 8714 |
|
}, |
|
{ |
|
"epoch": 8.07895340830847, |
|
"grad_norm": 1.1496325731277466, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5043, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 8.170759697039248, |
|
"grad_norm": 1.4953396320343018, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4848, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 8.262565985770026, |
|
"grad_norm": 1.2796908617019653, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5007, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 8.354372274500804, |
|
"grad_norm": 1.2108944654464722, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4987, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 8.446178563231582, |
|
"grad_norm": 0.9534372687339783, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5068, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 8.53798485196236, |
|
"grad_norm": 1.1545357704162598, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5072, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 8.629791140693138, |
|
"grad_norm": 1.2086093425750732, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5173, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 8.721597429423916, |
|
"grad_norm": 1.20607328414917, |
|
"learning_rate": 0.0001, |
|
"loss": 0.519, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 8.813403718154694, |
|
"grad_norm": 1.2534675598144531, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5261, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 8.905210006885472, |
|
"grad_norm": 1.2726677656173706, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5213, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 8.99701629561625, |
|
"grad_norm": 1.28297758102417, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5355, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 8.999770484278173, |
|
"eval_accuracy": 0.7468762736535662, |
|
"eval_loss": 0.7288308143615723, |
|
"eval_runtime": 9.1574, |
|
"eval_samples_per_second": 54.601, |
|
"eval_steps_per_second": 6.88, |
|
"step": 9803 |
|
}, |
|
{ |
|
"epoch": 9.088822584347028, |
|
"grad_norm": 1.278200626373291, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4247, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 9.180628873077806, |
|
"grad_norm": 1.3318266868591309, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4284, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 9.272435161808584, |
|
"grad_norm": 1.209088683128357, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4381, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 9.364241450539362, |
|
"grad_norm": 1.0169490575790405, |
|
"learning_rate": 0.0001, |
|
"loss": 0.443, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 9.45604773927014, |
|
"grad_norm": 1.4842835664749146, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4424, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 9.547854028000918, |
|
"grad_norm": 1.1761025190353394, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4463, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 9.639660316731696, |
|
"grad_norm": 1.270493984222412, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4541, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 9.731466605462474, |
|
"grad_norm": 1.346306562423706, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4551, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 9.823272894193252, |
|
"grad_norm": 1.2559789419174194, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4611, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 9.91507918292403, |
|
"grad_norm": 1.4359290599822998, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4584, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 9.997704842781731, |
|
"eval_accuracy": 0.75309461426492, |
|
"eval_loss": 0.6794138550758362, |
|
"eval_runtime": 8.1091, |
|
"eval_samples_per_second": 61.659, |
|
"eval_steps_per_second": 7.769, |
|
"step": 10890 |
|
}, |
|
{ |
|
"epoch": 10.009180628873079, |
|
"grad_norm": 1.3423351049423218, |
|
"learning_rate": 0.0001, |
|
"loss": 0.386, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 10.100986917603857, |
|
"grad_norm": 1.132673740386963, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3726, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 10.192793206334635, |
|
"grad_norm": 1.614931344985962, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3819, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 10.284599495065413, |
|
"grad_norm": 1.3352196216583252, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3876, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 10.37640578379619, |
|
"grad_norm": 1.0957690477371216, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3973, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 10.468212072526969, |
|
"grad_norm": 1.142330527305603, |
|
"learning_rate": 0.0001, |
|
"loss": 0.396, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 10.560018361257747, |
|
"grad_norm": 1.4076579809188843, |
|
"learning_rate": 0.0001, |
|
"loss": 0.402, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 10.651824649988525, |
|
"grad_norm": 1.389333963394165, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3962, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 10.743630938719303, |
|
"grad_norm": 1.4440951347351074, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4049, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 10.83543722745008, |
|
"grad_norm": 1.4290118217468262, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4093, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 10.927243516180859, |
|
"grad_norm": 1.46366548538208, |
|
"learning_rate": 0.0001, |
|
"loss": 0.413, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 10.999770484278173, |
|
"eval_accuracy": 0.7576768558951965, |
|
"eval_loss": 0.6291825175285339, |
|
"eval_runtime": 8.951, |
|
"eval_samples_per_second": 55.86, |
|
"eval_steps_per_second": 7.038, |
|
"step": 11979 |
|
}, |
|
{ |
|
"epoch": 11.019049804911637, |
|
"grad_norm": 1.1713697910308838, |
|
"learning_rate": 0.0001, |
|
"loss": 0.395, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 11.110856093642415, |
|
"grad_norm": 1.277626395225525, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3355, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 11.202662382373193, |
|
"grad_norm": 1.3597822189331055, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3412, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 11.29446867110397, |
|
"grad_norm": 1.4017976522445679, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3414, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 11.386274959834749, |
|
"grad_norm": 1.409915804862976, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3558, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 11.478081248565527, |
|
"grad_norm": 1.400634765625, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3577, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 11.569887537296305, |
|
"grad_norm": 1.5898892879486084, |
|
"learning_rate": 0.0001, |
|
"loss": 0.354, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 11.661693826027083, |
|
"grad_norm": 1.3252007961273193, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3682, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 11.75350011475786, |
|
"grad_norm": 1.302128791809082, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3715, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 11.845306403488639, |
|
"grad_norm": 1.3374468088150024, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3707, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 11.937112692219417, |
|
"grad_norm": 1.1755791902542114, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3731, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 11.999540968556346, |
|
"eval_accuracy": 0.76164192139738, |
|
"eval_loss": 0.5926400423049927, |
|
"eval_runtime": 9.0463, |
|
"eval_samples_per_second": 55.271, |
|
"eval_steps_per_second": 6.964, |
|
"step": 13068 |
|
}, |
|
{ |
|
"epoch": 12.028918980950195, |
|
"grad_norm": 1.3085649013519287, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3482, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 12.120725269680973, |
|
"grad_norm": 1.1860175132751465, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2982, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 12.21253155841175, |
|
"grad_norm": 1.1902750730514526, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3095, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 12.304337847142529, |
|
"grad_norm": 1.2473431825637817, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3216, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 12.396144135873307, |
|
"grad_norm": 1.443493366241455, |
|
"learning_rate": 0.0001, |
|
"loss": 0.319, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 12.487950424604085, |
|
"grad_norm": 1.4389948844909668, |
|
"learning_rate": 0.0001, |
|
"loss": 0.328, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 12.579756713334863, |
|
"grad_norm": 1.1586631536483765, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3285, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 12.671563002065641, |
|
"grad_norm": 1.180396318435669, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3311, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 12.763369290796419, |
|
"grad_norm": 1.4230598211288452, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3346, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 12.855175579527197, |
|
"grad_norm": 1.5782092809677124, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3415, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 12.946981868257975, |
|
"grad_norm": 1.418642282485962, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3423, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 12.999311452834519, |
|
"eval_accuracy": 0.7655866084425036, |
|
"eval_loss": 0.5619787573814392, |
|
"eval_runtime": 8.9635, |
|
"eval_samples_per_second": 55.782, |
|
"eval_steps_per_second": 7.029, |
|
"step": 14157 |
|
}, |
|
{ |
|
"epoch": 13.038788156988753, |
|
"grad_norm": 1.1923723220825195, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3119, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 13.130594445719531, |
|
"grad_norm": 1.2736058235168457, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2762, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 13.22240073445031, |
|
"grad_norm": 0.9496171474456787, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2844, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 13.314207023181089, |
|
"grad_norm": 1.22100031375885, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2938, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 13.406013311911867, |
|
"grad_norm": 1.381606101989746, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2978, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 13.497819600642645, |
|
"grad_norm": 1.43625807762146, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3035, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 13.589625889373423, |
|
"grad_norm": 1.4393320083618164, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3065, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 13.6814321781042, |
|
"grad_norm": 1.184833288192749, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3091, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 13.773238466834979, |
|
"grad_norm": 1.4501614570617676, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3103, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 13.865044755565757, |
|
"grad_norm": 1.368249535560608, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3137, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 13.956851044296535, |
|
"grad_norm": 1.4249024391174316, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3185, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.7682037845705968, |
|
"eval_loss": 0.542601466178894, |
|
"eval_runtime": 8.9503, |
|
"eval_samples_per_second": 55.864, |
|
"eval_steps_per_second": 7.039, |
|
"step": 15247 |
|
}, |
|
{ |
|
"epoch": 14.048657333027313, |
|
"grad_norm": 1.3438467979431152, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2849, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 14.14046362175809, |
|
"grad_norm": 1.4617668390274048, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2658, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 14.232269910488869, |
|
"grad_norm": 1.266655683517456, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2679, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 14.324076199219647, |
|
"grad_norm": 1.2162944078445435, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2711, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 14.415882487950425, |
|
"grad_norm": 1.10415518283844, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2763, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 14.507688776681203, |
|
"grad_norm": 1.1962913274765015, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2827, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 14.59949506541198, |
|
"grad_norm": 1.2264560461044312, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2845, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 14.691301354142759, |
|
"grad_norm": 1.3857085704803467, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2897, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 14.783107642873537, |
|
"grad_norm": 1.447581171989441, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2894, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 14.874913931604315, |
|
"grad_norm": 1.3408719301223755, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2899, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 14.966720220335093, |
|
"grad_norm": 1.695694088935852, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2924, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 14.999770484278173, |
|
"eval_accuracy": 0.7708355167394468, |
|
"eval_loss": 0.5231938362121582, |
|
"eval_runtime": 9.1808, |
|
"eval_samples_per_second": 54.462, |
|
"eval_steps_per_second": 6.862, |
|
"step": 16336 |
|
}, |
|
{ |
|
"epoch": 15.05852650906587, |
|
"grad_norm": 1.1147023439407349, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2605, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 15.150332797796649, |
|
"grad_norm": 1.518908977508545, |
|
"learning_rate": 0.0001, |
|
"loss": 0.245, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 15.242139086527427, |
|
"grad_norm": 1.1342830657958984, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2447, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 15.333945375258205, |
|
"grad_norm": 1.2657541036605835, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2599, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 15.425751663988983, |
|
"grad_norm": 0.9707338809967041, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2591, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 15.517557952719761, |
|
"grad_norm": 1.2904791831970215, |
|
"learning_rate": 0.0001, |
|
"loss": 0.264, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 15.609364241450539, |
|
"grad_norm": 1.4617804288864136, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2665, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 15.701170530181317, |
|
"grad_norm": 1.1893932819366455, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2689, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 15.792976818912095, |
|
"grad_norm": 1.3138148784637451, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2731, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 15.884783107642873, |
|
"grad_norm": 1.2247110605239868, |
|
"learning_rate": 0.0001, |
|
"loss": 0.278, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 15.976589396373651, |
|
"grad_norm": 1.1995705366134644, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2824, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 15.999540968556346, |
|
"eval_accuracy": 0.7727045123726346, |
|
"eval_loss": 0.5129293203353882, |
|
"eval_runtime": 8.9728, |
|
"eval_samples_per_second": 55.724, |
|
"eval_steps_per_second": 7.021, |
|
"step": 17425 |
|
}, |
|
{ |
|
"epoch": 16.06839568510443, |
|
"grad_norm": 1.088183045387268, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2408, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 16.160201973835207, |
|
"grad_norm": 1.27170991897583, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2339, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 16.252008262565987, |
|
"grad_norm": 1.093220591545105, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2381, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 16.343814551296763, |
|
"grad_norm": 1.3761118650436401, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2361, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 16.435620840027543, |
|
"grad_norm": 1.3061089515686035, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2437, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 16.52742712875832, |
|
"grad_norm": 1.318901538848877, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2475, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 16.6192334174891, |
|
"grad_norm": 1.241626262664795, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2542, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 16.711039706219875, |
|
"grad_norm": 1.1289949417114258, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2566, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 16.802845994950655, |
|
"grad_norm": 1.4046275615692139, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2594, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 16.89465228368143, |
|
"grad_norm": 1.1862374544143677, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2611, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 16.98645857241221, |
|
"grad_norm": 1.3014901876449585, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2669, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 16.99931145283452, |
|
"eval_accuracy": 0.774806404657933, |
|
"eval_loss": 0.49875929951667786, |
|
"eval_runtime": 9.0234, |
|
"eval_samples_per_second": 55.411, |
|
"eval_steps_per_second": 6.982, |
|
"step": 18514 |
|
}, |
|
{ |
|
"epoch": 17.078264861142987, |
|
"grad_norm": 1.0681638717651367, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2239, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 17.170071149873767, |
|
"grad_norm": 1.1279337406158447, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2223, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 17.261877438604543, |
|
"grad_norm": 1.3798402547836304, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2241, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 17.353683727335323, |
|
"grad_norm": 1.1741504669189453, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2326, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 17.4454900160661, |
|
"grad_norm": 1.1289469003677368, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2345, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 17.53729630479688, |
|
"grad_norm": 1.508701205253601, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2421, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 17.629102593527655, |
|
"grad_norm": 1.449561357498169, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2387, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 17.720908882258435, |
|
"grad_norm": 1.1868849992752075, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2402, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 17.81271517098921, |
|
"grad_norm": 1.4335336685180664, |
|
"learning_rate": 0.0001, |
|
"loss": 0.249, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 17.90452145971999, |
|
"grad_norm": 1.3802162408828735, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2491, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 17.996327748450767, |
|
"grad_norm": 1.3790746927261353, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2517, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.776174672489083, |
|
"eval_loss": 0.4891900420188904, |
|
"eval_runtime": 8.9325, |
|
"eval_samples_per_second": 55.975, |
|
"eval_steps_per_second": 7.053, |
|
"step": 19604 |
|
}, |
|
{ |
|
"epoch": 18.088134037181547, |
|
"grad_norm": 1.1314564943313599, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2088, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 18.179940325912327, |
|
"grad_norm": 1.2055948972702026, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2128, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 18.271746614643103, |
|
"grad_norm": 1.1677360534667969, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2178, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 18.363552903373883, |
|
"grad_norm": 1.2793176174163818, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2216, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 18.45535919210466, |
|
"grad_norm": 1.187522292137146, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2243, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 18.54716548083544, |
|
"grad_norm": 1.5564976930618286, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2249, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 18.638971769566215, |
|
"grad_norm": 1.2912520170211792, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2319, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 18.730778058296995, |
|
"grad_norm": 1.5046939849853516, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2343, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 18.82258434702777, |
|
"grad_norm": 1.4738825559616089, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2342, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 18.91439063575855, |
|
"grad_norm": 1.427435278892517, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2376, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 18.999770484278173, |
|
"eval_accuracy": 0.7773391557496361, |
|
"eval_loss": 0.4808199405670166, |
|
"eval_runtime": 8.9815, |
|
"eval_samples_per_second": 55.67, |
|
"eval_steps_per_second": 7.014, |
|
"step": 20693 |
|
}, |
|
{ |
|
"epoch": 19.006196924489327, |
|
"grad_norm": 1.294245719909668, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2396, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 19.098003213220107, |
|
"grad_norm": 0.9566488862037659, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2006, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 19.189809501950883, |
|
"grad_norm": 1.184180736541748, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2049, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 19.281615790681663, |
|
"grad_norm": 1.1258317232131958, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2081, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 19.37342207941244, |
|
"grad_norm": 1.2547038793563843, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2133, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 19.46522836814322, |
|
"grad_norm": 1.3770051002502441, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2175, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 19.557034656873995, |
|
"grad_norm": 1.3640483617782593, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2178, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 19.648840945604775, |
|
"grad_norm": 1.2219371795654297, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2233, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 19.74064723433555, |
|
"grad_norm": 1.3438184261322021, |
|
"learning_rate": 0.0001, |
|
"loss": 0.224, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 19.83245352306633, |
|
"grad_norm": 1.2909867763519287, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2274, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 19.924259811797107, |
|
"grad_norm": 1.482640027999878, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2316, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 19.99770484278173, |
|
"eval_accuracy": 0.7780232896652111, |
|
"eval_loss": 0.4803846478462219, |
|
"eval_runtime": 8.977, |
|
"eval_samples_per_second": 55.698, |
|
"eval_steps_per_second": 7.018, |
|
"step": 21780 |
|
}, |
|
{ |
|
"epoch": 19.99770484278173, |
|
"step": 21780, |
|
"total_flos": 2.2953494160657613e+18, |
|
"train_loss": 0.0, |
|
"train_runtime": 0.0873, |
|
"train_samples_per_second": 7982672.15, |
|
"train_steps_per_second": 249415.561 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 21780, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 500, |
|
"total_flos": 2.2953494160657613e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|