|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 19.995409685563462, |
|
"eval_steps": 500, |
|
"global_step": 21780, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.09180628873077806, |
|
"grad_norm": 0.289526104927063, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7578, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18361257746155613, |
|
"grad_norm": 0.27366071939468384, |
|
"learning_rate": 3e-05, |
|
"loss": 1.6025, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2754188661923342, |
|
"grad_norm": 0.2666202485561371, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5821, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.36722515492311225, |
|
"grad_norm": 0.25359421968460083, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5884, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4590314436538903, |
|
"grad_norm": 0.26042038202285767, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5873, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5508377323846684, |
|
"grad_norm": 0.25545641779899597, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5861, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6426440211154464, |
|
"grad_norm": 0.2685534358024597, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5889, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7344503098462245, |
|
"grad_norm": 0.30292272567749023, |
|
"learning_rate": 3e-05, |
|
"loss": 1.6001, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.8262565985770025, |
|
"grad_norm": 0.3182919919490814, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5712, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.9180628873077806, |
|
"grad_norm": 0.31249287724494934, |
|
"learning_rate": 3e-05, |
|
"loss": 1.584, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.999770484278173, |
|
"eval_accuracy": 0.6762416302765648, |
|
"eval_loss": 1.5102524757385254, |
|
"eval_runtime": 8.9715, |
|
"eval_samples_per_second": 55.732, |
|
"eval_steps_per_second": 7.022, |
|
"step": 1089 |
|
}, |
|
{ |
|
"epoch": 1.0098691760385587, |
|
"grad_norm": 0.2788257300853729, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5741, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.1016754647693368, |
|
"grad_norm": 0.3435765504837036, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5538, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.1934817535001148, |
|
"grad_norm": 0.42897695302963257, |
|
"learning_rate": 3e-05, |
|
"loss": 1.548, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.2852880422308928, |
|
"grad_norm": 0.3922116160392761, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5439, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.377094330961671, |
|
"grad_norm": 0.39680397510528564, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5422, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.468900619692449, |
|
"grad_norm": 0.42543351650238037, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5569, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.560706908423227, |
|
"grad_norm": 0.41228362917900085, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5229, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.652513197154005, |
|
"grad_norm": 0.46888694167137146, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5442, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.744319485884783, |
|
"grad_norm": 0.4511169195175171, |
|
"learning_rate": 3e-05, |
|
"loss": 1.538, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.836125774615561, |
|
"grad_norm": 0.46713030338287354, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5343, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.9279320633463393, |
|
"grad_norm": 0.5275651812553406, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5504, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.999540968556346, |
|
"eval_accuracy": 0.6791033478893741, |
|
"eval_loss": 1.4772261381149292, |
|
"eval_runtime": 9.0172, |
|
"eval_samples_per_second": 55.45, |
|
"eval_steps_per_second": 6.987, |
|
"step": 2178 |
|
}, |
|
{ |
|
"epoch": 2.0197383520771175, |
|
"grad_norm": 0.47441884875297546, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5331, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.1115446408078955, |
|
"grad_norm": 0.47345635294914246, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4902, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.2033509295386735, |
|
"grad_norm": 0.592490017414093, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4996, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.2951572182694515, |
|
"grad_norm": 0.5280076265335083, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4949, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.3869635070002295, |
|
"grad_norm": 0.5791444182395935, |
|
"learning_rate": 3e-05, |
|
"loss": 1.499, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.4787697957310075, |
|
"grad_norm": 0.5848264098167419, |
|
"learning_rate": 3e-05, |
|
"loss": 1.479, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.5705760844617855, |
|
"grad_norm": 0.5598397254943848, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4938, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.6623823731925635, |
|
"grad_norm": 0.5774019360542297, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4884, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.754188661923342, |
|
"grad_norm": 0.6278976202011108, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5027, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.84599495065412, |
|
"grad_norm": 0.5700748562812805, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4777, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.937801239384898, |
|
"grad_norm": 0.6338950395584106, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4842, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.9993114528345193, |
|
"eval_accuracy": 0.6811615720524018, |
|
"eval_loss": 1.4502207040786743, |
|
"eval_runtime": 9.3503, |
|
"eval_samples_per_second": 53.474, |
|
"eval_steps_per_second": 6.738, |
|
"step": 3267 |
|
}, |
|
{ |
|
"epoch": 3.029607528115676, |
|
"grad_norm": 0.6205599904060364, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4695, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 3.121413816846454, |
|
"grad_norm": 0.6470921635627747, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4401, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 3.213220105577232, |
|
"grad_norm": 0.8113517761230469, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4312, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 3.30502639430801, |
|
"grad_norm": 0.6976670026779175, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4548, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 3.396832683038788, |
|
"grad_norm": 0.7569802403450012, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4447, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 3.488638971769566, |
|
"grad_norm": 0.8047822117805481, |
|
"learning_rate": 3e-05, |
|
"loss": 1.434, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 3.580445260500344, |
|
"grad_norm": 0.710166871547699, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4283, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 3.672251549231122, |
|
"grad_norm": 0.7864311933517456, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4493, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.7640578379619005, |
|
"grad_norm": 0.7331141829490662, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4259, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 3.8558641266926785, |
|
"grad_norm": 0.7041341662406921, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4316, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 3.9476704154234565, |
|
"grad_norm": 0.6956498622894287, |
|
"learning_rate": 3e-05, |
|
"loss": 1.427, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.6837205240174673, |
|
"eval_loss": 1.4199937582015991, |
|
"eval_runtime": 9.1672, |
|
"eval_samples_per_second": 54.542, |
|
"eval_steps_per_second": 6.872, |
|
"step": 4357 |
|
}, |
|
{ |
|
"epoch": 4.039476704154235, |
|
"grad_norm": 0.7389739751815796, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4099, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 4.131282992885013, |
|
"grad_norm": 0.7883840203285217, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3709, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 4.223089281615791, |
|
"grad_norm": 0.7341814041137695, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3743, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 4.314895570346569, |
|
"grad_norm": 0.9007183909416199, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3929, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 4.406701859077347, |
|
"grad_norm": 0.8208268284797668, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3843, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 4.498508147808125, |
|
"grad_norm": 0.7786016464233398, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3789, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 4.590314436538903, |
|
"grad_norm": 0.9414695501327515, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3812, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 4.682120725269681, |
|
"grad_norm": 0.7854552268981934, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3851, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 4.773927014000459, |
|
"grad_norm": 0.8319596648216248, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3846, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 4.865733302731237, |
|
"grad_norm": 0.8832118511199951, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3845, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 4.957539591462015, |
|
"grad_norm": 0.8607555627822876, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3827, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 4.999770484278173, |
|
"eval_accuracy": 0.6860465793304221, |
|
"eval_loss": 1.3910759687423706, |
|
"eval_runtime": 8.9399, |
|
"eval_samples_per_second": 55.929, |
|
"eval_steps_per_second": 7.047, |
|
"step": 5446 |
|
}, |
|
{ |
|
"epoch": 5.049345880192793, |
|
"grad_norm": 0.8777875900268555, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3512, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 5.141152168923571, |
|
"grad_norm": 0.9193658232688904, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3124, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 5.232958457654349, |
|
"grad_norm": 0.9822832345962524, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3189, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 5.324764746385127, |
|
"grad_norm": 0.9231218099594116, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3321, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 5.416571035115905, |
|
"grad_norm": 0.961618185043335, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3275, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 5.508377323846684, |
|
"grad_norm": 1.1759928464889526, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3301, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 5.600183612577462, |
|
"grad_norm": 1.0055111646652222, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3261, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 5.69198990130824, |
|
"grad_norm": 0.9605348110198975, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3329, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 5.783796190039018, |
|
"grad_norm": 1.0969476699829102, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3347, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 5.875602478769796, |
|
"grad_norm": 0.9841852188110352, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3215, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 5.967408767500574, |
|
"grad_norm": 1.0173933506011963, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3425, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 5.999540968556346, |
|
"eval_accuracy": 0.6886841339155749, |
|
"eval_loss": 1.3614963293075562, |
|
"eval_runtime": 8.9983, |
|
"eval_samples_per_second": 55.566, |
|
"eval_steps_per_second": 7.001, |
|
"step": 6535 |
|
}, |
|
{ |
|
"epoch": 6.059215056231352, |
|
"grad_norm": 0.9646373987197876, |
|
"learning_rate": 3e-05, |
|
"loss": 1.287, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 6.15102134496213, |
|
"grad_norm": 0.8929613828659058, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2574, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 6.242827633692908, |
|
"grad_norm": 1.285346508026123, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2762, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 6.334633922423686, |
|
"grad_norm": 1.102123498916626, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2667, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 6.426440211154464, |
|
"grad_norm": 1.021745204925537, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2698, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 6.518246499885242, |
|
"grad_norm": 1.1759482622146606, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2781, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 6.61005278861602, |
|
"grad_norm": 1.2193723917007446, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2784, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 6.701859077346798, |
|
"grad_norm": 1.1053309440612793, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2732, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 6.793665366077576, |
|
"grad_norm": 1.7023396492004395, |
|
"learning_rate": 3e-05, |
|
"loss": 1.269, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 6.885471654808354, |
|
"grad_norm": 1.0934760570526123, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2691, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 6.977277943539132, |
|
"grad_norm": 1.0586143732070923, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2738, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 6.999311452834519, |
|
"eval_accuracy": 0.6910072780203784, |
|
"eval_loss": 1.3299835920333862, |
|
"eval_runtime": 9.0186, |
|
"eval_samples_per_second": 55.441, |
|
"eval_steps_per_second": 6.986, |
|
"step": 7624 |
|
}, |
|
{ |
|
"epoch": 7.06908423226991, |
|
"grad_norm": 1.2295095920562744, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2221, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 7.160890521000688, |
|
"grad_norm": 1.0440340042114258, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2119, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 7.252696809731467, |
|
"grad_norm": 1.1318169832229614, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2061, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 7.344503098462245, |
|
"grad_norm": 1.2543174028396606, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2222, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 7.436309387193023, |
|
"grad_norm": 1.098528504371643, |
|
"learning_rate": 3e-05, |
|
"loss": 1.211, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 7.528115675923801, |
|
"grad_norm": 1.5505329370498657, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2313, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 7.619921964654579, |
|
"grad_norm": 1.2159889936447144, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2155, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 7.711728253385357, |
|
"grad_norm": 1.2545368671417236, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2125, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 7.803534542116135, |
|
"grad_norm": 1.0893586874008179, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2067, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 7.895340830846913, |
|
"grad_norm": 1.2962942123413086, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2123, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 7.987147119577691, |
|
"grad_norm": 1.0764884948730469, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2283, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.6934876273653566, |
|
"eval_loss": 1.3020232915878296, |
|
"eval_runtime": 8.9433, |
|
"eval_samples_per_second": 55.908, |
|
"eval_steps_per_second": 7.044, |
|
"step": 8714 |
|
}, |
|
{ |
|
"epoch": 8.07895340830847, |
|
"grad_norm": 1.2428829669952393, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1629, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 8.170759697039248, |
|
"grad_norm": 1.19692862033844, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1448, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 8.262565985770026, |
|
"grad_norm": 1.3424954414367676, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1617, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 8.354372274500804, |
|
"grad_norm": 1.3499901294708252, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1564, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 8.446178563231582, |
|
"grad_norm": 1.183600664138794, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1524, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 8.53798485196236, |
|
"grad_norm": 1.3151459693908691, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1499, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 8.629791140693138, |
|
"grad_norm": 1.3484901189804077, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1716, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 8.721597429423916, |
|
"grad_norm": 1.326663851737976, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1665, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 8.813403718154694, |
|
"grad_norm": 1.315091609954834, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1713, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 8.905210006885472, |
|
"grad_norm": 1.3116127252578735, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1669, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 8.99701629561625, |
|
"grad_norm": 1.4249849319458008, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1788, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 8.999770484278173, |
|
"eval_accuracy": 0.6963406113537118, |
|
"eval_loss": 1.272213339805603, |
|
"eval_runtime": 8.9482, |
|
"eval_samples_per_second": 55.877, |
|
"eval_steps_per_second": 7.041, |
|
"step": 9803 |
|
}, |
|
{ |
|
"epoch": 9.088822584347028, |
|
"grad_norm": 1.412617802619934, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0912, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 9.180628873077806, |
|
"grad_norm": 1.2806413173675537, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1013, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 9.272435161808584, |
|
"grad_norm": 1.8053137063980103, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1071, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 9.364241450539362, |
|
"grad_norm": 1.352771520614624, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1043, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 9.45604773927014, |
|
"grad_norm": 1.5698919296264648, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1187, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 9.547854028000918, |
|
"grad_norm": 1.4473572969436646, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0991, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 9.639660316731696, |
|
"grad_norm": 1.5458990335464478, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1168, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 9.731466605462474, |
|
"grad_norm": 1.3577615022659302, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1081, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 9.823272894193252, |
|
"grad_norm": 1.6055794954299927, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1117, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 9.91507918292403, |
|
"grad_norm": 1.5611170530319214, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1156, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 9.999540968556346, |
|
"eval_accuracy": 0.6990072780203784, |
|
"eval_loss": 1.2414618730545044, |
|
"eval_runtime": 9.5021, |
|
"eval_samples_per_second": 52.62, |
|
"eval_steps_per_second": 6.63, |
|
"step": 10892 |
|
}, |
|
{ |
|
"epoch": 10.006885471654808, |
|
"grad_norm": 1.4584790468215942, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1099, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 10.098691760385586, |
|
"grad_norm": 1.4327212572097778, |
|
"learning_rate": 3e-05, |
|
"loss": 1.039, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 10.190498049116364, |
|
"grad_norm": 1.4160873889923096, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0578, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 10.282304337847142, |
|
"grad_norm": 1.506165862083435, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0481, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 10.37411062657792, |
|
"grad_norm": 1.6476013660430908, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0582, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 10.465916915308698, |
|
"grad_norm": 1.4203314781188965, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0615, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 10.557723204039476, |
|
"grad_norm": 1.591191053390503, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0654, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 10.649529492770254, |
|
"grad_norm": 1.552139401435852, |
|
"learning_rate": 3e-05, |
|
"loss": 1.043, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 10.741335781501032, |
|
"grad_norm": 1.5005476474761963, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0518, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 10.83314207023181, |
|
"grad_norm": 1.6541969776153564, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0617, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 10.924948358962588, |
|
"grad_norm": 1.4498178958892822, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0526, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 10.999311452834519, |
|
"eval_accuracy": 0.7014876273653566, |
|
"eval_loss": 1.2131479978561401, |
|
"eval_runtime": 9.02, |
|
"eval_samples_per_second": 55.432, |
|
"eval_steps_per_second": 6.984, |
|
"step": 11981 |
|
}, |
|
{ |
|
"epoch": 11.016754647693366, |
|
"grad_norm": 1.384069561958313, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0554, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 11.108560936424144, |
|
"grad_norm": 1.4845672845840454, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9886, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 11.200367225154924, |
|
"grad_norm": 1.7744626998901367, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9969, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 11.292173513885702, |
|
"grad_norm": 1.6337647438049316, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9899, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 11.38397980261648, |
|
"grad_norm": 2.003005266189575, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0111, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 11.475786091347258, |
|
"grad_norm": 1.968371033668518, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0012, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 11.567592380078036, |
|
"grad_norm": 1.6538879871368408, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9972, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 11.659398668808814, |
|
"grad_norm": 1.6392265558242798, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0084, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 11.751204957539592, |
|
"grad_norm": 1.7361793518066406, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0109, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 11.84301124627037, |
|
"grad_norm": 1.4300850629806519, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0163, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 11.934817535001148, |
|
"grad_norm": 1.6984518766403198, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0146, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.7045123726346434, |
|
"eval_loss": 1.1803950071334839, |
|
"eval_runtime": 8.9699, |
|
"eval_samples_per_second": 55.742, |
|
"eval_steps_per_second": 7.024, |
|
"step": 13071 |
|
}, |
|
{ |
|
"epoch": 12.026623823731926, |
|
"grad_norm": 1.52531898021698, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9803, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 12.118430112462704, |
|
"grad_norm": 1.8437868356704712, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9362, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 12.210236401193482, |
|
"grad_norm": 1.7236285209655762, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9476, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 12.30204268992426, |
|
"grad_norm": 1.7923431396484375, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9473, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 12.393848978655038, |
|
"grad_norm": 1.9459409713745117, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9521, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 12.485655267385816, |
|
"grad_norm": 1.8831307888031006, |
|
"learning_rate": 3e-05, |
|
"loss": 0.942, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 12.577461556116594, |
|
"grad_norm": 1.629230260848999, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9558, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 12.669267844847372, |
|
"grad_norm": 1.5318315029144287, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9525, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 12.76107413357815, |
|
"grad_norm": 1.611336588859558, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9619, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 12.852880422308928, |
|
"grad_norm": 1.6721709966659546, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9619, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 12.944686711039706, |
|
"grad_norm": 1.8074623346328735, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9613, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 12.999770484278173, |
|
"eval_accuracy": 0.7071382823871907, |
|
"eval_loss": 1.1507638692855835, |
|
"eval_runtime": 8.9983, |
|
"eval_samples_per_second": 55.566, |
|
"eval_steps_per_second": 7.001, |
|
"step": 14160 |
|
}, |
|
{ |
|
"epoch": 13.036492999770484, |
|
"grad_norm": 2.195594549179077, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9259, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 13.128299288501262, |
|
"grad_norm": 1.8173458576202393, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8902, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 13.22010557723204, |
|
"grad_norm": 1.7481939792633057, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8859, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 13.311911865962818, |
|
"grad_norm": 1.938438892364502, |
|
"learning_rate": 3e-05, |
|
"loss": 0.899, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 13.403718154693596, |
|
"grad_norm": 1.8565011024475098, |
|
"learning_rate": 3e-05, |
|
"loss": 0.889, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 13.495524443424374, |
|
"grad_norm": 1.6509944200515747, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9182, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 13.587330732155152, |
|
"grad_norm": 1.9225726127624512, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9091, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 13.67913702088593, |
|
"grad_norm": 1.7917280197143555, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9137, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 13.770943309616708, |
|
"grad_norm": 2.0736453533172607, |
|
"learning_rate": 3e-05, |
|
"loss": 0.904, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 13.862749598347486, |
|
"grad_norm": 2.1191747188568115, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9146, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 13.954555887078264, |
|
"grad_norm": 1.8331027030944824, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9109, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 13.999540968556346, |
|
"eval_accuracy": 0.7097409024745269, |
|
"eval_loss": 1.12144935131073, |
|
"eval_runtime": 8.9899, |
|
"eval_samples_per_second": 55.618, |
|
"eval_steps_per_second": 7.008, |
|
"step": 15249 |
|
}, |
|
{ |
|
"epoch": 14.046362175809042, |
|
"grad_norm": 1.818524718284607, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8787, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 14.13816846453982, |
|
"grad_norm": 1.9324177503585815, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8487, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 14.229974753270598, |
|
"grad_norm": 1.952480435371399, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8526, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 14.321781042001376, |
|
"grad_norm": 1.9058892726898193, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8433, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 14.413587330732156, |
|
"grad_norm": 1.9198521375656128, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8591, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 14.505393619462934, |
|
"grad_norm": 2.374208927154541, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8643, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 14.597199908193712, |
|
"grad_norm": 1.8864604234695435, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8623, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 14.68900619692449, |
|
"grad_norm": 1.9877722263336182, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8569, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 14.780812485655268, |
|
"grad_norm": 2.204672336578369, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8629, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 14.872618774386046, |
|
"grad_norm": 1.790323257446289, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8542, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 14.964425063116824, |
|
"grad_norm": 1.8623679876327515, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8566, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 14.999311452834519, |
|
"eval_accuracy": 0.7127714701601164, |
|
"eval_loss": 1.0913478136062622, |
|
"eval_runtime": 8.9695, |
|
"eval_samples_per_second": 55.745, |
|
"eval_steps_per_second": 7.024, |
|
"step": 16338 |
|
}, |
|
{ |
|
"epoch": 15.056231351847602, |
|
"grad_norm": 1.6198936700820923, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8233, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 15.14803764057838, |
|
"grad_norm": 2.117966413497925, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8004, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 15.239843929309158, |
|
"grad_norm": 1.9046192169189453, |
|
"learning_rate": 3e-05, |
|
"loss": 0.783, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 15.331650218039936, |
|
"grad_norm": 1.8354123830795288, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8123, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 15.423456506770714, |
|
"grad_norm": 1.8810902833938599, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8062, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 15.515262795501492, |
|
"grad_norm": 2.2442831993103027, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8121, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 15.60706908423227, |
|
"grad_norm": 2.308647394180298, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8155, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 15.698875372963048, |
|
"grad_norm": 2.2714340686798096, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8211, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 15.790681661693826, |
|
"grad_norm": 1.9850467443466187, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8054, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 15.882487950424604, |
|
"grad_norm": 2.5280234813690186, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8198, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 15.974294239155382, |
|
"grad_norm": 2.184380292892456, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8307, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.7155633187772926, |
|
"eval_loss": 1.0599175691604614, |
|
"eval_runtime": 8.9521, |
|
"eval_samples_per_second": 55.853, |
|
"eval_steps_per_second": 7.037, |
|
"step": 17428 |
|
}, |
|
{ |
|
"epoch": 16.06610052788616, |
|
"grad_norm": 2.437701463699341, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7609, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 16.15790681661694, |
|
"grad_norm": 2.636090040206909, |
|
"learning_rate": 3e-05, |
|
"loss": 0.758, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 16.249713105347716, |
|
"grad_norm": 2.1846566200256348, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7439, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 16.341519394078496, |
|
"grad_norm": 2.1148085594177246, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7568, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 16.433325682809272, |
|
"grad_norm": 1.8323599100112915, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7678, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 16.525131971540052, |
|
"grad_norm": 2.67404842376709, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7719, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 16.616938260270828, |
|
"grad_norm": 2.3159210681915283, |
|
"learning_rate": 3e-05, |
|
"loss": 0.78, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 16.708744549001608, |
|
"grad_norm": 1.924141526222229, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7774, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 16.800550837732384, |
|
"grad_norm": 1.9718719720840454, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7714, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 16.892357126463164, |
|
"grad_norm": 2.0986855030059814, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7861, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 16.98416341519394, |
|
"grad_norm": 2.2935447692871094, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7803, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 16.999770484278173, |
|
"eval_accuracy": 0.7184133915574964, |
|
"eval_loss": 1.028311014175415, |
|
"eval_runtime": 8.9448, |
|
"eval_samples_per_second": 55.898, |
|
"eval_steps_per_second": 7.043, |
|
"step": 18517 |
|
}, |
|
{ |
|
"epoch": 17.07596970392472, |
|
"grad_norm": 2.364075183868408, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7264, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 17.167775992655496, |
|
"grad_norm": 2.1636979579925537, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7038, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 17.259582281386276, |
|
"grad_norm": 2.135673761367798, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7136, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 17.351388570117052, |
|
"grad_norm": 2.1516411304473877, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7231, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 17.443194858847832, |
|
"grad_norm": 2.499406337738037, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7302, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 17.53500114757861, |
|
"grad_norm": 2.455547332763672, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7407, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 17.626807436309388, |
|
"grad_norm": 2.248194932937622, |
|
"learning_rate": 3e-05, |
|
"loss": 0.722, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 17.718613725040164, |
|
"grad_norm": 2.3520660400390625, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7291, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 17.810420013770944, |
|
"grad_norm": 2.1547889709472656, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7317, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 17.90222630250172, |
|
"grad_norm": 2.608548402786255, |
|
"learning_rate": 3e-05, |
|
"loss": 0.738, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 17.9940325912325, |
|
"grad_norm": 2.2248220443725586, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7486, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 17.999540968556346, |
|
"eval_accuracy": 0.7214614264919942, |
|
"eval_loss": 0.9996564984321594, |
|
"eval_runtime": 8.9645, |
|
"eval_samples_per_second": 55.776, |
|
"eval_steps_per_second": 7.028, |
|
"step": 19606 |
|
}, |
|
{ |
|
"epoch": 18.085838879963276, |
|
"grad_norm": 2.197584867477417, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6731, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 18.177645168694056, |
|
"grad_norm": 2.392916440963745, |
|
"learning_rate": 3e-05, |
|
"loss": 0.676, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 18.269451457424832, |
|
"grad_norm": 2.4115874767303467, |
|
"learning_rate": 3e-05, |
|
"loss": 0.691, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 18.361257746155612, |
|
"grad_norm": 2.320349931716919, |
|
"learning_rate": 3e-05, |
|
"loss": 0.677, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 18.45306403488639, |
|
"grad_norm": 2.2987887859344482, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6857, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 18.544870323617168, |
|
"grad_norm": 2.541984796524048, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6787, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 18.636676612347944, |
|
"grad_norm": 2.0782082080841064, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6973, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 18.728482901078724, |
|
"grad_norm": 2.4935009479522705, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7083, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 18.8202891898095, |
|
"grad_norm": 2.8205904960632324, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6872, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 18.91209547854028, |
|
"grad_norm": 2.335952043533325, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6992, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 18.99931145283452, |
|
"eval_accuracy": 0.7237583697234352, |
|
"eval_loss": 0.971889078617096, |
|
"eval_runtime": 9.1348, |
|
"eval_samples_per_second": 54.736, |
|
"eval_steps_per_second": 6.897, |
|
"step": 20695 |
|
}, |
|
{ |
|
"epoch": 19.003901767271056, |
|
"grad_norm": 1.9122214317321777, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7018, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 19.095708056001836, |
|
"grad_norm": 2.1178972721099854, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6337, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 19.187514344732612, |
|
"grad_norm": 2.1954286098480225, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6294, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 19.279320633463392, |
|
"grad_norm": 2.2881522178649902, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6436, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 19.371126922194172, |
|
"grad_norm": 2.2738537788391113, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6584, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 19.462933210924948, |
|
"grad_norm": 2.3467330932617188, |
|
"learning_rate": 3e-05, |
|
"loss": 0.655, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 19.554739499655728, |
|
"grad_norm": 2.7984132766723633, |
|
"learning_rate": 3e-05, |
|
"loss": 0.647, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 19.646545788386504, |
|
"grad_norm": 2.397935152053833, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6642, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 19.738352077117284, |
|
"grad_norm": 2.7952253818511963, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6604, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 19.83015836584806, |
|
"grad_norm": 2.212345600128174, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6598, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 19.92196465457884, |
|
"grad_norm": 2.5237057209014893, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6632, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 19.995409685563462, |
|
"eval_accuracy": 0.7263318777292577, |
|
"eval_loss": 0.9440018534660339, |
|
"eval_runtime": 8.9833, |
|
"eval_samples_per_second": 55.659, |
|
"eval_steps_per_second": 7.013, |
|
"step": 21780 |
|
}, |
|
{ |
|
"epoch": 19.995409685563462, |
|
"step": 21780, |
|
"total_flos": 2.2953223726028554e+18, |
|
"train_loss": 1.0977098983400344, |
|
"train_runtime": 46950.9086, |
|
"train_samples_per_second": 14.847, |
|
"train_steps_per_second": 0.464 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 21780, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 500, |
|
"total_flos": 2.2953223726028554e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|