{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4370868163689013, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008741736327378026, "grad_norm": 19.75, "learning_rate": 8.620689655172415e-07, "loss": 4.7931, "step": 1 }, { "epoch": 0.0017483472654756052, "grad_norm": 21.75, "learning_rate": 1.724137931034483e-06, "loss": 4.937, "step": 2 }, { "epoch": 0.0026225208982134074, "grad_norm": 20.625, "learning_rate": 2.586206896551724e-06, "loss": 5.0176, "step": 3 }, { "epoch": 0.0034966945309512104, "grad_norm": 19.875, "learning_rate": 3.448275862068966e-06, "loss": 4.8236, "step": 4 }, { "epoch": 0.004370868163689013, "grad_norm": 16.875, "learning_rate": 4.310344827586207e-06, "loss": 4.761, "step": 5 }, { "epoch": 0.005245041796426815, "grad_norm": 16.125, "learning_rate": 5.172413793103448e-06, "loss": 4.9055, "step": 6 }, { "epoch": 0.006119215429164618, "grad_norm": 11.5625, "learning_rate": 6.03448275862069e-06, "loss": 4.6787, "step": 7 }, { "epoch": 0.006993389061902421, "grad_norm": 14.5625, "learning_rate": 6.896551724137932e-06, "loss": 4.6797, "step": 8 }, { "epoch": 0.007867562694640224, "grad_norm": 19.0, "learning_rate": 7.758620689655173e-06, "loss": 4.6406, "step": 9 }, { "epoch": 0.008741736327378026, "grad_norm": 12.0625, "learning_rate": 8.620689655172414e-06, "loss": 4.5986, "step": 10 }, { "epoch": 0.009615909960115828, "grad_norm": 7.8125, "learning_rate": 9.482758620689655e-06, "loss": 4.5762, "step": 11 }, { "epoch": 0.01049008359285363, "grad_norm": 6.875, "learning_rate": 1.0344827586206897e-05, "loss": 4.5882, "step": 12 }, { "epoch": 0.011364257225591434, "grad_norm": 6.0, "learning_rate": 1.1206896551724138e-05, "loss": 4.3547, "step": 13 }, { "epoch": 0.012238430858329236, "grad_norm": 6.125, "learning_rate": 1.206896551724138e-05, "loss": 4.2879, "step": 14 }, { "epoch": 0.013112604491067038, "grad_norm": 5.875, "learning_rate": 1.2931034482758622e-05, "loss": 4.3241, "step": 15 }, { "epoch": 0.013986778123804841, "grad_norm": 4.625, "learning_rate": 1.3793103448275863e-05, "loss": 4.386, "step": 16 }, { "epoch": 0.014860951756542643, "grad_norm": 3.390625, "learning_rate": 1.4655172413793103e-05, "loss": 4.266, "step": 17 }, { "epoch": 0.015735125389280447, "grad_norm": 3.296875, "learning_rate": 1.5517241379310346e-05, "loss": 4.2994, "step": 18 }, { "epoch": 0.01660929902201825, "grad_norm": 2.796875, "learning_rate": 1.6379310344827585e-05, "loss": 4.1198, "step": 19 }, { "epoch": 0.01748347265475605, "grad_norm": 4.0625, "learning_rate": 1.7241379310344828e-05, "loss": 4.1751, "step": 20 }, { "epoch": 0.018357646287493853, "grad_norm": 3.109375, "learning_rate": 1.810344827586207e-05, "loss": 3.9965, "step": 21 }, { "epoch": 0.019231819920231655, "grad_norm": 2.875, "learning_rate": 1.896551724137931e-05, "loss": 3.9684, "step": 22 }, { "epoch": 0.020105993552969458, "grad_norm": 2.953125, "learning_rate": 1.9827586206896554e-05, "loss": 3.9812, "step": 23 }, { "epoch": 0.02098016718570726, "grad_norm": 2.28125, "learning_rate": 2.0689655172413793e-05, "loss": 3.9997, "step": 24 }, { "epoch": 0.021854340818445065, "grad_norm": 2.53125, "learning_rate": 2.1551724137931033e-05, "loss": 3.9327, "step": 25 }, { "epoch": 0.022728514451182867, "grad_norm": 2.09375, "learning_rate": 2.2413793103448276e-05, "loss": 3.937, "step": 26 }, { "epoch": 0.02360268808392067, "grad_norm": 2.125, "learning_rate": 2.327586206896552e-05, 
"loss": 3.8923, "step": 27 }, { "epoch": 0.02447686171665847, "grad_norm": 1.6171875, "learning_rate": 2.413793103448276e-05, "loss": 3.8121, "step": 28 }, { "epoch": 0.025351035349396273, "grad_norm": 1.671875, "learning_rate": 2.5e-05, "loss": 3.8329, "step": 29 }, { "epoch": 0.026225208982134075, "grad_norm": 1.5625, "learning_rate": 2.5862068965517244e-05, "loss": 3.7719, "step": 30 }, { "epoch": 0.027099382614871877, "grad_norm": 1.59375, "learning_rate": 2.672413793103448e-05, "loss": 3.8168, "step": 31 }, { "epoch": 0.027973556247609683, "grad_norm": 1.6796875, "learning_rate": 2.7586206896551727e-05, "loss": 3.6792, "step": 32 }, { "epoch": 0.028847729880347485, "grad_norm": 1.421875, "learning_rate": 2.844827586206897e-05, "loss": 3.6723, "step": 33 }, { "epoch": 0.029721903513085287, "grad_norm": 1.3671875, "learning_rate": 2.9310344827586206e-05, "loss": 3.6362, "step": 34 }, { "epoch": 0.03059607714582309, "grad_norm": 1.671875, "learning_rate": 3.017241379310345e-05, "loss": 3.6452, "step": 35 }, { "epoch": 0.031470250778560895, "grad_norm": 1.8046875, "learning_rate": 3.103448275862069e-05, "loss": 3.5118, "step": 36 }, { "epoch": 0.03234442441129869, "grad_norm": 1.8125, "learning_rate": 3.1896551724137935e-05, "loss": 3.4852, "step": 37 }, { "epoch": 0.0332185980440365, "grad_norm": 1.484375, "learning_rate": 3.275862068965517e-05, "loss": 3.3851, "step": 38 }, { "epoch": 0.0340927716767743, "grad_norm": 1.3359375, "learning_rate": 3.3620689655172414e-05, "loss": 3.3676, "step": 39 }, { "epoch": 0.0349669453095121, "grad_norm": 1.4375, "learning_rate": 3.4482758620689657e-05, "loss": 3.4513, "step": 40 }, { "epoch": 0.0358411189422499, "grad_norm": 1.7421875, "learning_rate": 3.53448275862069e-05, "loss": 3.3572, "step": 41 }, { "epoch": 0.03671529257498771, "grad_norm": 1.6796875, "learning_rate": 3.620689655172414e-05, "loss": 3.229, "step": 42 }, { "epoch": 0.03758946620772551, "grad_norm": 1.3046875, "learning_rate": 3.7068965517241385e-05, "loss": 3.2683, "step": 43 }, { "epoch": 0.03846363984046331, "grad_norm": 2.078125, "learning_rate": 3.793103448275862e-05, "loss": 3.2209, "step": 44 }, { "epoch": 0.039337813473201116, "grad_norm": 1.3515625, "learning_rate": 3.8793103448275865e-05, "loss": 3.3169, "step": 45 }, { "epoch": 0.040211987105938915, "grad_norm": 1.3046875, "learning_rate": 3.965517241379311e-05, "loss": 3.2609, "step": 46 }, { "epoch": 0.04108616073867672, "grad_norm": 1.75, "learning_rate": 4.0517241379310344e-05, "loss": 3.2348, "step": 47 }, { "epoch": 0.04196033437141452, "grad_norm": 1.5546875, "learning_rate": 4.1379310344827587e-05, "loss": 3.2157, "step": 48 }, { "epoch": 0.042834508004152325, "grad_norm": 1.3046875, "learning_rate": 4.224137931034483e-05, "loss": 3.118, "step": 49 }, { "epoch": 0.04370868163689013, "grad_norm": 1.265625, "learning_rate": 4.3103448275862066e-05, "loss": 3.1812, "step": 50 }, { "epoch": 0.04458285526962793, "grad_norm": 1.28125, "learning_rate": 4.396551724137931e-05, "loss": 3.1164, "step": 51 }, { "epoch": 0.045457028902365734, "grad_norm": 1.296875, "learning_rate": 4.482758620689655e-05, "loss": 3.1016, "step": 52 }, { "epoch": 0.04633120253510353, "grad_norm": 1.3046875, "learning_rate": 4.5689655172413794e-05, "loss": 3.0119, "step": 53 }, { "epoch": 0.04720537616784134, "grad_norm": 1.265625, "learning_rate": 4.655172413793104e-05, "loss": 3.0376, "step": 54 }, { "epoch": 0.04807954980057914, "grad_norm": 1.359375, "learning_rate": 4.741379310344828e-05, "loss": 3.0525, "step": 55 }, { "epoch": 
0.04895372343331694, "grad_norm": 1.21875, "learning_rate": 4.827586206896552e-05, "loss": 3.0417, "step": 56 }, { "epoch": 0.04982789706605475, "grad_norm": 1.1640625, "learning_rate": 4.913793103448276e-05, "loss": 2.9921, "step": 57 }, { "epoch": 0.050702070698792547, "grad_norm": 1.390625, "learning_rate": 5e-05, "loss": 2.9874, "step": 58 }, { "epoch": 0.05157624433153035, "grad_norm": 1.546875, "learning_rate": 4.9999895202727756e-05, "loss": 2.9822, "step": 59 }, { "epoch": 0.05245041796426815, "grad_norm": 1.4375, "learning_rate": 4.9999580811789614e-05, "loss": 2.9278, "step": 60 }, { "epoch": 0.053324591597005956, "grad_norm": 1.171875, "learning_rate": 4.999905682982135e-05, "loss": 2.9424, "step": 61 }, { "epoch": 0.054198765229743755, "grad_norm": 1.21875, "learning_rate": 4.999832326121594e-05, "loss": 2.8771, "step": 62 }, { "epoch": 0.05507293886248156, "grad_norm": 1.15625, "learning_rate": 4.999738011212344e-05, "loss": 2.9132, "step": 63 }, { "epoch": 0.055947112495219366, "grad_norm": 1.15625, "learning_rate": 4.999622739045101e-05, "loss": 2.9479, "step": 64 }, { "epoch": 0.056821286127957164, "grad_norm": 1.2890625, "learning_rate": 4.999486510586282e-05, "loss": 2.948, "step": 65 }, { "epoch": 0.05769545976069497, "grad_norm": 1.3828125, "learning_rate": 4.9993293269779975e-05, "loss": 2.948, "step": 66 }, { "epoch": 0.05856963339343277, "grad_norm": 1.28125, "learning_rate": 4.9991511895380396e-05, "loss": 2.9111, "step": 67 }, { "epoch": 0.059443807026170574, "grad_norm": 1.375, "learning_rate": 4.998952099759874e-05, "loss": 2.9496, "step": 68 }, { "epoch": 0.06031798065890837, "grad_norm": 1.125, "learning_rate": 4.998732059312625e-05, "loss": 2.8007, "step": 69 }, { "epoch": 0.06119215429164618, "grad_norm": 1.40625, "learning_rate": 4.998491070041066e-05, "loss": 2.8642, "step": 70 }, { "epoch": 0.062066327924383984, "grad_norm": 1.21875, "learning_rate": 4.998229133965596e-05, "loss": 2.8762, "step": 71 }, { "epoch": 0.06294050155712179, "grad_norm": 1.109375, "learning_rate": 4.997946253282231e-05, "loss": 2.8961, "step": 72 }, { "epoch": 0.06381467518985959, "grad_norm": 1.25, "learning_rate": 4.9976424303625815e-05, "loss": 2.825, "step": 73 }, { "epoch": 0.06468884882259739, "grad_norm": 1.1953125, "learning_rate": 4.997317667753831e-05, "loss": 2.8532, "step": 74 }, { "epoch": 0.06556302245533518, "grad_norm": 1.09375, "learning_rate": 4.9969719681787196e-05, "loss": 2.9245, "step": 75 }, { "epoch": 0.066437196088073, "grad_norm": 1.0859375, "learning_rate": 4.9966053345355174e-05, "loss": 2.7549, "step": 76 }, { "epoch": 0.0673113697208108, "grad_norm": 1.046875, "learning_rate": 4.9962177698979995e-05, "loss": 2.8295, "step": 77 }, { "epoch": 0.0681855433535486, "grad_norm": 1.1328125, "learning_rate": 4.995809277515424e-05, "loss": 2.7792, "step": 78 }, { "epoch": 0.0690597169862864, "grad_norm": 1.1015625, "learning_rate": 4.9953798608125025e-05, "loss": 2.7635, "step": 79 }, { "epoch": 0.0699338906190242, "grad_norm": 1.3203125, "learning_rate": 4.99492952338937e-05, "loss": 2.8317, "step": 80 }, { "epoch": 0.070808064251762, "grad_norm": 1.1171875, "learning_rate": 4.994458269021557e-05, "loss": 2.7627, "step": 81 }, { "epoch": 0.0716822378844998, "grad_norm": 1.09375, "learning_rate": 4.993966101659958e-05, "loss": 2.8273, "step": 82 }, { "epoch": 0.07255641151723762, "grad_norm": 1.1171875, "learning_rate": 4.993453025430797e-05, "loss": 2.8587, "step": 83 }, { "epoch": 0.07343058514997541, "grad_norm": 1.1953125, "learning_rate": 
4.992919044635592e-05, "loss": 2.8023, "step": 84 }, { "epoch": 0.07430475878271321, "grad_norm": 1.140625, "learning_rate": 4.9923641637511226e-05, "loss": 2.6944, "step": 85 }, { "epoch": 0.07517893241545102, "grad_norm": 1.1640625, "learning_rate": 4.991788387429388e-05, "loss": 2.7955, "step": 86 }, { "epoch": 0.07605310604818882, "grad_norm": 1.3671875, "learning_rate": 4.9911917204975724e-05, "loss": 2.8184, "step": 87 }, { "epoch": 0.07692727968092662, "grad_norm": 1.3671875, "learning_rate": 4.9905741679580007e-05, "loss": 2.8002, "step": 88 }, { "epoch": 0.07780145331366442, "grad_norm": 1.1328125, "learning_rate": 4.989935734988098e-05, "loss": 2.7749, "step": 89 }, { "epoch": 0.07867562694640223, "grad_norm": 1.2578125, "learning_rate": 4.989276426940348e-05, "loss": 2.8351, "step": 90 }, { "epoch": 0.07954980057914003, "grad_norm": 0.9609375, "learning_rate": 4.988596249342244e-05, "loss": 2.7638, "step": 91 }, { "epoch": 0.08042397421187783, "grad_norm": 1.2265625, "learning_rate": 4.987895207896248e-05, "loss": 2.7492, "step": 92 }, { "epoch": 0.08129814784461564, "grad_norm": 1.1328125, "learning_rate": 4.987173308479738e-05, "loss": 2.7668, "step": 93 }, { "epoch": 0.08217232147735344, "grad_norm": 1.2265625, "learning_rate": 4.9864305571449616e-05, "loss": 2.7527, "step": 94 }, { "epoch": 0.08304649511009124, "grad_norm": 1.109375, "learning_rate": 4.985666960118983e-05, "loss": 2.7963, "step": 95 }, { "epoch": 0.08392066874282904, "grad_norm": 1.109375, "learning_rate": 4.984882523803634e-05, "loss": 2.7924, "step": 96 }, { "epoch": 0.08479484237556685, "grad_norm": 1.4375, "learning_rate": 4.9840772547754566e-05, "loss": 2.763, "step": 97 }, { "epoch": 0.08566901600830465, "grad_norm": 1.203125, "learning_rate": 4.983251159785651e-05, "loss": 2.7398, "step": 98 }, { "epoch": 0.08654318964104245, "grad_norm": 1.1875, "learning_rate": 4.982404245760018e-05, "loss": 2.7528, "step": 99 }, { "epoch": 0.08741736327378026, "grad_norm": 1.1328125, "learning_rate": 4.9815365197988986e-05, "loss": 2.8205, "step": 100 }, { "epoch": 0.08829153690651806, "grad_norm": 1.109375, "learning_rate": 4.9806479891771195e-05, "loss": 2.7228, "step": 101 }, { "epoch": 0.08916571053925586, "grad_norm": 1.03125, "learning_rate": 4.9797386613439265e-05, "loss": 2.7599, "step": 102 }, { "epoch": 0.09003988417199366, "grad_norm": 1.0703125, "learning_rate": 4.978808543922925e-05, "loss": 2.7388, "step": 103 }, { "epoch": 0.09091405780473147, "grad_norm": 1.1484375, "learning_rate": 4.9778576447120184e-05, "loss": 2.7801, "step": 104 }, { "epoch": 0.09178823143746927, "grad_norm": 0.97265625, "learning_rate": 4.976885971683337e-05, "loss": 2.656, "step": 105 }, { "epoch": 0.09266240507020707, "grad_norm": 1.078125, "learning_rate": 4.9758935329831754e-05, "loss": 2.763, "step": 106 }, { "epoch": 0.09353657870294488, "grad_norm": 1.078125, "learning_rate": 4.974880336931923e-05, "loss": 2.7975, "step": 107 }, { "epoch": 0.09441075233568268, "grad_norm": 1.078125, "learning_rate": 4.9738463920239955e-05, "loss": 2.7029, "step": 108 }, { "epoch": 0.09528492596842048, "grad_norm": 1.0546875, "learning_rate": 4.972791706927759e-05, "loss": 2.689, "step": 109 }, { "epoch": 0.09615909960115827, "grad_norm": 1.0546875, "learning_rate": 4.9717162904854664e-05, "loss": 2.7322, "step": 110 }, { "epoch": 0.09703327323389609, "grad_norm": 1.0546875, "learning_rate": 4.9706201517131725e-05, "loss": 2.778, "step": 111 }, { "epoch": 0.09790744686663388, "grad_norm": 1.0703125, "learning_rate": 
4.9695032998006655e-05, "loss": 2.8284, "step": 112 }, { "epoch": 0.09878162049937168, "grad_norm": 1.0546875, "learning_rate": 4.9683657441113884e-05, "loss": 2.71, "step": 113 }, { "epoch": 0.0996557941321095, "grad_norm": 1.2109375, "learning_rate": 4.967207494182361e-05, "loss": 2.6782, "step": 114 }, { "epoch": 0.1005299677648473, "grad_norm": 1.2421875, "learning_rate": 4.966028559724096e-05, "loss": 2.706, "step": 115 }, { "epoch": 0.10140414139758509, "grad_norm": 1.09375, "learning_rate": 4.964828950620524e-05, "loss": 2.7667, "step": 116 }, { "epoch": 0.10227831503032289, "grad_norm": 0.94921875, "learning_rate": 4.963608676928905e-05, "loss": 2.685, "step": 117 }, { "epoch": 0.1031524886630607, "grad_norm": 1.0234375, "learning_rate": 4.962367748879748e-05, "loss": 2.6407, "step": 118 }, { "epoch": 0.1040266622957985, "grad_norm": 1.1015625, "learning_rate": 4.961106176876723e-05, "loss": 2.662, "step": 119 }, { "epoch": 0.1049008359285363, "grad_norm": 1.0078125, "learning_rate": 4.959823971496574e-05, "loss": 2.7101, "step": 120 }, { "epoch": 0.10577500956127411, "grad_norm": 1.0390625, "learning_rate": 4.958521143489032e-05, "loss": 2.7607, "step": 121 }, { "epoch": 0.10664918319401191, "grad_norm": 1.0078125, "learning_rate": 4.9571977037767217e-05, "loss": 2.6531, "step": 122 }, { "epoch": 0.10752335682674971, "grad_norm": 0.98828125, "learning_rate": 4.955853663455072e-05, "loss": 2.6706, "step": 123 }, { "epoch": 0.10839753045948751, "grad_norm": 1.0390625, "learning_rate": 4.954489033792227e-05, "loss": 2.6516, "step": 124 }, { "epoch": 0.10927170409222532, "grad_norm": 1.1015625, "learning_rate": 4.95310382622894e-05, "loss": 2.6962, "step": 125 }, { "epoch": 0.11014587772496312, "grad_norm": 1.015625, "learning_rate": 4.951698052378492e-05, "loss": 2.702, "step": 126 }, { "epoch": 0.11102005135770092, "grad_norm": 1.0390625, "learning_rate": 4.950271724026582e-05, "loss": 2.6833, "step": 127 }, { "epoch": 0.11189422499043873, "grad_norm": 1.234375, "learning_rate": 4.948824853131236e-05, "loss": 2.691, "step": 128 }, { "epoch": 0.11276839862317653, "grad_norm": 0.9921875, "learning_rate": 4.947357451822706e-05, "loss": 2.64, "step": 129 }, { "epoch": 0.11364257225591433, "grad_norm": 1.09375, "learning_rate": 4.945869532403362e-05, "loss": 2.6507, "step": 130 }, { "epoch": 0.11451674588865213, "grad_norm": 0.984375, "learning_rate": 4.944361107347597e-05, "loss": 2.7446, "step": 131 }, { "epoch": 0.11539091952138994, "grad_norm": 1.09375, "learning_rate": 4.942832189301716e-05, "loss": 2.6651, "step": 132 }, { "epoch": 0.11626509315412774, "grad_norm": 1.1015625, "learning_rate": 4.941282791083836e-05, "loss": 2.6495, "step": 133 }, { "epoch": 0.11713926678686554, "grad_norm": 1.109375, "learning_rate": 4.9397129256837724e-05, "loss": 2.6474, "step": 134 }, { "epoch": 0.11801344041960335, "grad_norm": 1.171875, "learning_rate": 4.938122606262936e-05, "loss": 2.6893, "step": 135 }, { "epoch": 0.11888761405234115, "grad_norm": 1.1171875, "learning_rate": 4.936511846154215e-05, "loss": 2.6667, "step": 136 }, { "epoch": 0.11976178768507895, "grad_norm": 1.0234375, "learning_rate": 4.934880658861872e-05, "loss": 2.7114, "step": 137 }, { "epoch": 0.12063596131781675, "grad_norm": 1.0859375, "learning_rate": 4.933229058061425e-05, "loss": 2.6641, "step": 138 }, { "epoch": 0.12151013495055456, "grad_norm": 1.109375, "learning_rate": 4.9315570575995364e-05, "loss": 2.7359, "step": 139 }, { "epoch": 0.12238430858329236, "grad_norm": 1.0703125, "learning_rate": 
4.92986467149389e-05, "loss": 2.6406, "step": 140 }, { "epoch": 0.12325848221603015, "grad_norm": 1.0, "learning_rate": 4.9281519139330846e-05, "loss": 2.6395, "step": 141 }, { "epoch": 0.12413265584876797, "grad_norm": 1.1015625, "learning_rate": 4.926418799276504e-05, "loss": 2.664, "step": 142 }, { "epoch": 0.12500682948150577, "grad_norm": 0.99609375, "learning_rate": 4.924665342054204e-05, "loss": 2.6725, "step": 143 }, { "epoch": 0.12588100311424358, "grad_norm": 1.0625, "learning_rate": 4.922891556966788e-05, "loss": 2.6244, "step": 144 }, { "epoch": 0.12675517674698136, "grad_norm": 1.078125, "learning_rate": 4.921097458885282e-05, "loss": 2.6786, "step": 145 }, { "epoch": 0.12762935037971918, "grad_norm": 0.9765625, "learning_rate": 4.9192830628510126e-05, "loss": 2.7084, "step": 146 }, { "epoch": 0.128503524012457, "grad_norm": 1.1640625, "learning_rate": 4.9174483840754815e-05, "loss": 2.688, "step": 147 }, { "epoch": 0.12937769764519477, "grad_norm": 0.99609375, "learning_rate": 4.9155934379402335e-05, "loss": 2.6582, "step": 148 }, { "epoch": 0.13025187127793258, "grad_norm": 1.0625, "learning_rate": 4.9137182399967343e-05, "loss": 2.6099, "step": 149 }, { "epoch": 0.13112604491067037, "grad_norm": 0.96484375, "learning_rate": 4.911822805966232e-05, "loss": 2.6315, "step": 150 }, { "epoch": 0.13200021854340818, "grad_norm": 1.109375, "learning_rate": 4.909907151739633e-05, "loss": 2.6418, "step": 151 }, { "epoch": 0.132874392176146, "grad_norm": 0.94140625, "learning_rate": 4.907971293377365e-05, "loss": 2.6344, "step": 152 }, { "epoch": 0.13374856580888378, "grad_norm": 1.078125, "learning_rate": 4.9060152471092414e-05, "loss": 2.6904, "step": 153 }, { "epoch": 0.1346227394416216, "grad_norm": 1.0625, "learning_rate": 4.904039029334326e-05, "loss": 2.6464, "step": 154 }, { "epoch": 0.1354969130743594, "grad_norm": 0.95703125, "learning_rate": 4.9020426566207997e-05, "loss": 2.6811, "step": 155 }, { "epoch": 0.1363710867070972, "grad_norm": 1.0078125, "learning_rate": 4.900026145705815e-05, "loss": 2.6346, "step": 156 }, { "epoch": 0.137245260339835, "grad_norm": 0.9375, "learning_rate": 4.897989513495358e-05, "loss": 2.6762, "step": 157 }, { "epoch": 0.1381194339725728, "grad_norm": 1.0546875, "learning_rate": 4.89593277706411e-05, "loss": 2.6383, "step": 158 }, { "epoch": 0.1389936076053106, "grad_norm": 0.9765625, "learning_rate": 4.8938559536552994e-05, "loss": 2.634, "step": 159 }, { "epoch": 0.1398677812380484, "grad_norm": 1.0078125, "learning_rate": 4.891759060680562e-05, "loss": 2.6626, "step": 160 }, { "epoch": 0.14074195487078622, "grad_norm": 0.9609375, "learning_rate": 4.8896421157197896e-05, "loss": 2.664, "step": 161 }, { "epoch": 0.141616128503524, "grad_norm": 1.046875, "learning_rate": 4.887505136520987e-05, "loss": 2.6787, "step": 162 }, { "epoch": 0.14249030213626182, "grad_norm": 1.015625, "learning_rate": 4.885348141000122e-05, "loss": 2.6107, "step": 163 }, { "epoch": 0.1433644757689996, "grad_norm": 0.97265625, "learning_rate": 4.883171147240975e-05, "loss": 2.6128, "step": 164 }, { "epoch": 0.14423864940173742, "grad_norm": 1.0859375, "learning_rate": 4.880974173494984e-05, "loss": 2.6087, "step": 165 }, { "epoch": 0.14511282303447523, "grad_norm": 1.0234375, "learning_rate": 4.8787572381811e-05, "loss": 2.6377, "step": 166 }, { "epoch": 0.14598699666721301, "grad_norm": 1.0078125, "learning_rate": 4.876520359885624e-05, "loss": 2.6326, "step": 167 }, { "epoch": 0.14686117029995083, "grad_norm": 1.015625, "learning_rate": 4.874263557362056e-05, "loss": 
2.6361, "step": 168 }, { "epoch": 0.14773534393268864, "grad_norm": 1.0859375, "learning_rate": 4.871986849530934e-05, "loss": 2.7243, "step": 169 }, { "epoch": 0.14860951756542642, "grad_norm": 0.96875, "learning_rate": 4.869690255479682e-05, "loss": 2.6845, "step": 170 }, { "epoch": 0.14948369119816424, "grad_norm": 1.0234375, "learning_rate": 4.867373794462442e-05, "loss": 2.6677, "step": 171 }, { "epoch": 0.15035786483090205, "grad_norm": 1.046875, "learning_rate": 4.8650374858999185e-05, "loss": 2.659, "step": 172 }, { "epoch": 0.15123203846363983, "grad_norm": 0.9921875, "learning_rate": 4.862681349379212e-05, "loss": 2.6327, "step": 173 }, { "epoch": 0.15210621209637765, "grad_norm": 1.2265625, "learning_rate": 4.860305404653657e-05, "loss": 2.6229, "step": 174 }, { "epoch": 0.15298038572911546, "grad_norm": 1.0234375, "learning_rate": 4.857909671642656e-05, "loss": 2.618, "step": 175 }, { "epoch": 0.15385455936185324, "grad_norm": 1.0078125, "learning_rate": 4.8554941704315116e-05, "loss": 2.5778, "step": 176 }, { "epoch": 0.15472873299459106, "grad_norm": 0.94921875, "learning_rate": 4.853058921271259e-05, "loss": 2.5795, "step": 177 }, { "epoch": 0.15560290662732884, "grad_norm": 1.0390625, "learning_rate": 4.850603944578494e-05, "loss": 2.6069, "step": 178 }, { "epoch": 0.15647708026006665, "grad_norm": 0.9765625, "learning_rate": 4.848129260935208e-05, "loss": 2.6211, "step": 179 }, { "epoch": 0.15735125389280447, "grad_norm": 1.1171875, "learning_rate": 4.845634891088608e-05, "loss": 2.601, "step": 180 }, { "epoch": 0.15822542752554225, "grad_norm": 0.98828125, "learning_rate": 4.8431208559509456e-05, "loss": 2.6104, "step": 181 }, { "epoch": 0.15909960115828006, "grad_norm": 1.0625, "learning_rate": 4.8405871765993433e-05, "loss": 2.6695, "step": 182 }, { "epoch": 0.15997377479101788, "grad_norm": 1.1484375, "learning_rate": 4.8380338742756157e-05, "loss": 2.6339, "step": 183 }, { "epoch": 0.16084794842375566, "grad_norm": 0.9609375, "learning_rate": 4.835460970386093e-05, "loss": 2.6176, "step": 184 }, { "epoch": 0.16172212205649347, "grad_norm": 1.1015625, "learning_rate": 4.8328684865014386e-05, "loss": 2.6188, "step": 185 }, { "epoch": 0.16259629568923128, "grad_norm": 0.9140625, "learning_rate": 4.830256444356473e-05, "loss": 2.5651, "step": 186 }, { "epoch": 0.16347046932196907, "grad_norm": 1.078125, "learning_rate": 4.827624865849987e-05, "loss": 2.6513, "step": 187 }, { "epoch": 0.16434464295470688, "grad_norm": 0.953125, "learning_rate": 4.82497377304456e-05, "loss": 2.6408, "step": 188 }, { "epoch": 0.1652188165874447, "grad_norm": 1.015625, "learning_rate": 4.822303188166377e-05, "loss": 2.6039, "step": 189 }, { "epoch": 0.16609299022018248, "grad_norm": 0.921875, "learning_rate": 4.819613133605036e-05, "loss": 2.6749, "step": 190 }, { "epoch": 0.1669671638529203, "grad_norm": 1.171875, "learning_rate": 4.816903631913372e-05, "loss": 2.602, "step": 191 }, { "epoch": 0.16784133748565808, "grad_norm": 0.9921875, "learning_rate": 4.814174705807252e-05, "loss": 2.5986, "step": 192 }, { "epoch": 0.1687155111183959, "grad_norm": 1.046875, "learning_rate": 4.811426378165398e-05, "loss": 2.5921, "step": 193 }, { "epoch": 0.1695896847511337, "grad_norm": 0.984375, "learning_rate": 4.808658672029189e-05, "loss": 2.5958, "step": 194 }, { "epoch": 0.17046385838387149, "grad_norm": 1.2265625, "learning_rate": 4.8058716106024705e-05, "loss": 2.5892, "step": 195 }, { "epoch": 0.1713380320166093, "grad_norm": 0.99609375, "learning_rate": 4.803065217251357e-05, "loss": 2.5633, 
"step": 196 }, { "epoch": 0.1722122056493471, "grad_norm": 0.98046875, "learning_rate": 4.800239515504036e-05, "loss": 2.6577, "step": 197 }, { "epoch": 0.1730863792820849, "grad_norm": 0.94921875, "learning_rate": 4.7973945290505766e-05, "loss": 2.6721, "step": 198 }, { "epoch": 0.1739605529148227, "grad_norm": 1.0625, "learning_rate": 4.794530281742724e-05, "loss": 2.6837, "step": 199 }, { "epoch": 0.17483472654756052, "grad_norm": 0.9296875, "learning_rate": 4.791646797593702e-05, "loss": 2.5801, "step": 200 }, { "epoch": 0.1757089001802983, "grad_norm": 1.0078125, "learning_rate": 4.7887441007780123e-05, "loss": 2.5675, "step": 201 }, { "epoch": 0.17658307381303612, "grad_norm": 0.953125, "learning_rate": 4.7858222156312316e-05, "loss": 2.6157, "step": 202 }, { "epoch": 0.17745724744577393, "grad_norm": 0.95703125, "learning_rate": 4.782881166649808e-05, "loss": 2.6109, "step": 203 }, { "epoch": 0.17833142107851171, "grad_norm": 0.9609375, "learning_rate": 4.779920978490854e-05, "loss": 2.5524, "step": 204 }, { "epoch": 0.17920559471124953, "grad_norm": 0.953125, "learning_rate": 4.776941675971941e-05, "loss": 2.6292, "step": 205 }, { "epoch": 0.1800797683439873, "grad_norm": 0.94140625, "learning_rate": 4.773943284070892e-05, "loss": 2.5868, "step": 206 }, { "epoch": 0.18095394197672512, "grad_norm": 1.0234375, "learning_rate": 4.7709258279255696e-05, "loss": 2.5811, "step": 207 }, { "epoch": 0.18182811560946294, "grad_norm": 0.98828125, "learning_rate": 4.767889332833667e-05, "loss": 2.6033, "step": 208 }, { "epoch": 0.18270228924220072, "grad_norm": 0.953125, "learning_rate": 4.764833824252498e-05, "loss": 2.5816, "step": 209 }, { "epoch": 0.18357646287493853, "grad_norm": 1.03125, "learning_rate": 4.7617593277987794e-05, "loss": 2.6657, "step": 210 }, { "epoch": 0.18445063650767635, "grad_norm": 0.90234375, "learning_rate": 4.758665869248417e-05, "loss": 2.5748, "step": 211 }, { "epoch": 0.18532481014041413, "grad_norm": 0.9375, "learning_rate": 4.755553474536294e-05, "loss": 2.6091, "step": 212 }, { "epoch": 0.18619898377315194, "grad_norm": 0.921875, "learning_rate": 4.752422169756048e-05, "loss": 2.5747, "step": 213 }, { "epoch": 0.18707315740588976, "grad_norm": 0.8984375, "learning_rate": 4.749271981159855e-05, "loss": 2.6302, "step": 214 }, { "epoch": 0.18794733103862754, "grad_norm": 0.9296875, "learning_rate": 4.7461029351582076e-05, "loss": 2.6072, "step": 215 }, { "epoch": 0.18882150467136535, "grad_norm": 1.0078125, "learning_rate": 4.7429150583196976e-05, "loss": 2.6458, "step": 216 }, { "epoch": 0.18969567830410317, "grad_norm": 0.94140625, "learning_rate": 4.739708377370789e-05, "loss": 2.5746, "step": 217 }, { "epoch": 0.19056985193684095, "grad_norm": 0.9375, "learning_rate": 4.736482919195593e-05, "loss": 2.5883, "step": 218 }, { "epoch": 0.19144402556957876, "grad_norm": 0.96484375, "learning_rate": 4.733238710835648e-05, "loss": 2.657, "step": 219 }, { "epoch": 0.19231819920231655, "grad_norm": 0.91796875, "learning_rate": 4.729975779489689e-05, "loss": 2.6394, "step": 220 }, { "epoch": 0.19319237283505436, "grad_norm": 0.98046875, "learning_rate": 4.7266941525134215e-05, "loss": 2.6204, "step": 221 }, { "epoch": 0.19406654646779217, "grad_norm": 0.98828125, "learning_rate": 4.7233938574192894e-05, "loss": 2.5254, "step": 222 }, { "epoch": 0.19494072010052996, "grad_norm": 0.96484375, "learning_rate": 4.720074921876245e-05, "loss": 2.5567, "step": 223 }, { "epoch": 0.19581489373326777, "grad_norm": 1.125, "learning_rate": 4.716737373709521e-05, "loss": 2.6215, 
"step": 224 }, { "epoch": 0.19668906736600558, "grad_norm": 0.9453125, "learning_rate": 4.713381240900394e-05, "loss": 2.5763, "step": 225 }, { "epoch": 0.19756324099874337, "grad_norm": 1.1484375, "learning_rate": 4.710006551585946e-05, "loss": 2.6087, "step": 226 }, { "epoch": 0.19843741463148118, "grad_norm": 0.89453125, "learning_rate": 4.7066133340588394e-05, "loss": 2.5327, "step": 227 }, { "epoch": 0.199311588264219, "grad_norm": 0.95703125, "learning_rate": 4.703201616767067e-05, "loss": 2.5569, "step": 228 }, { "epoch": 0.20018576189695678, "grad_norm": 0.9765625, "learning_rate": 4.699771428313722e-05, "loss": 2.5719, "step": 229 }, { "epoch": 0.2010599355296946, "grad_norm": 0.9296875, "learning_rate": 4.696322797456757e-05, "loss": 2.5906, "step": 230 }, { "epoch": 0.2019341091624324, "grad_norm": 0.9296875, "learning_rate": 4.69285575310874e-05, "loss": 2.5452, "step": 231 }, { "epoch": 0.20280828279517019, "grad_norm": 1.03125, "learning_rate": 4.689370324336615e-05, "loss": 2.6078, "step": 232 }, { "epoch": 0.203682456427908, "grad_norm": 0.890625, "learning_rate": 4.685866540361456e-05, "loss": 2.561, "step": 233 }, { "epoch": 0.20455663006064578, "grad_norm": 0.91796875, "learning_rate": 4.682344430558222e-05, "loss": 2.6126, "step": 234 }, { "epoch": 0.2054308036933836, "grad_norm": 0.9140625, "learning_rate": 4.6788040244555145e-05, "loss": 2.6181, "step": 235 }, { "epoch": 0.2063049773261214, "grad_norm": 0.88671875, "learning_rate": 4.6752453517353245e-05, "loss": 2.5554, "step": 236 }, { "epoch": 0.2071791509588592, "grad_norm": 0.91796875, "learning_rate": 4.6716684422327886e-05, "loss": 2.5949, "step": 237 }, { "epoch": 0.208053324591597, "grad_norm": 0.94140625, "learning_rate": 4.6680733259359346e-05, "loss": 2.5931, "step": 238 }, { "epoch": 0.20892749822433482, "grad_norm": 0.8515625, "learning_rate": 4.6644600329854325e-05, "loss": 2.5865, "step": 239 }, { "epoch": 0.2098016718570726, "grad_norm": 0.9375, "learning_rate": 4.6608285936743445e-05, "loss": 2.5658, "step": 240 }, { "epoch": 0.21067584548981041, "grad_norm": 0.88671875, "learning_rate": 4.657179038447862e-05, "loss": 2.5902, "step": 241 }, { "epoch": 0.21155001912254823, "grad_norm": 0.89453125, "learning_rate": 4.653511397903063e-05, "loss": 2.5303, "step": 242 }, { "epoch": 0.212424192755286, "grad_norm": 0.890625, "learning_rate": 4.649825702788643e-05, "loss": 2.6264, "step": 243 }, { "epoch": 0.21329836638802382, "grad_norm": 0.95703125, "learning_rate": 4.6461219840046654e-05, "loss": 2.5831, "step": 244 }, { "epoch": 0.21417254002076164, "grad_norm": 0.90234375, "learning_rate": 4.642400272602302e-05, "loss": 2.6215, "step": 245 }, { "epoch": 0.21504671365349942, "grad_norm": 0.921875, "learning_rate": 4.638660599783567e-05, "loss": 2.5877, "step": 246 }, { "epoch": 0.21592088728623723, "grad_norm": 0.8984375, "learning_rate": 4.6349029969010644e-05, "loss": 2.5607, "step": 247 }, { "epoch": 0.21679506091897502, "grad_norm": 0.90234375, "learning_rate": 4.631127495457713e-05, "loss": 2.5615, "step": 248 }, { "epoch": 0.21766923455171283, "grad_norm": 0.9453125, "learning_rate": 4.6273341271064965e-05, "loss": 2.6131, "step": 249 }, { "epoch": 0.21854340818445064, "grad_norm": 0.890625, "learning_rate": 4.6235229236501845e-05, "loss": 2.6152, "step": 250 }, { "epoch": 0.21941758181718843, "grad_norm": 0.9296875, "learning_rate": 4.619693917041076e-05, "loss": 2.5947, "step": 251 }, { "epoch": 0.22029175544992624, "grad_norm": 0.9296875, "learning_rate": 4.615847139380728e-05, "loss": 2.6395, 
"step": 252 }, { "epoch": 0.22116592908266405, "grad_norm": 0.8828125, "learning_rate": 4.611982622919683e-05, "loss": 2.5855, "step": 253 }, { "epoch": 0.22204010271540184, "grad_norm": 0.875, "learning_rate": 4.608100400057206e-05, "loss": 2.5098, "step": 254 }, { "epoch": 0.22291427634813965, "grad_norm": 0.9296875, "learning_rate": 4.604200503341004e-05, "loss": 2.6061, "step": 255 }, { "epoch": 0.22378844998087746, "grad_norm": 0.93359375, "learning_rate": 4.6002829654669616e-05, "loss": 2.5075, "step": 256 }, { "epoch": 0.22466262361361525, "grad_norm": 0.89453125, "learning_rate": 4.596347819278861e-05, "loss": 2.5869, "step": 257 }, { "epoch": 0.22553679724635306, "grad_norm": 0.92578125, "learning_rate": 4.5923950977681084e-05, "loss": 2.586, "step": 258 }, { "epoch": 0.22641097087909087, "grad_norm": 0.9296875, "learning_rate": 4.58842483407346e-05, "loss": 2.5124, "step": 259 }, { "epoch": 0.22728514451182866, "grad_norm": 0.88671875, "learning_rate": 4.584437061480739e-05, "loss": 2.5364, "step": 260 }, { "epoch": 0.22815931814456647, "grad_norm": 0.93359375, "learning_rate": 4.58043181342256e-05, "loss": 2.5939, "step": 261 }, { "epoch": 0.22903349177730425, "grad_norm": 0.96484375, "learning_rate": 4.5764091234780504e-05, "loss": 2.5893, "step": 262 }, { "epoch": 0.22990766541004207, "grad_norm": 0.9375, "learning_rate": 4.572369025372564e-05, "loss": 2.5496, "step": 263 }, { "epoch": 0.23078183904277988, "grad_norm": 0.95703125, "learning_rate": 4.568311552977401e-05, "loss": 2.6138, "step": 264 }, { "epoch": 0.23165601267551766, "grad_norm": 0.90625, "learning_rate": 4.564236740309525e-05, "loss": 2.5724, "step": 265 }, { "epoch": 0.23253018630825548, "grad_norm": 0.8984375, "learning_rate": 4.560144621531278e-05, "loss": 2.5762, "step": 266 }, { "epoch": 0.2334043599409933, "grad_norm": 1.046875, "learning_rate": 4.5560352309500886e-05, "loss": 2.5781, "step": 267 }, { "epoch": 0.23427853357373107, "grad_norm": 0.90625, "learning_rate": 4.551908603018191e-05, "loss": 2.606, "step": 268 }, { "epoch": 0.2351527072064689, "grad_norm": 1.0234375, "learning_rate": 4.547764772332333e-05, "loss": 2.589, "step": 269 }, { "epoch": 0.2360268808392067, "grad_norm": 0.91015625, "learning_rate": 4.5436037736334894e-05, "loss": 2.6229, "step": 270 }, { "epoch": 0.23690105447194448, "grad_norm": 0.91796875, "learning_rate": 4.539425641806562e-05, "loss": 2.5875, "step": 271 }, { "epoch": 0.2377752281046823, "grad_norm": 0.875, "learning_rate": 4.535230411880098e-05, "loss": 2.6023, "step": 272 }, { "epoch": 0.2386494017374201, "grad_norm": 0.8828125, "learning_rate": 4.531018119025989e-05, "loss": 2.5965, "step": 273 }, { "epoch": 0.2395235753701579, "grad_norm": 0.953125, "learning_rate": 4.5267887985591795e-05, "loss": 2.5359, "step": 274 }, { "epoch": 0.2403977490028957, "grad_norm": 0.9140625, "learning_rate": 4.522542485937369e-05, "loss": 2.5603, "step": 275 }, { "epoch": 0.2412719226356335, "grad_norm": 0.8515625, "learning_rate": 4.5182792167607155e-05, "loss": 2.6296, "step": 276 }, { "epoch": 0.2421460962683713, "grad_norm": 0.91015625, "learning_rate": 4.513999026771539e-05, "loss": 2.5896, "step": 277 }, { "epoch": 0.24302026990110911, "grad_norm": 0.8671875, "learning_rate": 4.509701951854017e-05, "loss": 2.5494, "step": 278 }, { "epoch": 0.2438944435338469, "grad_norm": 1.1484375, "learning_rate": 4.505388028033888e-05, "loss": 2.6256, "step": 279 }, { "epoch": 0.2447686171665847, "grad_norm": 0.88671875, "learning_rate": 4.501057291478149e-05, "loss": 2.6245, "step": 280 
}, { "epoch": 0.24564279079932252, "grad_norm": 0.94140625, "learning_rate": 4.496709778494749e-05, "loss": 2.5308, "step": 281 }, { "epoch": 0.2465169644320603, "grad_norm": 0.8828125, "learning_rate": 4.492345525532288e-05, "loss": 2.6629, "step": 282 }, { "epoch": 0.24739113806479812, "grad_norm": 0.95703125, "learning_rate": 4.487964569179711e-05, "loss": 2.4932, "step": 283 }, { "epoch": 0.24826531169753593, "grad_norm": 0.94921875, "learning_rate": 4.4835669461660004e-05, "loss": 2.5798, "step": 284 }, { "epoch": 0.24913948533027372, "grad_norm": 0.9375, "learning_rate": 4.479152693359868e-05, "loss": 2.6232, "step": 285 }, { "epoch": 0.25001365896301153, "grad_norm": 0.94921875, "learning_rate": 4.474721847769445e-05, "loss": 2.5524, "step": 286 }, { "epoch": 0.2508878325957493, "grad_norm": 0.9375, "learning_rate": 4.4702744465419744e-05, "loss": 2.6093, "step": 287 }, { "epoch": 0.25176200622848716, "grad_norm": 1.265625, "learning_rate": 4.465810526963499e-05, "loss": 2.5971, "step": 288 }, { "epoch": 0.25263617986122494, "grad_norm": 0.92578125, "learning_rate": 4.461330126458544e-05, "loss": 2.529, "step": 289 }, { "epoch": 0.2535103534939627, "grad_norm": 0.90234375, "learning_rate": 4.4568332825898105e-05, "loss": 2.5475, "step": 290 }, { "epoch": 0.25438452712670057, "grad_norm": 0.9140625, "learning_rate": 4.452320033057856e-05, "loss": 2.5431, "step": 291 }, { "epoch": 0.25525870075943835, "grad_norm": 0.96875, "learning_rate": 4.447790415700781e-05, "loss": 2.5771, "step": 292 }, { "epoch": 0.25613287439217614, "grad_norm": 0.890625, "learning_rate": 4.4432444684939077e-05, "loss": 2.6166, "step": 293 }, { "epoch": 0.257007048024914, "grad_norm": 0.9765625, "learning_rate": 4.438682229549466e-05, "loss": 2.5507, "step": 294 }, { "epoch": 0.25788122165765176, "grad_norm": 0.90625, "learning_rate": 4.434103737116272e-05, "loss": 2.5351, "step": 295 }, { "epoch": 0.25875539529038954, "grad_norm": 0.90625, "learning_rate": 4.429509029579405e-05, "loss": 2.6678, "step": 296 }, { "epoch": 0.25962956892312733, "grad_norm": 0.9765625, "learning_rate": 4.4248981454598935e-05, "loss": 2.5859, "step": 297 }, { "epoch": 0.26050374255586517, "grad_norm": 0.8984375, "learning_rate": 4.420271123414381e-05, "loss": 2.5215, "step": 298 }, { "epoch": 0.26137791618860295, "grad_norm": 0.8515625, "learning_rate": 4.415628002234812e-05, "loss": 2.5394, "step": 299 }, { "epoch": 0.26225208982134074, "grad_norm": 0.9765625, "learning_rate": 4.4109688208481015e-05, "loss": 2.6149, "step": 300 }, { "epoch": 0.2631262634540786, "grad_norm": 0.921875, "learning_rate": 4.406293618315809e-05, "loss": 2.5216, "step": 301 }, { "epoch": 0.26400043708681636, "grad_norm": 0.875, "learning_rate": 4.4016024338338114e-05, "loss": 2.5536, "step": 302 }, { "epoch": 0.26487461071955415, "grad_norm": 0.88671875, "learning_rate": 4.3968953067319777e-05, "loss": 2.5415, "step": 303 }, { "epoch": 0.265748784352292, "grad_norm": 0.87890625, "learning_rate": 4.3921722764738326e-05, "loss": 2.5575, "step": 304 }, { "epoch": 0.2666229579850298, "grad_norm": 0.86328125, "learning_rate": 4.387433382656232e-05, "loss": 2.4776, "step": 305 }, { "epoch": 0.26749713161776756, "grad_norm": 0.91015625, "learning_rate": 4.382678665009028e-05, "loss": 2.5806, "step": 306 }, { "epoch": 0.2683713052505054, "grad_norm": 0.84765625, "learning_rate": 4.377908163394734e-05, "loss": 2.5854, "step": 307 }, { "epoch": 0.2692454788832432, "grad_norm": 0.87109375, "learning_rate": 4.373121917808196e-05, "loss": 2.5241, "step": 308 }, { 
"epoch": 0.27011965251598097, "grad_norm": 0.91015625, "learning_rate": 4.368319968376253e-05, "loss": 2.4803, "step": 309 }, { "epoch": 0.2709938261487188, "grad_norm": 0.84765625, "learning_rate": 4.363502355357399e-05, "loss": 2.5509, "step": 310 }, { "epoch": 0.2718679997814566, "grad_norm": 0.8671875, "learning_rate": 4.358669119141453e-05, "loss": 2.5421, "step": 311 }, { "epoch": 0.2727421734141944, "grad_norm": 0.88671875, "learning_rate": 4.3538203002492104e-05, "loss": 2.5374, "step": 312 }, { "epoch": 0.2736163470469322, "grad_norm": 0.85546875, "learning_rate": 4.348955939332111e-05, "loss": 2.5822, "step": 313 }, { "epoch": 0.27449052067967, "grad_norm": 0.87890625, "learning_rate": 4.344076077171897e-05, "loss": 2.5644, "step": 314 }, { "epoch": 0.2753646943124078, "grad_norm": 0.86328125, "learning_rate": 4.339180754680267e-05, "loss": 2.6278, "step": 315 }, { "epoch": 0.2762388679451456, "grad_norm": 0.86328125, "learning_rate": 4.3342700128985345e-05, "loss": 2.577, "step": 316 }, { "epoch": 0.2771130415778834, "grad_norm": 0.84765625, "learning_rate": 4.3293438929972894e-05, "loss": 2.5167, "step": 317 }, { "epoch": 0.2779872152106212, "grad_norm": 0.8515625, "learning_rate": 4.324402436276046e-05, "loss": 2.5297, "step": 318 }, { "epoch": 0.27886138884335904, "grad_norm": 0.890625, "learning_rate": 4.319445684162897e-05, "loss": 2.58, "step": 319 }, { "epoch": 0.2797355624760968, "grad_norm": 0.921875, "learning_rate": 4.3144736782141725e-05, "loss": 2.5503, "step": 320 }, { "epoch": 0.2806097361088346, "grad_norm": 0.8984375, "learning_rate": 4.309486460114085e-05, "loss": 2.4978, "step": 321 }, { "epoch": 0.28148390974157245, "grad_norm": 0.9140625, "learning_rate": 4.3044840716743824e-05, "loss": 2.5319, "step": 322 }, { "epoch": 0.28235808337431023, "grad_norm": 0.890625, "learning_rate": 4.299466554833997e-05, "loss": 2.5353, "step": 323 }, { "epoch": 0.283232257007048, "grad_norm": 0.87890625, "learning_rate": 4.294433951658697e-05, "loss": 2.6071, "step": 324 }, { "epoch": 0.2841064306397858, "grad_norm": 0.9375, "learning_rate": 4.289386304340727e-05, "loss": 2.6526, "step": 325 }, { "epoch": 0.28498060427252364, "grad_norm": 0.86328125, "learning_rate": 4.284323655198462e-05, "loss": 2.553, "step": 326 }, { "epoch": 0.2858547779052614, "grad_norm": 0.9296875, "learning_rate": 4.2792460466760485e-05, "loss": 2.5924, "step": 327 }, { "epoch": 0.2867289515379992, "grad_norm": 0.91796875, "learning_rate": 4.274153521343046e-05, "loss": 2.5093, "step": 328 }, { "epoch": 0.28760312517073705, "grad_norm": 0.87890625, "learning_rate": 4.269046121894077e-05, "loss": 2.5962, "step": 329 }, { "epoch": 0.28847729880347484, "grad_norm": 0.89453125, "learning_rate": 4.2639238911484633e-05, "loss": 2.5287, "step": 330 }, { "epoch": 0.2893514724362126, "grad_norm": 0.85546875, "learning_rate": 4.2587868720498705e-05, "loss": 2.5151, "step": 331 }, { "epoch": 0.29022564606895046, "grad_norm": 0.890625, "learning_rate": 4.253635107665945e-05, "loss": 2.5844, "step": 332 }, { "epoch": 0.29109981970168824, "grad_norm": 0.83203125, "learning_rate": 4.2484686411879554e-05, "loss": 2.5545, "step": 333 }, { "epoch": 0.29197399333442603, "grad_norm": 0.84375, "learning_rate": 4.2432875159304295e-05, "loss": 2.5029, "step": 334 }, { "epoch": 0.29284816696716387, "grad_norm": 0.875, "learning_rate": 4.2380917753307904e-05, "loss": 2.5439, "step": 335 }, { "epoch": 0.29372234059990165, "grad_norm": 0.88671875, "learning_rate": 4.232881462948994e-05, "loss": 2.5714, "step": 336 }, { 
"epoch": 0.29459651423263944, "grad_norm": 0.8671875, "learning_rate": 4.227656622467162e-05, "loss": 2.5515, "step": 337 }, { "epoch": 0.2954706878653773, "grad_norm": 0.8828125, "learning_rate": 4.222417297689217e-05, "loss": 2.5615, "step": 338 }, { "epoch": 0.29634486149811506, "grad_norm": 0.9296875, "learning_rate": 4.217163532540514e-05, "loss": 2.57, "step": 339 }, { "epoch": 0.29721903513085285, "grad_norm": 0.8984375, "learning_rate": 4.211895371067474e-05, "loss": 2.5805, "step": 340 }, { "epoch": 0.2980932087635907, "grad_norm": 0.859375, "learning_rate": 4.206612857437213e-05, "loss": 2.6419, "step": 341 }, { "epoch": 0.2989673823963285, "grad_norm": 0.8828125, "learning_rate": 4.2013160359371736e-05, "loss": 2.5025, "step": 342 }, { "epoch": 0.29984155602906626, "grad_norm": 0.8515625, "learning_rate": 4.19600495097475e-05, "loss": 2.4513, "step": 343 }, { "epoch": 0.3007157296618041, "grad_norm": 0.9140625, "learning_rate": 4.1906796470769195e-05, "loss": 2.6036, "step": 344 }, { "epoch": 0.3015899032945419, "grad_norm": 0.90234375, "learning_rate": 4.185340168889868e-05, "loss": 2.5366, "step": 345 }, { "epoch": 0.30246407692727967, "grad_norm": 0.9453125, "learning_rate": 4.179986561178617e-05, "loss": 2.539, "step": 346 }, { "epoch": 0.3033382505600175, "grad_norm": 0.8828125, "learning_rate": 4.1746188688266444e-05, "loss": 2.5152, "step": 347 }, { "epoch": 0.3042124241927553, "grad_norm": 0.8828125, "learning_rate": 4.16923713683551e-05, "loss": 2.6098, "step": 348 }, { "epoch": 0.3050865978254931, "grad_norm": 0.890625, "learning_rate": 4.163841410324482e-05, "loss": 2.5229, "step": 349 }, { "epoch": 0.3059607714582309, "grad_norm": 0.87890625, "learning_rate": 4.158431734530154e-05, "loss": 2.5009, "step": 350 }, { "epoch": 0.3068349450909687, "grad_norm": 0.9140625, "learning_rate": 4.153008154806067e-05, "loss": 2.4947, "step": 351 }, { "epoch": 0.3077091187237065, "grad_norm": 0.953125, "learning_rate": 4.1475707166223296e-05, "loss": 2.5652, "step": 352 }, { "epoch": 0.30858329235644427, "grad_norm": 0.8828125, "learning_rate": 4.142119465565238e-05, "loss": 2.5643, "step": 353 }, { "epoch": 0.3094574659891821, "grad_norm": 0.953125, "learning_rate": 4.13665444733689e-05, "loss": 2.5575, "step": 354 }, { "epoch": 0.3103316396219199, "grad_norm": 0.8984375, "learning_rate": 4.131175707754807e-05, "loss": 2.4748, "step": 355 }, { "epoch": 0.3112058132546577, "grad_norm": 0.96875, "learning_rate": 4.125683292751546e-05, "loss": 2.53, "step": 356 }, { "epoch": 0.3120799868873955, "grad_norm": 0.875, "learning_rate": 4.120177248374315e-05, "loss": 2.5582, "step": 357 }, { "epoch": 0.3129541605201333, "grad_norm": 0.90625, "learning_rate": 4.114657620784589e-05, "loss": 2.5842, "step": 358 }, { "epoch": 0.3138283341528711, "grad_norm": 0.94140625, "learning_rate": 4.109124456257721e-05, "loss": 2.5279, "step": 359 }, { "epoch": 0.31470250778560893, "grad_norm": 0.90234375, "learning_rate": 4.103577801182557e-05, "loss": 2.5657, "step": 360 }, { "epoch": 0.3155766814183467, "grad_norm": 1.125, "learning_rate": 4.098017702061039e-05, "loss": 2.5622, "step": 361 }, { "epoch": 0.3164508550510845, "grad_norm": 0.88671875, "learning_rate": 4.0924442055078276e-05, "loss": 2.5328, "step": 362 }, { "epoch": 0.31732502868382234, "grad_norm": 0.94140625, "learning_rate": 4.0868573582499004e-05, "loss": 2.5514, "step": 363 }, { "epoch": 0.3181992023165601, "grad_norm": 0.9453125, "learning_rate": 4.0812572071261654e-05, "loss": 2.5575, "step": 364 }, { "epoch": 0.3190733759492979, 
"grad_norm": 0.89453125, "learning_rate": 4.07564379908707e-05, "loss": 2.5688, "step": 365 }, { "epoch": 0.31994754958203575, "grad_norm": 0.9140625, "learning_rate": 4.070017181194199e-05, "loss": 2.5032, "step": 366 }, { "epoch": 0.32082172321477354, "grad_norm": 0.86328125, "learning_rate": 4.0643774006198907e-05, "loss": 2.5319, "step": 367 }, { "epoch": 0.3216958968475113, "grad_norm": 0.94140625, "learning_rate": 4.058724504646834e-05, "loss": 2.5558, "step": 368 }, { "epoch": 0.32257007048024916, "grad_norm": 0.84375, "learning_rate": 4.053058540667676e-05, "loss": 2.5876, "step": 369 }, { "epoch": 0.32344424411298694, "grad_norm": 0.875, "learning_rate": 4.0473795561846215e-05, "loss": 2.5354, "step": 370 }, { "epoch": 0.32431841774572473, "grad_norm": 0.84375, "learning_rate": 4.0416875988090375e-05, "loss": 2.531, "step": 371 }, { "epoch": 0.32519259137846257, "grad_norm": 0.859375, "learning_rate": 4.035982716261053e-05, "loss": 2.5584, "step": 372 }, { "epoch": 0.32606676501120035, "grad_norm": 0.84765625, "learning_rate": 4.030264956369157e-05, "loss": 2.4785, "step": 373 }, { "epoch": 0.32694093864393814, "grad_norm": 0.90234375, "learning_rate": 4.0245343670698025e-05, "loss": 2.549, "step": 374 }, { "epoch": 0.327815112276676, "grad_norm": 0.83984375, "learning_rate": 4.018790996406998e-05, "loss": 2.4917, "step": 375 }, { "epoch": 0.32868928590941376, "grad_norm": 0.8671875, "learning_rate": 4.01303489253191e-05, "loss": 2.4882, "step": 376 }, { "epoch": 0.32956345954215155, "grad_norm": 0.8828125, "learning_rate": 4.0072661037024596e-05, "loss": 2.5832, "step": 377 }, { "epoch": 0.3304376331748894, "grad_norm": 0.87109375, "learning_rate": 4.0014846782829104e-05, "loss": 2.5667, "step": 378 }, { "epoch": 0.3313118068076272, "grad_norm": 0.8515625, "learning_rate": 3.9956906647434736e-05, "loss": 2.511, "step": 379 }, { "epoch": 0.33218598044036496, "grad_norm": 0.86328125, "learning_rate": 3.989884111659893e-05, "loss": 2.5146, "step": 380 }, { "epoch": 0.33306015407310274, "grad_norm": 0.8671875, "learning_rate": 3.984065067713043e-05, "loss": 2.4662, "step": 381 }, { "epoch": 0.3339343277058406, "grad_norm": 0.8671875, "learning_rate": 3.978233581688518e-05, "loss": 2.5807, "step": 382 }, { "epoch": 0.33480850133857837, "grad_norm": 0.8515625, "learning_rate": 3.9723897024762255e-05, "loss": 2.5095, "step": 383 }, { "epoch": 0.33568267497131615, "grad_norm": 0.85546875, "learning_rate": 3.9665334790699714e-05, "loss": 2.5084, "step": 384 }, { "epoch": 0.336556848604054, "grad_norm": 0.83984375, "learning_rate": 3.960664960567057e-05, "loss": 2.5447, "step": 385 }, { "epoch": 0.3374310222367918, "grad_norm": 0.875, "learning_rate": 3.95478419616786e-05, "loss": 2.5544, "step": 386 }, { "epoch": 0.33830519586952956, "grad_norm": 0.8203125, "learning_rate": 3.948891235175425e-05, "loss": 2.5338, "step": 387 }, { "epoch": 0.3391793695022674, "grad_norm": 0.84375, "learning_rate": 3.942986126995052e-05, "loss": 2.5239, "step": 388 }, { "epoch": 0.3400535431350052, "grad_norm": 0.86328125, "learning_rate": 3.937068921133879e-05, "loss": 2.5493, "step": 389 }, { "epoch": 0.34092771676774297, "grad_norm": 0.80859375, "learning_rate": 3.931139667200469e-05, "loss": 2.4874, "step": 390 }, { "epoch": 0.3418018904004808, "grad_norm": 0.83984375, "learning_rate": 3.9251984149043917e-05, "loss": 2.5066, "step": 391 }, { "epoch": 0.3426760640332186, "grad_norm": 0.8203125, "learning_rate": 3.919245214055812e-05, "loss": 2.5081, "step": 392 }, { "epoch": 0.3435502376659564, 
"grad_norm": 0.84375, "learning_rate": 3.913280114565066e-05, "loss": 2.5536, "step": 393 }, { "epoch": 0.3444244112986942, "grad_norm": 0.8828125, "learning_rate": 3.9073031664422444e-05, "loss": 2.5335, "step": 394 }, { "epoch": 0.345298584931432, "grad_norm": 0.83203125, "learning_rate": 3.901314419796778e-05, "loss": 2.4885, "step": 395 }, { "epoch": 0.3461727585641698, "grad_norm": 0.890625, "learning_rate": 3.8953139248370116e-05, "loss": 2.5373, "step": 396 }, { "epoch": 0.34704693219690763, "grad_norm": 0.8515625, "learning_rate": 3.889301731869784e-05, "loss": 2.563, "step": 397 }, { "epoch": 0.3479211058296454, "grad_norm": 0.8515625, "learning_rate": 3.883277891300011e-05, "loss": 2.5089, "step": 398 }, { "epoch": 0.3487952794623832, "grad_norm": 0.90234375, "learning_rate": 3.8772424536302564e-05, "loss": 2.5444, "step": 399 }, { "epoch": 0.34966945309512104, "grad_norm": 0.8359375, "learning_rate": 3.8711954694603126e-05, "loss": 2.4677, "step": 400 }, { "epoch": 0.3505436267278588, "grad_norm": 0.87890625, "learning_rate": 3.865136989486776e-05, "loss": 2.4907, "step": 401 }, { "epoch": 0.3514178003605966, "grad_norm": 0.8671875, "learning_rate": 3.8590670645026195e-05, "loss": 2.4889, "step": 402 }, { "epoch": 0.35229197399333445, "grad_norm": 0.9140625, "learning_rate": 3.85298574539677e-05, "loss": 2.5175, "step": 403 }, { "epoch": 0.35316614762607224, "grad_norm": 0.88671875, "learning_rate": 3.84689308315368e-05, "loss": 2.555, "step": 404 }, { "epoch": 0.35404032125881, "grad_norm": 0.96875, "learning_rate": 3.8407891288529004e-05, "loss": 2.4927, "step": 405 }, { "epoch": 0.35491449489154786, "grad_norm": 0.81640625, "learning_rate": 3.834673933668651e-05, "loss": 2.4928, "step": 406 }, { "epoch": 0.35578866852428565, "grad_norm": 0.86328125, "learning_rate": 3.828547548869396e-05, "loss": 2.5426, "step": 407 }, { "epoch": 0.35666284215702343, "grad_norm": 0.8828125, "learning_rate": 3.822410025817406e-05, "loss": 2.5477, "step": 408 }, { "epoch": 0.3575370157897612, "grad_norm": 0.890625, "learning_rate": 3.8162614159683374e-05, "loss": 2.5466, "step": 409 }, { "epoch": 0.35841118942249905, "grad_norm": 0.8671875, "learning_rate": 3.8101017708707906e-05, "loss": 2.5304, "step": 410 }, { "epoch": 0.35928536305523684, "grad_norm": 0.91015625, "learning_rate": 3.8039311421658887e-05, "loss": 2.556, "step": 411 }, { "epoch": 0.3601595366879746, "grad_norm": 0.8984375, "learning_rate": 3.797749581586835e-05, "loss": 2.5913, "step": 412 }, { "epoch": 0.36103371032071246, "grad_norm": 0.87109375, "learning_rate": 3.7915571409584836e-05, "loss": 2.5172, "step": 413 }, { "epoch": 0.36190788395345025, "grad_norm": 0.88671875, "learning_rate": 3.7853538721969064e-05, "loss": 2.4756, "step": 414 }, { "epoch": 0.36278205758618803, "grad_norm": 0.8359375, "learning_rate": 3.779139827308956e-05, "loss": 2.5278, "step": 415 }, { "epoch": 0.3636562312189259, "grad_norm": 0.8671875, "learning_rate": 3.7729150583918264e-05, "loss": 2.4925, "step": 416 }, { "epoch": 0.36453040485166366, "grad_norm": 0.90625, "learning_rate": 3.766679617632624e-05, "loss": 2.5038, "step": 417 }, { "epoch": 0.36540457848440144, "grad_norm": 0.8203125, "learning_rate": 3.760433557307922e-05, "loss": 2.518, "step": 418 }, { "epoch": 0.3662787521171393, "grad_norm": 0.83984375, "learning_rate": 3.754176929783327e-05, "loss": 2.554, "step": 419 }, { "epoch": 0.36715292574987707, "grad_norm": 0.859375, "learning_rate": 3.74790978751304e-05, "loss": 2.5062, "step": 420 }, { "epoch": 0.36802709938261485, 
"grad_norm": 0.8828125, "learning_rate": 3.7416321830394144e-05, "loss": 2.5755, "step": 421 }, { "epoch": 0.3689012730153527, "grad_norm": 0.828125, "learning_rate": 3.735344168992515e-05, "loss": 2.5203, "step": 422 }, { "epoch": 0.3697754466480905, "grad_norm": 0.859375, "learning_rate": 3.7290457980896795e-05, "loss": 2.4996, "step": 423 }, { "epoch": 0.37064962028082826, "grad_norm": 0.859375, "learning_rate": 3.722737123135075e-05, "loss": 2.5625, "step": 424 }, { "epoch": 0.3715237939135661, "grad_norm": 0.8359375, "learning_rate": 3.716418197019257e-05, "loss": 2.5665, "step": 425 }, { "epoch": 0.3723979675463039, "grad_norm": 0.8515625, "learning_rate": 3.710089072718722e-05, "loss": 2.5188, "step": 426 }, { "epoch": 0.37327214117904167, "grad_norm": 0.84375, "learning_rate": 3.7037498032954664e-05, "loss": 2.5166, "step": 427 }, { "epoch": 0.3741463148117795, "grad_norm": 0.87890625, "learning_rate": 3.697400441896543e-05, "loss": 2.5166, "step": 428 }, { "epoch": 0.3750204884445173, "grad_norm": 0.8515625, "learning_rate": 3.691041041753613e-05, "loss": 2.5436, "step": 429 }, { "epoch": 0.3758946620772551, "grad_norm": 0.83203125, "learning_rate": 3.6846716561824965e-05, "loss": 2.5019, "step": 430 }, { "epoch": 0.3767688357099929, "grad_norm": 0.8515625, "learning_rate": 3.678292338582735e-05, "loss": 2.5575, "step": 431 }, { "epoch": 0.3776430093427307, "grad_norm": 0.8046875, "learning_rate": 3.671903142437134e-05, "loss": 2.5161, "step": 432 }, { "epoch": 0.3785171829754685, "grad_norm": 0.84765625, "learning_rate": 3.6655041213113184e-05, "loss": 2.5285, "step": 433 }, { "epoch": 0.37939135660820633, "grad_norm": 0.8671875, "learning_rate": 3.659095328853288e-05, "loss": 2.4936, "step": 434 }, { "epoch": 0.3802655302409441, "grad_norm": 0.85546875, "learning_rate": 3.652676818792958e-05, "loss": 2.5238, "step": 435 }, { "epoch": 0.3811397038736819, "grad_norm": 0.8515625, "learning_rate": 3.646248644941716e-05, "loss": 2.4821, "step": 436 }, { "epoch": 0.3820138775064197, "grad_norm": 0.87109375, "learning_rate": 3.6398108611919696e-05, "loss": 2.5309, "step": 437 }, { "epoch": 0.3828880511391575, "grad_norm": 0.83203125, "learning_rate": 3.633363521516693e-05, "loss": 2.508, "step": 438 }, { "epoch": 0.3837622247718953, "grad_norm": 0.84765625, "learning_rate": 3.626906679968974e-05, "loss": 2.5292, "step": 439 }, { "epoch": 0.3846363984046331, "grad_norm": 0.83203125, "learning_rate": 3.6204403906815655e-05, "loss": 2.5175, "step": 440 }, { "epoch": 0.38551057203737094, "grad_norm": 0.890625, "learning_rate": 3.613964707866424e-05, "loss": 2.5478, "step": 441 }, { "epoch": 0.3863847456701087, "grad_norm": 0.8359375, "learning_rate": 3.607479685814261e-05, "loss": 2.5442, "step": 442 }, { "epoch": 0.3872589193028465, "grad_norm": 0.90234375, "learning_rate": 3.600985378894086e-05, "loss": 2.5198, "step": 443 }, { "epoch": 0.38813309293558435, "grad_norm": 0.94921875, "learning_rate": 3.594481841552753e-05, "loss": 2.5001, "step": 444 }, { "epoch": 0.38900726656832213, "grad_norm": 0.8671875, "learning_rate": 3.5879691283144964e-05, "loss": 2.53, "step": 445 }, { "epoch": 0.3898814402010599, "grad_norm": 0.859375, "learning_rate": 3.5814472937804865e-05, "loss": 2.5589, "step": 446 }, { "epoch": 0.39075561383379775, "grad_norm": 0.86328125, "learning_rate": 3.574916392628359e-05, "loss": 2.5402, "step": 447 }, { "epoch": 0.39162978746653554, "grad_norm": 0.85546875, "learning_rate": 3.5683764796117634e-05, "loss": 2.48, "step": 448 }, { "epoch": 0.3925039610992733, 
"grad_norm": 0.89453125, "learning_rate": 3.561827609559905e-05, "loss": 2.5504, "step": 449 }, { "epoch": 0.39337813473201116, "grad_norm": 0.8359375, "learning_rate": 3.55526983737708e-05, "loss": 2.5011, "step": 450 }, { "epoch": 0.39425230836474895, "grad_norm": 0.8984375, "learning_rate": 3.54870321804222e-05, "loss": 2.4815, "step": 451 }, { "epoch": 0.39512648199748673, "grad_norm": 0.85546875, "learning_rate": 3.5421278066084276e-05, "loss": 2.537, "step": 452 }, { "epoch": 0.3960006556302246, "grad_norm": 0.875, "learning_rate": 3.535543658202518e-05, "loss": 2.5111, "step": 453 }, { "epoch": 0.39687482926296236, "grad_norm": 0.84375, "learning_rate": 3.528950828024555e-05, "loss": 2.4883, "step": 454 }, { "epoch": 0.39774900289570014, "grad_norm": 0.828125, "learning_rate": 3.522349371347387e-05, "loss": 2.4712, "step": 455 }, { "epoch": 0.398623176528438, "grad_norm": 0.84375, "learning_rate": 3.515739343516188e-05, "loss": 2.4872, "step": 456 }, { "epoch": 0.39949735016117577, "grad_norm": 0.8203125, "learning_rate": 3.509120799947987e-05, "loss": 2.5711, "step": 457 }, { "epoch": 0.40037152379391355, "grad_norm": 0.828125, "learning_rate": 3.50249379613121e-05, "loss": 2.5285, "step": 458 }, { "epoch": 0.4012456974266514, "grad_norm": 0.84765625, "learning_rate": 3.49585838762521e-05, "loss": 2.5139, "step": 459 }, { "epoch": 0.4021198710593892, "grad_norm": 0.8046875, "learning_rate": 3.489214630059806e-05, "loss": 2.5236, "step": 460 }, { "epoch": 0.40299404469212696, "grad_norm": 0.81640625, "learning_rate": 3.4825625791348096e-05, "loss": 2.5336, "step": 461 }, { "epoch": 0.4038682183248648, "grad_norm": 0.8515625, "learning_rate": 3.475902290619565e-05, "loss": 2.4917, "step": 462 }, { "epoch": 0.4047423919576026, "grad_norm": 0.8359375, "learning_rate": 3.469233820352477e-05, "loss": 2.5423, "step": 463 }, { "epoch": 0.40561656559034037, "grad_norm": 0.85546875, "learning_rate": 3.462557224240545e-05, "loss": 2.4924, "step": 464 }, { "epoch": 0.40649073922307816, "grad_norm": 0.82421875, "learning_rate": 3.455872558258895e-05, "loss": 2.5107, "step": 465 }, { "epoch": 0.407364912855816, "grad_norm": 0.84375, "learning_rate": 3.449179878450308e-05, "loss": 2.5197, "step": 466 }, { "epoch": 0.4082390864885538, "grad_norm": 0.8203125, "learning_rate": 3.442479240924749e-05, "loss": 2.4901, "step": 467 }, { "epoch": 0.40911326012129157, "grad_norm": 0.83203125, "learning_rate": 3.4357707018589036e-05, "loss": 2.4912, "step": 468 }, { "epoch": 0.4099874337540294, "grad_norm": 0.83984375, "learning_rate": 3.429054317495697e-05, "loss": 2.4534, "step": 469 }, { "epoch": 0.4108616073867672, "grad_norm": 0.85546875, "learning_rate": 3.4223301441438306e-05, "loss": 2.4801, "step": 470 }, { "epoch": 0.411735781019505, "grad_norm": 0.85546875, "learning_rate": 3.415598238177307e-05, "loss": 2.4984, "step": 471 }, { "epoch": 0.4126099546522428, "grad_norm": 0.8203125, "learning_rate": 3.408858656034957e-05, "loss": 2.5402, "step": 472 }, { "epoch": 0.4134841282849806, "grad_norm": 0.859375, "learning_rate": 3.4021114542199664e-05, "loss": 2.5232, "step": 473 }, { "epoch": 0.4143583019177184, "grad_norm": 0.83203125, "learning_rate": 3.395356689299401e-05, "loss": 2.5168, "step": 474 }, { "epoch": 0.4152324755504562, "grad_norm": 0.86328125, "learning_rate": 3.3885944179037395e-05, "loss": 2.5563, "step": 475 }, { "epoch": 0.416106649183194, "grad_norm": 0.8203125, "learning_rate": 3.381824696726386e-05, "loss": 2.5104, "step": 476 }, { "epoch": 0.4169808228159318, "grad_norm": 
0.81640625, "learning_rate": 3.3750475825232074e-05, "loss": 2.5002, "step": 477 }, { "epoch": 0.41785499644866964, "grad_norm": 0.828125, "learning_rate": 3.3682631321120504e-05, "loss": 2.5262, "step": 478 }, { "epoch": 0.4187291700814074, "grad_norm": 0.859375, "learning_rate": 3.361471402372267e-05, "loss": 2.5159, "step": 479 }, { "epoch": 0.4196033437141452, "grad_norm": 0.84375, "learning_rate": 3.3546724502442354e-05, "loss": 2.455, "step": 480 }, { "epoch": 0.42047751734688305, "grad_norm": 0.796875, "learning_rate": 3.347866332728889e-05, "loss": 2.4299, "step": 481 }, { "epoch": 0.42135169097962083, "grad_norm": 0.90625, "learning_rate": 3.341053106887229e-05, "loss": 2.5159, "step": 482 }, { "epoch": 0.4222258646123586, "grad_norm": 0.84765625, "learning_rate": 3.3342328298398565e-05, "loss": 2.4763, "step": 483 }, { "epoch": 0.42310003824509645, "grad_norm": 0.83984375, "learning_rate": 3.3274055587664856e-05, "loss": 2.4768, "step": 484 }, { "epoch": 0.42397421187783424, "grad_norm": 0.84765625, "learning_rate": 3.320571350905466e-05, "loss": 2.5295, "step": 485 }, { "epoch": 0.424848385510572, "grad_norm": 0.83984375, "learning_rate": 3.313730263553306e-05, "loss": 2.4913, "step": 486 }, { "epoch": 0.42572255914330986, "grad_norm": 0.8515625, "learning_rate": 3.3068823540641886e-05, "loss": 2.5096, "step": 487 }, { "epoch": 0.42659673277604765, "grad_norm": 0.8359375, "learning_rate": 3.300027679849492e-05, "loss": 2.5255, "step": 488 }, { "epoch": 0.42747090640878543, "grad_norm": 0.84375, "learning_rate": 3.2931662983773106e-05, "loss": 2.4564, "step": 489 }, { "epoch": 0.4283450800415233, "grad_norm": 0.85546875, "learning_rate": 3.286298267171969e-05, "loss": 2.5294, "step": 490 }, { "epoch": 0.42921925367426106, "grad_norm": 0.84375, "learning_rate": 3.2794236438135405e-05, "loss": 2.5117, "step": 491 }, { "epoch": 0.43009342730699884, "grad_norm": 0.9375, "learning_rate": 3.272542485937369e-05, "loss": 2.4564, "step": 492 }, { "epoch": 0.43096760093973663, "grad_norm": 0.828125, "learning_rate": 3.265654851233579e-05, "loss": 2.4361, "step": 493 }, { "epoch": 0.43184177457247447, "grad_norm": 0.8125, "learning_rate": 3.258760797446598e-05, "loss": 2.5215, "step": 494 }, { "epoch": 0.43271594820521225, "grad_norm": 0.8125, "learning_rate": 3.251860382374668e-05, "loss": 2.4979, "step": 495 }, { "epoch": 0.43359012183795004, "grad_norm": 0.8203125, "learning_rate": 3.244953663869365e-05, "loss": 2.5005, "step": 496 }, { "epoch": 0.4344642954706879, "grad_norm": 0.84375, "learning_rate": 3.238040699835106e-05, "loss": 2.5365, "step": 497 }, { "epoch": 0.43533846910342566, "grad_norm": 0.82421875, "learning_rate": 3.231121548228676e-05, "loss": 2.5102, "step": 498 }, { "epoch": 0.43621264273616345, "grad_norm": 0.82421875, "learning_rate": 3.2241962670587314e-05, "loss": 2.4999, "step": 499 }, { "epoch": 0.4370868163689013, "grad_norm": 0.83984375, "learning_rate": 3.2172649143853176e-05, "loss": 2.4631, "step": 500 } ], "logging_steps": 1, "max_steps": 1143, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 4.571566411087872e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }