{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.75, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "Batch Mean": 0.0007428824901580811, "accuracy": 0.546875, "epoch": 0, "step": 0 }, { "epoch": 0.0025, "grad_norm": 3.3472559452056885, "learning_rate": 1.5000000000000002e-07, "loss": 0.6932, "step": 1 }, { "Batch Mean": -0.007137291133403778, "accuracy": 0.4921875, "epoch": 0.0025, "step": 1 }, { "epoch": 0.005, "grad_norm": 2.879757881164551, "learning_rate": 3.0000000000000004e-07, "loss": 0.6937, "step": 2 }, { "Batch Mean": -0.0009354054927825928, "accuracy": 0.5078125, "epoch": 0.005, "step": 2 }, { "epoch": 0.0075, "grad_norm": 2.630247116088867, "learning_rate": 4.5e-07, "loss": 0.6957, "step": 3 }, { "Batch Mean": 0.024937540292739868, "accuracy": 0.546875, "epoch": 0.0075, "step": 3 }, { "epoch": 0.01, "grad_norm": 3.5664515495300293, "learning_rate": 6.000000000000001e-07, "loss": 0.699, "step": 4 }, { "Batch Mean": 0.02523682825267315, "accuracy": 0.484375, "epoch": 0.01, "step": 4 }, { "epoch": 0.0125, "grad_norm": 3.7211363315582275, "learning_rate": 7.5e-07, "loss": 0.6939, "step": 5 }, { "Batch Mean": 0.021557003259658813, "accuracy": 0.4609375, "epoch": 0.0125, "step": 5 }, { "epoch": 0.015, "grad_norm": 3.728484869003296, "learning_rate": 9e-07, "loss": 0.7002, "step": 6 }, { "Batch Mean": 0.04629041254520416, "accuracy": 0.453125, "epoch": 0.015, "step": 6 }, { "epoch": 0.0175, "grad_norm": 3.8765816688537598, "learning_rate": 1.05e-06, "loss": 0.7043, "step": 7 }, { "Batch Mean": 0.039198338985443115, "accuracy": 0.4921875, "epoch": 0.0175, "step": 7 }, { "epoch": 0.02, "grad_norm": 3.4694457054138184, "learning_rate": 1.2000000000000002e-06, "loss": 0.6997, "step": 8 }, { "Batch Mean": 0.0921606719493866, "accuracy": 0.4765625, "epoch": 0.02, "step": 8 }, { "epoch": 0.0225, "grad_norm": 3.169879198074341, "learning_rate": 1.35e-06, "loss": 0.6916, "step": 9 }, { "Batch Mean": 0.11305281519889832, "accuracy": 0.5625, "epoch": 0.0225, "step": 9 }, { "epoch": 0.025, "grad_norm": 2.858436346054077, "learning_rate": 1.5e-06, "loss": 0.688, "step": 10 }, { "Batch Mean": 0.11914896965026855, "accuracy": 0.6015625, "epoch": 0.025, "step": 10 }, { "epoch": 0.0275, "grad_norm": 3.2789862155914307, "learning_rate": 1.65e-06, "loss": 0.6752, "step": 11 }, { "Batch Mean": 0.16257084906101227, "accuracy": 0.5625, "epoch": 0.0275, "step": 11 }, { "epoch": 0.03, "grad_norm": 3.2884113788604736, "learning_rate": 1.8e-06, "loss": 0.6824, "step": 12 }, { "Batch Mean": 0.2741994559764862, "accuracy": 0.6171875, "epoch": 0.03, "step": 12 }, { "epoch": 0.0325, "grad_norm": 4.308699607849121, "learning_rate": 1.95e-06, "loss": 0.6527, "step": 13 }, { "Batch Mean": 0.24053972959518433, "accuracy": 0.6484375, "epoch": 0.0325, "step": 13 }, { "epoch": 0.035, "grad_norm": 4.072883605957031, "learning_rate": 2.1e-06, "loss": 0.6528, "step": 14 }, { "Batch Mean": 0.14770203828811646, "accuracy": 0.609375, "epoch": 0.035, "step": 14 }, { "epoch": 0.0375, "grad_norm": 4.621586322784424, "learning_rate": 2.25e-06, "loss": 0.667, "step": 15 }, { "Batch Mean": -0.06273385882377625, "accuracy": 0.59375, "epoch": 0.0375, "step": 15 }, { "epoch": 0.04, "grad_norm": 3.8247880935668945, "learning_rate": 2.4000000000000003e-06, "loss": 0.6638, "step": 16 }, { "Batch Mean": -0.37021708488464355, "accuracy": 0.671875, "epoch": 0.04, "step": 16 }, { "epoch": 0.0425, "grad_norm": 6.02881383895874, "learning_rate": 2.55e-06, "loss": 0.6292, "step": 17 }, { "Batch Mean": -0.36325013637542725, "accuracy": 0.609375, "epoch": 0.0425, "step": 17 }, { "epoch": 0.045, "grad_norm": 6.033018589019775, "learning_rate": 2.7e-06, "loss": 0.6302, "step": 18 }, { "Batch Mean": 0.009652853012084961, "accuracy": 0.65625, "epoch": 0.045, "step": 18 }, { "epoch": 0.0475, "grad_norm": 11.042936325073242, "learning_rate": 2.85e-06, "loss": 0.6487, "step": 19 }, { "Batch Mean": 0.3796501159667969, "accuracy": 0.6640625, "epoch": 0.0475, "step": 19 }, { "epoch": 0.05, "grad_norm": 8.643921852111816, "learning_rate": 3e-06, "loss": 0.6373, "step": 20 }, { "Batch Mean": 0.18232779204845428, "accuracy": 0.671875, "epoch": 0.05, "step": 20 }, { "epoch": 0.0525, "grad_norm": 6.826192855834961, "learning_rate": 2.992105263157895e-06, "loss": 0.6009, "step": 21 }, { "Batch Mean": -0.32630038261413574, "accuracy": 0.6640625, "epoch": 0.0525, "step": 21 }, { "epoch": 0.055, "grad_norm": 5.449484825134277, "learning_rate": 2.9842105263157896e-06, "loss": 0.5894, "step": 22 }, { "Batch Mean": -0.34779179096221924, "accuracy": 0.6640625, "epoch": 0.055, "step": 22 }, { "epoch": 0.0575, "grad_norm": 7.103095054626465, "learning_rate": 2.9763157894736843e-06, "loss": 0.6234, "step": 23 }, { "Batch Mean": -0.1863919347524643, "accuracy": 0.734375, "epoch": 0.0575, "step": 23 }, { "epoch": 0.06, "grad_norm": 4.805452823638916, "learning_rate": 2.968421052631579e-06, "loss": 0.539, "step": 24 }, { "Batch Mean": 0.22394847869873047, "accuracy": 0.625, "epoch": 0.06, "step": 24 }, { "epoch": 0.0625, "grad_norm": 6.5976362228393555, "learning_rate": 2.960526315789474e-06, "loss": 0.627, "step": 25 }, { "Batch Mean": 0.4492683410644531, "accuracy": 0.6953125, "epoch": 0.0625, "step": 25 }, { "epoch": 0.065, "grad_norm": 7.0352911949157715, "learning_rate": 2.9526315789473685e-06, "loss": 0.528, "step": 26 }, { "Batch Mean": 0.3814358115196228, "accuracy": 0.6484375, "epoch": 0.065, "step": 26 }, { "epoch": 0.0675, "grad_norm": 6.645840167999268, "learning_rate": 2.9447368421052633e-06, "loss": 0.5545, "step": 27 }, { "Batch Mean": -0.2906806468963623, "accuracy": 0.671875, "epoch": 0.0675, "step": 27 }, { "epoch": 0.07, "grad_norm": 6.070335388183594, "learning_rate": 2.936842105263158e-06, "loss": 0.5584, "step": 28 }, { "Batch Mean": -0.3805738687515259, "accuracy": 0.6328125, "epoch": 0.07, "step": 28 }, { "epoch": 0.0725, "grad_norm": 9.93401050567627, "learning_rate": 2.9289473684210528e-06, "loss": 0.7077, "step": 29 }, { "Batch Mean": -0.2810482680797577, "accuracy": 0.7109375, "epoch": 0.0725, "step": 29 }, { "epoch": 0.075, "grad_norm": 7.469810962677002, "learning_rate": 2.9210526315789475e-06, "loss": 0.6311, "step": 30 }, { "Batch Mean": 0.0018663406372070312, "accuracy": 0.75, "epoch": 0.075, "step": 30 }, { "epoch": 0.0775, "grad_norm": 5.728447914123535, "learning_rate": 2.9131578947368423e-06, "loss": 0.4953, "step": 31 }, { "Batch Mean": 0.4803805351257324, "accuracy": 0.6875, "epoch": 0.0775, "step": 31 }, { "epoch": 0.08, "grad_norm": 6.646997928619385, "learning_rate": 2.905263157894737e-06, "loss": 0.557, "step": 32 }, { "Batch Mean": 0.15735602378845215, "accuracy": 0.7734375, "epoch": 0.08, "step": 32 }, { "epoch": 0.0825, "grad_norm": 7.232790946960449, "learning_rate": 2.8973684210526318e-06, "loss": 0.5356, "step": 33 }, { "Batch Mean": 0.01392507553100586, "accuracy": 0.7421875, "epoch": 0.0825, "step": 33 }, { "epoch": 0.085, "grad_norm": 7.317196369171143, "learning_rate": 2.8894736842105265e-06, "loss": 0.5318, "step": 34 }, { "Batch Mean": -0.19668865203857422, "accuracy": 0.671875, "epoch": 0.085, "step": 34 }, { "epoch": 0.0875, "grad_norm": 7.3725972175598145, "learning_rate": 2.8815789473684213e-06, "loss": 0.5664, "step": 35 }, { "Batch Mean": -0.04311776161193848, "accuracy": 0.703125, "epoch": 0.0875, "step": 35 }, { "epoch": 0.09, "grad_norm": 7.768387317657471, "learning_rate": 2.873684210526316e-06, "loss": 0.5562, "step": 36 }, { "Batch Mean": 0.06285472959280014, "accuracy": 0.703125, "epoch": 0.09, "step": 36 }, { "epoch": 0.0925, "grad_norm": 6.43149995803833, "learning_rate": 2.8657894736842103e-06, "loss": 0.5513, "step": 37 }, { "Batch Mean": 0.09104752540588379, "accuracy": 0.7578125, "epoch": 0.0925, "step": 37 }, { "epoch": 0.095, "grad_norm": 7.041010856628418, "learning_rate": 2.857894736842105e-06, "loss": 0.5437, "step": 38 }, { "Batch Mean": -0.25239354372024536, "accuracy": 0.7578125, "epoch": 0.095, "step": 38 }, { "epoch": 0.0975, "grad_norm": 7.535412311553955, "learning_rate": 2.85e-06, "loss": 0.5328, "step": 39 }, { "Batch Mean": -0.2604098320007324, "accuracy": 0.7421875, "epoch": 0.0975, "step": 39 }, { "epoch": 0.1, "grad_norm": 7.7832231521606445, "learning_rate": 2.8421052631578946e-06, "loss": 0.5027, "step": 40 }, { "Batch Mean": 0.0992133617401123, "accuracy": 0.7265625, "epoch": 0.1, "step": 40 }, { "epoch": 0.1025, "grad_norm": 7.394603729248047, "learning_rate": 2.8342105263157897e-06, "loss": 0.5346, "step": 41 }, { "Batch Mean": 0.466805100440979, "accuracy": 0.75, "epoch": 0.1025, "step": 41 }, { "epoch": 0.105, "grad_norm": 7.488112926483154, "learning_rate": 2.8263157894736845e-06, "loss": 0.513, "step": 42 }, { "Batch Mean": 0.007248640060424805, "accuracy": 0.7890625, "epoch": 0.105, "step": 42 }, { "epoch": 0.1075, "grad_norm": 7.159249305725098, "learning_rate": 2.8184210526315792e-06, "loss": 0.4573, "step": 43 }, { "Batch Mean": -0.20040297508239746, "accuracy": 0.7578125, "epoch": 0.1075, "step": 43 }, { "epoch": 0.11, "grad_norm": 8.217549324035645, "learning_rate": 2.810526315789474e-06, "loss": 0.4752, "step": 44 }, { "Batch Mean": -0.16749587655067444, "accuracy": 0.7890625, "epoch": 0.11, "step": 44 }, { "epoch": 0.1125, "grad_norm": 7.133549213409424, "learning_rate": 2.8026315789473687e-06, "loss": 0.4852, "step": 45 }, { "Batch Mean": -0.13931392133235931, "accuracy": 0.8203125, "epoch": 0.1125, "step": 45 }, { "epoch": 0.115, "grad_norm": 7.379823684692383, "learning_rate": 2.7947368421052635e-06, "loss": 0.3933, "step": 46 }, { "Batch Mean": 0.30439114570617676, "accuracy": 0.8046875, "epoch": 0.115, "step": 46 }, { "epoch": 0.1175, "grad_norm": 10.884243965148926, "learning_rate": 2.7868421052631578e-06, "loss": 0.555, "step": 47 }, { "Batch Mean": 0.5755846500396729, "accuracy": 0.71875, "epoch": 0.1175, "step": 47 }, { "epoch": 0.12, "grad_norm": 11.940531730651855, "learning_rate": 2.7789473684210525e-06, "loss": 0.5486, "step": 48 }, { "Batch Mean": -0.11571967601776123, "accuracy": 0.734375, "epoch": 0.12, "step": 48 }, { "epoch": 0.1225, "grad_norm": 10.186071395874023, "learning_rate": 2.7710526315789473e-06, "loss": 0.474, "step": 49 }, { "Batch Mean": -0.22505545616149902, "accuracy": 0.75, "epoch": 0.1225, "step": 49 }, { "epoch": 0.125, "grad_norm": 10.46984577178955, "learning_rate": 2.763157894736842e-06, "loss": 0.5942, "step": 50 }, { "Batch Mean": -0.7263038158416748, "accuracy": 0.796875, "epoch": 0.125, "step": 50 }, { "epoch": 0.1275, "grad_norm": 10.227009773254395, "learning_rate": 2.7552631578947368e-06, "loss": 0.4969, "step": 51 }, { "Batch Mean": -0.08428221940994263, "accuracy": 0.6875, "epoch": 0.1275, "step": 51 }, { "epoch": 0.13, "grad_norm": 8.77719497680664, "learning_rate": 2.7473684210526315e-06, "loss": 0.5358, "step": 52 }, { "Batch Mean": 0.13573884963989258, "accuracy": 0.7421875, "epoch": 0.13, "step": 52 }, { "epoch": 0.1325, "grad_norm": 7.371972560882568, "learning_rate": 2.7394736842105263e-06, "loss": 0.4681, "step": 53 }, { "Batch Mean": 0.5908935070037842, "accuracy": 0.7890625, "epoch": 0.1325, "step": 53 }, { "epoch": 0.135, "grad_norm": 8.683425903320312, "learning_rate": 2.7315789473684214e-06, "loss": 0.4736, "step": 54 }, { "Batch Mean": 0.11167335510253906, "accuracy": 0.75, "epoch": 0.135, "step": 54 }, { "epoch": 0.1375, "grad_norm": 7.259003639221191, "learning_rate": 2.723684210526316e-06, "loss": 0.4604, "step": 55 }, { "Batch Mean": -0.29556989669799805, "accuracy": 0.8046875, "epoch": 0.1375, "step": 55 }, { "epoch": 0.14, "grad_norm": 7.5383195877075195, "learning_rate": 2.715789473684211e-06, "loss": 0.4362, "step": 56 }, { "Batch Mean": -0.2666120231151581, "accuracy": 0.7109375, "epoch": 0.14, "step": 56 }, { "epoch": 0.1425, "grad_norm": 8.91254997253418, "learning_rate": 2.7078947368421052e-06, "loss": 0.5249, "step": 57 }, { "Batch Mean": -0.15824484825134277, "accuracy": 0.7421875, "epoch": 0.1425, "step": 57 }, { "epoch": 0.145, "grad_norm": 7.135673522949219, "learning_rate": 2.7e-06, "loss": 0.4759, "step": 58 }, { "Batch Mean": 0.46106112003326416, "accuracy": 0.78125, "epoch": 0.145, "step": 58 }, { "epoch": 0.1475, "grad_norm": 8.785165786743164, "learning_rate": 2.6921052631578947e-06, "loss": 0.4929, "step": 59 }, { "Batch Mean": 0.22279119491577148, "accuracy": 0.7421875, "epoch": 0.1475, "step": 59 }, { "epoch": 0.15, "grad_norm": 8.647071838378906, "learning_rate": 2.6842105263157895e-06, "loss": 0.5142, "step": 60 }, { "Batch Mean": -0.11326289176940918, "accuracy": 0.7421875, "epoch": 0.15, "step": 60 }, { "epoch": 0.1525, "grad_norm": 8.15560531616211, "learning_rate": 2.6763157894736842e-06, "loss": 0.5053, "step": 61 }, { "Batch Mean": -0.3783993721008301, "accuracy": 0.8125, "epoch": 0.1525, "step": 61 }, { "epoch": 0.155, "grad_norm": 7.596786975860596, "learning_rate": 2.668421052631579e-06, "loss": 0.4269, "step": 62 }, { "Batch Mean": -0.45914560556411743, "accuracy": 0.7734375, "epoch": 0.155, "step": 62 }, { "epoch": 0.1575, "grad_norm": 8.38855266571045, "learning_rate": 2.6605263157894737e-06, "loss": 0.4563, "step": 63 }, { "Batch Mean": 0.21407967805862427, "accuracy": 0.78125, "epoch": 0.1575, "step": 63 }, { "epoch": 0.16, "grad_norm": 7.324102878570557, "learning_rate": 2.6526315789473685e-06, "loss": 0.4576, "step": 64 }, { "Batch Mean": 0.40311315655708313, "accuracy": 0.75, "epoch": 0.16, "step": 64 }, { "epoch": 0.1625, "grad_norm": 9.020163536071777, "learning_rate": 2.644736842105263e-06, "loss": 0.5125, "step": 65 }, { "Batch Mean": 0.4620845317840576, "accuracy": 0.796875, "epoch": 0.1625, "step": 65 }, { "epoch": 0.165, "grad_norm": 9.777474403381348, "learning_rate": 2.636842105263158e-06, "loss": 0.4976, "step": 66 }, { "Batch Mean": 0.24700450897216797, "accuracy": 0.796875, "epoch": 0.165, "step": 66 }, { "epoch": 0.1675, "grad_norm": 7.800853729248047, "learning_rate": 2.6289473684210527e-06, "loss": 0.4805, "step": 67 }, { "Batch Mean": -0.699199914932251, "accuracy": 0.78125, "epoch": 0.1675, "step": 67 }, { "epoch": 0.17, "grad_norm": 11.246289253234863, "learning_rate": 2.6210526315789474e-06, "loss": 0.5059, "step": 68 }, { "Batch Mean": -0.3487205505371094, "accuracy": 0.7734375, "epoch": 0.17, "step": 68 }, { "epoch": 0.1725, "grad_norm": 10.012717247009277, "learning_rate": 2.613157894736842e-06, "loss": 0.4823, "step": 69 }, { "Batch Mean": -0.2884748876094818, "accuracy": 0.7265625, "epoch": 0.1725, "step": 69 }, { "epoch": 0.175, "grad_norm": 7.398864269256592, "learning_rate": 2.605263157894737e-06, "loss": 0.4733, "step": 70 }, { "Batch Mean": 0.2956275939941406, "accuracy": 0.7734375, "epoch": 0.175, "step": 70 }, { "epoch": 0.1775, "grad_norm": 7.696159362792969, "learning_rate": 2.5973684210526317e-06, "loss": 0.4371, "step": 71 }, { "Batch Mean": 0.33945655822753906, "accuracy": 0.734375, "epoch": 0.1775, "step": 71 }, { "epoch": 0.18, "grad_norm": 8.824238777160645, "learning_rate": 2.5894736842105264e-06, "loss": 0.5005, "step": 72 }, { "Batch Mean": 0.33958864212036133, "accuracy": 0.8203125, "epoch": 0.18, "step": 72 }, { "epoch": 0.1825, "grad_norm": 7.717179775238037, "learning_rate": 2.581578947368421e-06, "loss": 0.4041, "step": 73 }, { "Batch Mean": -0.37130284309387207, "accuracy": 0.8046875, "epoch": 0.1825, "step": 73 }, { "epoch": 0.185, "grad_norm": 7.526662349700928, "learning_rate": 2.573684210526316e-06, "loss": 0.3923, "step": 74 }, { "Batch Mean": -0.42052245140075684, "accuracy": 0.75, "epoch": 0.185, "step": 74 }, { "epoch": 0.1875, "grad_norm": 8.325594902038574, "learning_rate": 2.5657894736842107e-06, "loss": 0.4774, "step": 75 }, { "Batch Mean": 0.4816089868545532, "accuracy": 0.7265625, "epoch": 0.1875, "step": 75 }, { "epoch": 0.19, "grad_norm": 8.09318733215332, "learning_rate": 2.5578947368421054e-06, "loss": 0.4618, "step": 76 }, { "Batch Mean": 0.20279693603515625, "accuracy": 0.78125, "epoch": 0.19, "step": 76 }, { "epoch": 0.1925, "grad_norm": 7.570963382720947, "learning_rate": 2.55e-06, "loss": 0.4313, "step": 77 }, { "Batch Mean": -0.05216550827026367, "accuracy": 0.78125, "epoch": 0.1925, "step": 77 }, { "epoch": 0.195, "grad_norm": 8.529275894165039, "learning_rate": 2.542105263157895e-06, "loss": 0.4789, "step": 78 }, { "Batch Mean": -0.23521381616592407, "accuracy": 0.7578125, "epoch": 0.195, "step": 78 }, { "epoch": 0.1975, "grad_norm": 8.783782958984375, "learning_rate": 2.5342105263157892e-06, "loss": 0.4806, "step": 79 }, { "Batch Mean": -0.6067562103271484, "accuracy": 0.8046875, "epoch": 0.1975, "step": 79 }, { "epoch": 0.2, "grad_norm": 9.494295120239258, "learning_rate": 2.526315789473684e-06, "loss": 0.4442, "step": 80 }, { "Batch Mean": 0.09117501974105835, "accuracy": 0.8671875, "epoch": 0.2, "step": 80 }, { "epoch": 0.2025, "grad_norm": 8.121963500976562, "learning_rate": 2.5184210526315787e-06, "loss": 0.3909, "step": 81 }, { "Batch Mean": 0.41854333877563477, "accuracy": 0.8046875, "epoch": 0.2025, "step": 81 }, { "epoch": 0.205, "grad_norm": 9.42233943939209, "learning_rate": 2.510526315789474e-06, "loss": 0.4739, "step": 82 }, { "Batch Mean": 0.1303873062133789, "accuracy": 0.7734375, "epoch": 0.205, "step": 82 }, { "epoch": 0.2075, "grad_norm": 9.826407432556152, "learning_rate": 2.5026315789473686e-06, "loss": 0.4386, "step": 83 }, { "Batch Mean": 0.23121166229248047, "accuracy": 0.8203125, "epoch": 0.2075, "step": 83 }, { "epoch": 0.21, "grad_norm": 8.157249450683594, "learning_rate": 2.4947368421052634e-06, "loss": 0.3744, "step": 84 }, { "Batch Mean": 0.11153149604797363, "accuracy": 0.75, "epoch": 0.21, "step": 84 }, { "epoch": 0.2125, "grad_norm": 9.793784141540527, "learning_rate": 2.486842105263158e-06, "loss": 0.4797, "step": 85 }, { "Batch Mean": -0.05008220672607422, "accuracy": 0.7578125, "epoch": 0.2125, "step": 85 }, { "epoch": 0.215, "grad_norm": 10.425474166870117, "learning_rate": 2.478947368421053e-06, "loss": 0.4931, "step": 86 }, { "Batch Mean": -0.2606058120727539, "accuracy": 0.8046875, "epoch": 0.215, "step": 86 }, { "epoch": 0.2175, "grad_norm": 8.923140525817871, "learning_rate": 2.4710526315789476e-06, "loss": 0.4332, "step": 87 }, { "Batch Mean": -0.34642457962036133, "accuracy": 0.78125, "epoch": 0.2175, "step": 87 }, { "epoch": 0.22, "grad_norm": 10.161897659301758, "learning_rate": 2.4631578947368424e-06, "loss": 0.5033, "step": 88 }, { "Batch Mean": -0.5555503964424133, "accuracy": 0.8125, "epoch": 0.22, "step": 88 }, { "epoch": 0.2225, "grad_norm": 8.970458030700684, "learning_rate": 2.4552631578947367e-06, "loss": 0.416, "step": 89 }, { "Batch Mean": -0.04411482810974121, "accuracy": 0.7890625, "epoch": 0.2225, "step": 89 }, { "epoch": 0.225, "grad_norm": 7.110513687133789, "learning_rate": 2.4473684210526314e-06, "loss": 0.4013, "step": 90 }, { "Batch Mean": 0.3213387727737427, "accuracy": 0.7578125, "epoch": 0.225, "step": 90 }, { "epoch": 0.2275, "grad_norm": 9.09732437133789, "learning_rate": 2.439473684210526e-06, "loss": 0.4529, "step": 91 }, { "Batch Mean": 0.3748267889022827, "accuracy": 0.78125, "epoch": 0.2275, "step": 91 }, { "epoch": 0.23, "grad_norm": 9.071044921875, "learning_rate": 2.431578947368421e-06, "loss": 0.4823, "step": 92 }, { "Batch Mean": -0.18729716539382935, "accuracy": 0.78125, "epoch": 0.23, "step": 92 }, { "epoch": 0.2325, "grad_norm": 7.198951244354248, "learning_rate": 2.4236842105263157e-06, "loss": 0.4397, "step": 93 }, { "Batch Mean": -0.09032821655273438, "accuracy": 0.8046875, "epoch": 0.2325, "step": 93 }, { "epoch": 0.235, "grad_norm": 6.631897926330566, "learning_rate": 2.4157894736842104e-06, "loss": 0.4133, "step": 94 }, { "Batch Mean": -0.3303496539592743, "accuracy": 0.734375, "epoch": 0.235, "step": 94 }, { "epoch": 0.2375, "grad_norm": 8.016679763793945, "learning_rate": 2.4078947368421056e-06, "loss": 0.4866, "step": 95 }, { "Batch Mean": 0.04996013641357422, "accuracy": 0.828125, "epoch": 0.2375, "step": 95 }, { "epoch": 0.24, "grad_norm": 6.748624801635742, "learning_rate": 2.4000000000000003e-06, "loss": 0.4049, "step": 96 }, { "Batch Mean": 0.3791158199310303, "accuracy": 0.8203125, "epoch": 0.24, "step": 96 }, { "epoch": 0.2425, "grad_norm": 7.329560279846191, "learning_rate": 2.392105263157895e-06, "loss": 0.4132, "step": 97 }, { "Batch Mean": 0.07385429739952087, "accuracy": 0.8125, "epoch": 0.2425, "step": 97 }, { "epoch": 0.245, "grad_norm": 7.477539539337158, "learning_rate": 2.38421052631579e-06, "loss": 0.4055, "step": 98 }, { "Batch Mean": 0.2892317771911621, "accuracy": 0.7421875, "epoch": 0.245, "step": 98 }, { "epoch": 0.2475, "grad_norm": 9.593108177185059, "learning_rate": 2.376315789473684e-06, "loss": 0.5325, "step": 99 }, { "Batch Mean": -0.06891387701034546, "accuracy": 0.8359375, "epoch": 0.2475, "step": 99 }, { "epoch": 0.25, "grad_norm": 7.288457870483398, "learning_rate": 2.368421052631579e-06, "loss": 0.3851, "step": 100 }, { "Batch Mean": -0.14348173141479492, "accuracy": 0.7734375, "epoch": 0.25, "step": 100 }, { "epoch": 0.2525, "grad_norm": 7.977787971496582, "learning_rate": 2.3605263157894736e-06, "loss": 0.4682, "step": 101 }, { "Batch Mean": -0.006304934620857239, "accuracy": 0.8046875, "epoch": 0.2525, "step": 101 }, { "epoch": 0.255, "grad_norm": 8.956628799438477, "learning_rate": 2.3526315789473684e-06, "loss": 0.408, "step": 102 }, { "Batch Mean": 0.23482227325439453, "accuracy": 0.8671875, "epoch": 0.255, "step": 102 }, { "epoch": 0.2575, "grad_norm": 7.214924335479736, "learning_rate": 2.344736842105263e-06, "loss": 0.3201, "step": 103 }, { "Batch Mean": -0.04813265800476074, "accuracy": 0.7890625, "epoch": 0.2575, "step": 103 }, { "epoch": 0.26, "grad_norm": 9.90829849243164, "learning_rate": 2.336842105263158e-06, "loss": 0.4159, "step": 104 }, { "Batch Mean": -0.013557195663452148, "accuracy": 0.7578125, "epoch": 0.26, "step": 104 }, { "epoch": 0.2625, "grad_norm": 10.49203872680664, "learning_rate": 2.3289473684210526e-06, "loss": 0.454, "step": 105 }, { "Batch Mean": 0.06090879440307617, "accuracy": 0.8046875, "epoch": 0.2625, "step": 105 }, { "epoch": 0.265, "grad_norm": 9.869403839111328, "learning_rate": 2.3210526315789473e-06, "loss": 0.4022, "step": 106 }, { "Batch Mean": 0.0837242603302002, "accuracy": 0.7890625, "epoch": 0.265, "step": 106 }, { "epoch": 0.2675, "grad_norm": 11.350212097167969, "learning_rate": 2.313157894736842e-06, "loss": 0.4498, "step": 107 }, { "Batch Mean": 0.20977115631103516, "accuracy": 0.78125, "epoch": 0.2675, "step": 107 }, { "epoch": 0.27, "grad_norm": 11.261960983276367, "learning_rate": 2.305263157894737e-06, "loss": 0.4282, "step": 108 }, { "Batch Mean": -0.47429561614990234, "accuracy": 0.8515625, "epoch": 0.27, "step": 108 }, { "epoch": 0.2725, "grad_norm": 11.830789566040039, "learning_rate": 2.2973684210526316e-06, "loss": 0.4101, "step": 109 }, { "Batch Mean": -0.3634052276611328, "accuracy": 0.8515625, "epoch": 0.2725, "step": 109 }, { "epoch": 0.275, "grad_norm": 11.642911911010742, "learning_rate": 2.2894736842105263e-06, "loss": 0.3613, "step": 110 }, { "Batch Mean": 0.10471916198730469, "accuracy": 0.8984375, "epoch": 0.275, "step": 110 }, { "epoch": 0.2775, "grad_norm": 7.64810848236084, "learning_rate": 2.281578947368421e-06, "loss": 0.2652, "step": 111 }, { "Batch Mean": -0.012386441230773926, "accuracy": 0.8515625, "epoch": 0.2775, "step": 111 }, { "epoch": 0.28, "grad_norm": 8.564587593078613, "learning_rate": 2.273684210526316e-06, "loss": 0.2888, "step": 112 }, { "Batch Mean": 0.5501725673675537, "accuracy": 0.8203125, "epoch": 0.28, "step": 112 }, { "epoch": 0.2825, "grad_norm": 13.619306564331055, "learning_rate": 2.2657894736842106e-06, "loss": 0.4229, "step": 113 }, { "Batch Mean": 0.38643258810043335, "accuracy": 0.8828125, "epoch": 0.2825, "step": 113 }, { "epoch": 0.285, "grad_norm": 10.127896308898926, "learning_rate": 2.2578947368421053e-06, "loss": 0.3344, "step": 114 }, { "Batch Mean": 0.41213035583496094, "accuracy": 0.7109375, "epoch": 0.285, "step": 114 }, { "epoch": 0.2875, "grad_norm": 17.467395782470703, "learning_rate": 2.25e-06, "loss": 0.5535, "step": 115 }, { "Batch Mean": -0.7800889015197754, "accuracy": 0.796875, "epoch": 0.2875, "step": 115 }, { "epoch": 0.29, "grad_norm": 11.808396339416504, "learning_rate": 2.242105263157895e-06, "loss": 0.4686, "step": 116 }, { "Batch Mean": -1.182328224182129, "accuracy": 0.84375, "epoch": 0.29, "step": 116 }, { "epoch": 0.2925, "grad_norm": 13.75217056274414, "learning_rate": 2.2342105263157895e-06, "loss": 0.395, "step": 117 }, { "Batch Mean": -0.6266803741455078, "accuracy": 0.8203125, "epoch": 0.2925, "step": 117 }, { "epoch": 0.295, "grad_norm": 9.98914623260498, "learning_rate": 2.2263157894736843e-06, "loss": 0.3979, "step": 118 }, { "Batch Mean": 0.39401721954345703, "accuracy": 0.7578125, "epoch": 0.295, "step": 118 }, { "epoch": 0.2975, "grad_norm": 10.643619537353516, "learning_rate": 2.218421052631579e-06, "loss": 0.4525, "step": 119 }, { "Batch Mean": 0.6345320343971252, "accuracy": 0.796875, "epoch": 0.2975, "step": 119 }, { "epoch": 0.3, "grad_norm": 11.153727531433105, "learning_rate": 2.2105263157894738e-06, "loss": 0.4358, "step": 120 }, { "Batch Mean": 0.3619537353515625, "accuracy": 0.7578125, "epoch": 0.3, "step": 120 }, { "epoch": 0.3025, "grad_norm": 9.28880500793457, "learning_rate": 2.2026315789473685e-06, "loss": 0.4207, "step": 121 }, { "Batch Mean": -0.17964375019073486, "accuracy": 0.828125, "epoch": 0.3025, "step": 121 }, { "epoch": 0.305, "grad_norm": 7.322789192199707, "learning_rate": 2.1947368421052633e-06, "loss": 0.377, "step": 122 }, { "Batch Mean": -0.10846161842346191, "accuracy": 0.8359375, "epoch": 0.305, "step": 122 }, { "epoch": 0.3075, "grad_norm": 6.607788562774658, "learning_rate": 2.186842105263158e-06, "loss": 0.3734, "step": 123 }, { "Batch Mean": 0.16549494862556458, "accuracy": 0.8359375, "epoch": 0.3075, "step": 123 }, { "epoch": 0.31, "grad_norm": 7.060988426208496, "learning_rate": 2.1789473684210528e-06, "loss": 0.3922, "step": 124 }, { "Batch Mean": 0.3155031204223633, "accuracy": 0.7734375, "epoch": 0.31, "step": 124 }, { "epoch": 0.3125, "grad_norm": 7.642501354217529, "learning_rate": 2.1710526315789475e-06, "loss": 0.4157, "step": 125 }, { "Batch Mean": 0.5004472732543945, "accuracy": 0.796875, "epoch": 0.3125, "step": 125 }, { "epoch": 0.315, "grad_norm": 8.835705757141113, "learning_rate": 2.1631578947368423e-06, "loss": 0.4304, "step": 126 }, { "Batch Mean": -0.07234644889831543, "accuracy": 0.84375, "epoch": 0.315, "step": 126 }, { "epoch": 0.3175, "grad_norm": 8.115930557250977, "learning_rate": 2.155263157894737e-06, "loss": 0.4201, "step": 127 }, { "Batch Mean": -0.1375948190689087, "accuracy": 0.859375, "epoch": 0.3175, "step": 127 }, { "epoch": 0.32, "grad_norm": 7.790677547454834, "learning_rate": 2.1473684210526317e-06, "loss": 0.3985, "step": 128 }, { "Batch Mean": -0.7996973991394043, "accuracy": 0.78125, "epoch": 0.32, "step": 128 }, { "epoch": 0.3225, "grad_norm": 10.611040115356445, "learning_rate": 2.1394736842105265e-06, "loss": 0.4438, "step": 129 }, { "Batch Mean": -0.5430135726928711, "accuracy": 0.8671875, "epoch": 0.3225, "step": 129 }, { "epoch": 0.325, "grad_norm": 8.725314140319824, "learning_rate": 2.1315789473684212e-06, "loss": 0.3585, "step": 130 }, { "Batch Mean": 0.26318359375, "accuracy": 0.7890625, "epoch": 0.325, "step": 130 }, { "epoch": 0.3275, "grad_norm": 9.756380081176758, "learning_rate": 2.123684210526316e-06, "loss": 0.4467, "step": 131 }, { "Batch Mean": 0.3145838975906372, "accuracy": 0.828125, "epoch": 0.3275, "step": 131 }, { "epoch": 0.33, "grad_norm": 10.053284645080566, "learning_rate": 2.1157894736842103e-06, "loss": 0.3693, "step": 132 }, { "Batch Mean": 0.43440866470336914, "accuracy": 0.8125, "epoch": 0.33, "step": 132 }, { "epoch": 0.3325, "grad_norm": 10.054195404052734, "learning_rate": 2.107894736842105e-06, "loss": 0.3936, "step": 133 }, { "Batch Mean": -0.04828697443008423, "accuracy": 0.7734375, "epoch": 0.3325, "step": 133 }, { "epoch": 0.335, "grad_norm": 9.32927417755127, "learning_rate": 2.1e-06, "loss": 0.4503, "step": 134 }, { "Batch Mean": 0.11955499649047852, "accuracy": 0.8125, "epoch": 0.335, "step": 134 }, { "epoch": 0.3375, "grad_norm": 10.115190505981445, "learning_rate": 2.0921052631578945e-06, "loss": 0.4678, "step": 135 }, { "Batch Mean": -0.744804859161377, "accuracy": 0.796875, "epoch": 0.3375, "step": 135 }, { "epoch": 0.34, "grad_norm": 10.978937149047852, "learning_rate": 2.0842105263157897e-06, "loss": 0.4126, "step": 136 }, { "Batch Mean": -0.5665757656097412, "accuracy": 0.796875, "epoch": 0.34, "step": 136 }, { "epoch": 0.3425, "grad_norm": 10.839649200439453, "learning_rate": 2.0763157894736845e-06, "loss": 0.419, "step": 137 }, { "Batch Mean": -0.24710901081562042, "accuracy": 0.796875, "epoch": 0.3425, "step": 137 }, { "epoch": 0.345, "grad_norm": 9.443739891052246, "learning_rate": 2.068421052631579e-06, "loss": 0.4318, "step": 138 }, { "Batch Mean": -0.07041597366333008, "accuracy": 0.875, "epoch": 0.345, "step": 138 }, { "epoch": 0.3475, "grad_norm": 8.114516258239746, "learning_rate": 2.060526315789474e-06, "loss": 0.3463, "step": 139 }, { "Batch Mean": 0.4602632522583008, "accuracy": 0.8203125, "epoch": 0.3475, "step": 139 }, { "epoch": 0.35, "grad_norm": 8.078240394592285, "learning_rate": 2.0526315789473687e-06, "loss": 0.3485, "step": 140 }, { "Batch Mean": 0.34258782863616943, "accuracy": 0.8046875, "epoch": 0.35, "step": 140 }, { "epoch": 0.3525, "grad_norm": 8.950546264648438, "learning_rate": 2.0447368421052634e-06, "loss": 0.3833, "step": 141 }, { "Batch Mean": 0.738131046295166, "accuracy": 0.7734375, "epoch": 0.3525, "step": 141 }, { "epoch": 0.355, "grad_norm": 10.85938549041748, "learning_rate": 2.0368421052631578e-06, "loss": 0.4635, "step": 142 }, { "Batch Mean": 0.015331149101257324, "accuracy": 0.796875, "epoch": 0.355, "step": 142 }, { "epoch": 0.3575, "grad_norm": 8.852413177490234, "learning_rate": 2.0289473684210525e-06, "loss": 0.4245, "step": 143 }, { "Batch Mean": -0.5837619304656982, "accuracy": 0.84375, "epoch": 0.3575, "step": 143 }, { "epoch": 0.36, "grad_norm": 9.082572937011719, "learning_rate": 2.0210526315789473e-06, "loss": 0.3629, "step": 144 }, { "Batch Mean": -0.538818359375, "accuracy": 0.7734375, "epoch": 0.36, "step": 144 }, { "epoch": 0.3625, "grad_norm": 9.002676963806152, "learning_rate": 2.013157894736842e-06, "loss": 0.4276, "step": 145 }, { "Batch Mean": -0.21149349212646484, "accuracy": 0.7734375, "epoch": 0.3625, "step": 145 }, { "epoch": 0.365, "grad_norm": 7.991299629211426, "learning_rate": 2.0052631578947367e-06, "loss": 0.4095, "step": 146 }, { "Batch Mean": 0.4303879737854004, "accuracy": 0.7578125, "epoch": 0.365, "step": 146 }, { "epoch": 0.3675, "grad_norm": 8.972021102905273, "learning_rate": 1.9973684210526315e-06, "loss": 0.4421, "step": 147 }, { "Batch Mean": -0.1550462245941162, "accuracy": 0.7734375, "epoch": 0.3675, "step": 147 }, { "epoch": 0.37, "grad_norm": 9.37540340423584, "learning_rate": 1.9894736842105262e-06, "loss": 0.4239, "step": 148 }, { "Batch Mean": 0.34125369787216187, "accuracy": 0.8203125, "epoch": 0.37, "step": 148 }, { "epoch": 0.3725, "grad_norm": 8.121829986572266, "learning_rate": 1.9815789473684214e-06, "loss": 0.4473, "step": 149 }, { "Batch Mean": 0.3503285050392151, "accuracy": 0.8359375, "epoch": 0.3725, "step": 149 }, { "epoch": 0.375, "grad_norm": 7.723282337188721, "learning_rate": 1.973684210526316e-06, "loss": 0.3652, "step": 150 }, { "Batch Mean": 0.34477996826171875, "accuracy": 0.796875, "epoch": 0.375, "step": 150 }, { "epoch": 0.3775, "grad_norm": 8.22373104095459, "learning_rate": 1.965789473684211e-06, "loss": 0.3873, "step": 151 }, { "Batch Mean": -0.09462380409240723, "accuracy": 0.8046875, "epoch": 0.3775, "step": 151 }, { "epoch": 0.38, "grad_norm": 8.765836715698242, "learning_rate": 1.9578947368421052e-06, "loss": 0.4342, "step": 152 }, { "Batch Mean": -0.3281865119934082, "accuracy": 0.8359375, "epoch": 0.38, "step": 152 }, { "epoch": 0.3825, "grad_norm": 8.4281644821167, "learning_rate": 1.95e-06, "loss": 0.3796, "step": 153 }, { "Batch Mean": -0.2611503601074219, "accuracy": 0.8203125, "epoch": 0.3825, "step": 153 }, { "epoch": 0.385, "grad_norm": 10.023504257202148, "learning_rate": 1.9421052631578947e-06, "loss": 0.3897, "step": 154 }, { "Batch Mean": 0.0618128776550293, "accuracy": 0.859375, "epoch": 0.385, "step": 154 }, { "epoch": 0.3875, "grad_norm": 7.970378875732422, "learning_rate": 1.9342105263157895e-06, "loss": 0.3461, "step": 155 }, { "Batch Mean": 0.1698009967803955, "accuracy": 0.8828125, "epoch": 0.3875, "step": 155 }, { "epoch": 0.39, "grad_norm": 8.717864990234375, "learning_rate": 1.926315789473684e-06, "loss": 0.3416, "step": 156 }, { "Batch Mean": 0.5162477493286133, "accuracy": 0.8046875, "epoch": 0.39, "step": 156 }, { "epoch": 0.3925, "grad_norm": 11.4457368850708, "learning_rate": 1.918421052631579e-06, "loss": 0.3854, "step": 157 }, { "Batch Mean": -0.1577901840209961, "accuracy": 0.8515625, "epoch": 0.3925, "step": 157 }, { "epoch": 0.395, "grad_norm": 9.103015899658203, "learning_rate": 1.9105263157894737e-06, "loss": 0.3127, "step": 158 }, { "Batch Mean": -0.1037142276763916, "accuracy": 0.8203125, "epoch": 0.395, "step": 158 }, { "epoch": 0.3975, "grad_norm": 12.061964988708496, "learning_rate": 1.9026315789473684e-06, "loss": 0.382, "step": 159 }, { "Batch Mean": 0.1841106414794922, "accuracy": 0.84375, "epoch": 0.3975, "step": 159 }, { "epoch": 0.4, "grad_norm": 10.312475204467773, "learning_rate": 1.8947368421052632e-06, "loss": 0.3057, "step": 160 }, { "Batch Mean": 0.23936176300048828, "accuracy": 0.8828125, "epoch": 0.4, "step": 160 }, { "epoch": 0.4025, "grad_norm": 9.627632141113281, "learning_rate": 1.8868421052631577e-06, "loss": 0.3393, "step": 161 }, { "Batch Mean": 0.4093809723854065, "accuracy": 0.8515625, "epoch": 0.4025, "step": 161 }, { "epoch": 0.405, "grad_norm": 11.434003829956055, "learning_rate": 1.8789473684210525e-06, "loss": 0.3538, "step": 162 }, { "Batch Mean": -0.6164979338645935, "accuracy": 0.8359375, "epoch": 0.405, "step": 162 }, { "epoch": 0.4075, "grad_norm": 11.858824729919434, "learning_rate": 1.8710526315789476e-06, "loss": 0.3337, "step": 163 }, { "Batch Mean": -0.7717781066894531, "accuracy": 0.8125, "epoch": 0.4075, "step": 163 }, { "epoch": 0.41, "grad_norm": 13.287750244140625, "learning_rate": 1.8631578947368424e-06, "loss": 0.4061, "step": 164 }, { "Batch Mean": -0.3001067042350769, "accuracy": 0.8046875, "epoch": 0.41, "step": 164 }, { "epoch": 0.4125, "grad_norm": 13.294015884399414, "learning_rate": 1.855263157894737e-06, "loss": 0.4431, "step": 165 }, { "Batch Mean": -0.4868602752685547, "accuracy": 0.8515625, "epoch": 0.4125, "step": 165 }, { "epoch": 0.415, "grad_norm": 11.282150268554688, "learning_rate": 1.8473684210526317e-06, "loss": 0.3132, "step": 166 }, { "Batch Mean": 0.6035137176513672, "accuracy": 0.796875, "epoch": 0.415, "step": 166 }, { "epoch": 0.4175, "grad_norm": 13.268153190612793, "learning_rate": 1.8394736842105264e-06, "loss": 0.3593, "step": 167 }, { "Batch Mean": 0.9115990400314331, "accuracy": 0.796875, "epoch": 0.4175, "step": 167 }, { "epoch": 0.42, "grad_norm": 16.841812133789062, "learning_rate": 1.8315789473684211e-06, "loss": 0.4218, "step": 168 }, { "Batch Mean": 0.05609755218029022, "accuracy": 0.796875, "epoch": 0.42, "step": 168 }, { "epoch": 0.4225, "grad_norm": 11.866517066955566, "learning_rate": 1.8236842105263159e-06, "loss": 0.4064, "step": 169 }, { "Batch Mean": -0.1063833236694336, "accuracy": 0.859375, "epoch": 0.4225, "step": 169 }, { "epoch": 0.425, "grad_norm": 11.224437713623047, "learning_rate": 1.8157894736842106e-06, "loss": 0.3982, "step": 170 }, { "Batch Mean": -0.5879029035568237, "accuracy": 0.8828125, "epoch": 0.425, "step": 170 }, { "epoch": 0.4275, "grad_norm": 10.249573707580566, "learning_rate": 1.8078947368421052e-06, "loss": 0.2835, "step": 171 }, { "Batch Mean": -0.7283350229263306, "accuracy": 0.84375, "epoch": 0.4275, "step": 171 }, { "epoch": 0.43, "grad_norm": 11.226608276367188, "learning_rate": 1.8e-06, "loss": 0.3802, "step": 172 }, { "Batch Mean": -0.24501562118530273, "accuracy": 0.796875, "epoch": 0.43, "step": 172 }, { "epoch": 0.4325, "grad_norm": 10.79448127746582, "learning_rate": 1.7921052631578947e-06, "loss": 0.4052, "step": 173 }, { "Batch Mean": -0.024204611778259277, "accuracy": 0.8046875, "epoch": 0.4325, "step": 173 }, { "epoch": 0.435, "grad_norm": 9.704754829406738, "learning_rate": 1.7842105263157894e-06, "loss": 0.3991, "step": 174 }, { "Batch Mean": 0.8046383857727051, "accuracy": 0.8125, "epoch": 0.435, "step": 174 }, { "epoch": 0.4375, "grad_norm": 14.195847511291504, "learning_rate": 1.7763157894736842e-06, "loss": 0.4445, "step": 175 }, { "Batch Mean": 0.42048847675323486, "accuracy": 0.78125, "epoch": 0.4375, "step": 175 }, { "epoch": 0.44, "grad_norm": 10.577892303466797, "learning_rate": 1.768421052631579e-06, "loss": 0.4381, "step": 176 }, { "Batch Mean": 0.4540799856185913, "accuracy": 0.828125, "epoch": 0.44, "step": 176 }, { "epoch": 0.4425, "grad_norm": 9.509872436523438, "learning_rate": 1.7605263157894739e-06, "loss": 0.4243, "step": 177 }, { "Batch Mean": -0.022069334983825684, "accuracy": 0.8828125, "epoch": 0.4425, "step": 177 }, { "epoch": 0.445, "grad_norm": 7.100767135620117, "learning_rate": 1.7526315789473686e-06, "loss": 0.2665, "step": 178 }, { "Batch Mean": -0.19879460334777832, "accuracy": 0.8359375, "epoch": 0.445, "step": 178 }, { "epoch": 0.4475, "grad_norm": 9.830549240112305, "learning_rate": 1.7447368421052633e-06, "loss": 0.3493, "step": 179 }, { "Batch Mean": -0.6755795478820801, "accuracy": 0.8125, "epoch": 0.4475, "step": 179 }, { "epoch": 0.45, "grad_norm": 9.443568229675293, "learning_rate": 1.736842105263158e-06, "loss": 0.4007, "step": 180 }, { "Batch Mean": 0.3351125717163086, "accuracy": 0.8125, "epoch": 0.45, "step": 180 }, { "epoch": 0.4525, "grad_norm": 8.603584289550781, "learning_rate": 1.7289473684210526e-06, "loss": 0.3646, "step": 181 }, { "Batch Mean": 0.1227731704711914, "accuracy": 0.875, "epoch": 0.4525, "step": 181 }, { "epoch": 0.455, "grad_norm": 6.94600248336792, "learning_rate": 1.7210526315789474e-06, "loss": 0.3198, "step": 182 }, { "Batch Mean": 0.47222042083740234, "accuracy": 0.78125, "epoch": 0.455, "step": 182 }, { "epoch": 0.4575, "grad_norm": 10.670587539672852, "learning_rate": 1.7131578947368421e-06, "loss": 0.4573, "step": 183 }, { "Batch Mean": 0.08918190002441406, "accuracy": 0.8125, "epoch": 0.4575, "step": 183 }, { "epoch": 0.46, "grad_norm": 8.129005432128906, "learning_rate": 1.7052631578947369e-06, "loss": 0.3832, "step": 184 }, { "Batch Mean": -0.33992671966552734, "accuracy": 0.7578125, "epoch": 0.46, "step": 184 }, { "epoch": 0.4625, "grad_norm": 10.932904243469238, "learning_rate": 1.6973684210526316e-06, "loss": 0.5074, "step": 185 }, { "Batch Mean": -0.10214900970458984, "accuracy": 0.84375, "epoch": 0.4625, "step": 185 }, { "epoch": 0.465, "grad_norm": 8.017980575561523, "learning_rate": 1.6894736842105264e-06, "loss": 0.3757, "step": 186 }, { "Batch Mean": 0.1536245346069336, "accuracy": 0.84375, "epoch": 0.465, "step": 186 }, { "epoch": 0.4675, "grad_norm": 7.048803806304932, "learning_rate": 1.6815789473684209e-06, "loss": 0.3446, "step": 187 }, { "Batch Mean": -0.18456649780273438, "accuracy": 0.859375, "epoch": 0.4675, "step": 187 }, { "epoch": 0.47, "grad_norm": 9.696250915527344, "learning_rate": 1.6736842105263156e-06, "loss": 0.3075, "step": 188 }, { "Batch Mean": -0.11369824409484863, "accuracy": 0.8359375, "epoch": 0.47, "step": 188 }, { "epoch": 0.4725, "grad_norm": 8.466410636901855, "learning_rate": 1.6657894736842104e-06, "loss": 0.4068, "step": 189 }, { "Batch Mean": -0.0312764048576355, "accuracy": 0.8984375, "epoch": 0.4725, "step": 189 }, { "epoch": 0.475, "grad_norm": 7.680120944976807, "learning_rate": 1.6578947368421056e-06, "loss": 0.2806, "step": 190 }, { "Batch Mean": 0.43646717071533203, "accuracy": 0.8125, "epoch": 0.475, "step": 190 }, { "epoch": 0.4775, "grad_norm": 12.297844886779785, "learning_rate": 1.65e-06, "loss": 0.4147, "step": 191 }, { "Batch Mean": 0.0014213323593139648, "accuracy": 0.90625, "epoch": 0.4775, "step": 191 }, { "epoch": 0.48, "grad_norm": 7.487174987792969, "learning_rate": 1.6421052631578948e-06, "loss": 0.3, "step": 192 }, { "Batch Mean": 0.0644383430480957, "accuracy": 0.8125, "epoch": 0.48, "step": 192 }, { "epoch": 0.4825, "grad_norm": 9.683974266052246, "learning_rate": 1.6342105263157896e-06, "loss": 0.3863, "step": 193 }, { "Batch Mean": 0.4465351104736328, "accuracy": 0.84375, "epoch": 0.4825, "step": 193 }, { "epoch": 0.485, "grad_norm": 10.928067207336426, "learning_rate": 1.6263157894736843e-06, "loss": 0.3765, "step": 194 }, { "Batch Mean": 0.2289137840270996, "accuracy": 0.8046875, "epoch": 0.485, "step": 194 }, { "epoch": 0.4875, "grad_norm": 10.405890464782715, "learning_rate": 1.618421052631579e-06, "loss": 0.3942, "step": 195 }, { "Batch Mean": -0.46003198623657227, "accuracy": 0.8125, "epoch": 0.4875, "step": 195 }, { "epoch": 0.49, "grad_norm": 9.782814979553223, "learning_rate": 1.6105263157894738e-06, "loss": 0.3805, "step": 196 }, { "Batch Mean": -0.8169732093811035, "accuracy": 0.84375, "epoch": 0.49, "step": 196 }, { "epoch": 0.4925, "grad_norm": 13.134366035461426, "learning_rate": 1.6026315789473683e-06, "loss": 0.3853, "step": 197 }, { "Batch Mean": -0.8004512786865234, "accuracy": 0.8515625, "epoch": 0.4925, "step": 197 }, { "epoch": 0.495, "grad_norm": 12.021324157714844, "learning_rate": 1.594736842105263e-06, "loss": 0.3366, "step": 198 }, { "Batch Mean": 0.007058143615722656, "accuracy": 0.8671875, "epoch": 0.495, "step": 198 }, { "epoch": 0.4975, "grad_norm": 10.337015151977539, "learning_rate": 1.5868421052631578e-06, "loss": 0.3146, "step": 199 }, { "Batch Mean": 0.3048887252807617, "accuracy": 0.90625, "epoch": 0.4975, "step": 199 }, { "epoch": 0.5, "grad_norm": 10.387301445007324, "learning_rate": 1.5789473684210526e-06, "loss": 0.3261, "step": 200 }, { "Batch Mean": 0.3846898078918457, "accuracy": 0.8125, "epoch": 0.5, "step": 200 }, { "epoch": 0.5025, "grad_norm": 12.028776168823242, "learning_rate": 1.5710526315789473e-06, "loss": 0.4295, "step": 201 }, { "Batch Mean": 0.17888259887695312, "accuracy": 0.8359375, "epoch": 0.5025, "step": 201 }, { "epoch": 0.505, "grad_norm": 10.109951972961426, "learning_rate": 1.563157894736842e-06, "loss": 0.3303, "step": 202 }, { "Batch Mean": 0.17313361167907715, "accuracy": 0.8046875, "epoch": 0.505, "step": 202 }, { "epoch": 0.5075, "grad_norm": 9.5557861328125, "learning_rate": 1.5552631578947368e-06, "loss": 0.3313, "step": 203 }, { "Batch Mean": 0.10066866874694824, "accuracy": 0.8671875, "epoch": 0.5075, "step": 203 }, { "epoch": 0.51, "grad_norm": 10.076003074645996, "learning_rate": 1.5473684210526318e-06, "loss": 0.354, "step": 204 }, { "Batch Mean": -0.13635492324829102, "accuracy": 0.84375, "epoch": 0.51, "step": 204 }, { "epoch": 0.5125, "grad_norm": 9.799072265625, "learning_rate": 1.5394736842105265e-06, "loss": 0.3384, "step": 205 }, { "Batch Mean": -0.061470985412597656, "accuracy": 0.8046875, "epoch": 0.5125, "step": 205 }, { "epoch": 0.515, "grad_norm": 10.165879249572754, "learning_rate": 1.5315789473684213e-06, "loss": 0.4189, "step": 206 }, { "Batch Mean": -0.5068221092224121, "accuracy": 0.84375, "epoch": 0.515, "step": 206 }, { "epoch": 0.5175, "grad_norm": 11.060914993286133, "learning_rate": 1.5236842105263158e-06, "loss": 0.3486, "step": 207 }, { "Batch Mean": -0.36471378803253174, "accuracy": 0.84375, "epoch": 0.5175, "step": 207 }, { "epoch": 0.52, "grad_norm": 9.788437843322754, "learning_rate": 1.5157894736842105e-06, "loss": 0.353, "step": 208 }, { "Batch Mean": 0.29315185546875, "accuracy": 0.8359375, "epoch": 0.52, "step": 208 }, { "epoch": 0.5225, "grad_norm": 8.249659538269043, "learning_rate": 1.5078947368421053e-06, "loss": 0.3217, "step": 209 }, { "Batch Mean": -0.001120150089263916, "accuracy": 0.8671875, "epoch": 0.5225, "step": 209 }, { "epoch": 0.525, "grad_norm": 8.720206260681152, "learning_rate": 1.5e-06, "loss": 0.377, "step": 210 }, { "Batch Mean": 0.018924176692962646, "accuracy": 0.8515625, "epoch": 0.525, "step": 210 }, { "epoch": 0.5275, "grad_norm": 10.584818840026855, "learning_rate": 1.4921052631578948e-06, "loss": 0.3752, "step": 211 }, { "Batch Mean": 0.25783300399780273, "accuracy": 0.8515625, "epoch": 0.5275, "step": 211 }, { "epoch": 0.53, "grad_norm": 9.455146789550781, "learning_rate": 1.4842105263157895e-06, "loss": 0.3456, "step": 212 }, { "Batch Mean": 0.2519712448120117, "accuracy": 0.8046875, "epoch": 0.53, "step": 212 }, { "epoch": 0.5325, "grad_norm": 10.720685005187988, "learning_rate": 1.4763157894736843e-06, "loss": 0.4302, "step": 213 }, { "Batch Mean": 0.23407071828842163, "accuracy": 0.796875, "epoch": 0.5325, "step": 213 }, { "epoch": 0.535, "grad_norm": 10.353367805480957, "learning_rate": 1.468421052631579e-06, "loss": 0.4365, "step": 214 }, { "Batch Mean": -0.13458722829818726, "accuracy": 0.8125, "epoch": 0.535, "step": 214 }, { "epoch": 0.5375, "grad_norm": 10.57049560546875, "learning_rate": 1.4605263157894738e-06, "loss": 0.3933, "step": 215 }, { "Batch Mean": -0.4141349494457245, "accuracy": 0.8046875, "epoch": 0.5375, "step": 215 }, { "epoch": 0.54, "grad_norm": 10.656410217285156, "learning_rate": 1.4526315789473685e-06, "loss": 0.4412, "step": 216 }, { "Batch Mean": -0.25697755813598633, "accuracy": 0.8359375, "epoch": 0.54, "step": 216 }, { "epoch": 0.5425, "grad_norm": 9.440042495727539, "learning_rate": 1.4447368421052633e-06, "loss": 0.3612, "step": 217 }, { "Batch Mean": -0.22686290740966797, "accuracy": 0.8515625, "epoch": 0.5425, "step": 217 }, { "epoch": 0.545, "grad_norm": 8.337709426879883, "learning_rate": 1.436842105263158e-06, "loss": 0.3764, "step": 218 }, { "Batch Mean": -0.19732820987701416, "accuracy": 0.8359375, "epoch": 0.545, "step": 218 }, { "epoch": 0.5475, "grad_norm": 8.630691528320312, "learning_rate": 1.4289473684210525e-06, "loss": 0.3722, "step": 219 }, { "Batch Mean": 0.2953619956970215, "accuracy": 0.8359375, "epoch": 0.5475, "step": 219 }, { "epoch": 0.55, "grad_norm": 8.33515453338623, "learning_rate": 1.4210526315789473e-06, "loss": 0.3361, "step": 220 }, { "Batch Mean": 0.1760966181755066, "accuracy": 0.828125, "epoch": 0.55, "step": 220 }, { "epoch": 0.5525, "grad_norm": 9.701991081237793, "learning_rate": 1.4131578947368422e-06, "loss": 0.3802, "step": 221 }, { "Batch Mean": 0.7126345634460449, "accuracy": 0.765625, "epoch": 0.5525, "step": 221 }, { "epoch": 0.555, "grad_norm": 12.682254791259766, "learning_rate": 1.405263157894737e-06, "loss": 0.4783, "step": 222 }, { "Batch Mean": 0.4724486470222473, "accuracy": 0.8515625, "epoch": 0.555, "step": 222 }, { "epoch": 0.5575, "grad_norm": 10.180864334106445, "learning_rate": 1.3973684210526317e-06, "loss": 0.3705, "step": 223 }, { "Batch Mean": 0.09322214126586914, "accuracy": 0.8359375, "epoch": 0.5575, "step": 223 }, { "epoch": 0.56, "grad_norm": 7.578114032745361, "learning_rate": 1.3894736842105263e-06, "loss": 0.347, "step": 224 }, { "Batch Mean": -0.7372913360595703, "accuracy": 0.828125, "epoch": 0.56, "step": 224 }, { "epoch": 0.5625, "grad_norm": 9.805769920349121, "learning_rate": 1.381578947368421e-06, "loss": 0.4277, "step": 225 }, { "Batch Mean": -0.5061855316162109, "accuracy": 0.8515625, "epoch": 0.5625, "step": 225 }, { "epoch": 0.565, "grad_norm": 8.92744255065918, "learning_rate": 1.3736842105263158e-06, "loss": 0.3668, "step": 226 }, { "Batch Mean": -0.4624595046043396, "accuracy": 0.84375, "epoch": 0.565, "step": 226 }, { "epoch": 0.5675, "grad_norm": 8.949381828308105, "learning_rate": 1.3657894736842107e-06, "loss": 0.354, "step": 227 }, { "Batch Mean": 0.016000747680664062, "accuracy": 0.765625, "epoch": 0.5675, "step": 227 }, { "epoch": 0.57, "grad_norm": 8.431851387023926, "learning_rate": 1.3578947368421055e-06, "loss": 0.4241, "step": 228 }, { "Batch Mean": -0.004792451858520508, "accuracy": 0.8125, "epoch": 0.57, "step": 228 }, { "epoch": 0.5725, "grad_norm": 8.652802467346191, "learning_rate": 1.35e-06, "loss": 0.431, "step": 229 }, { "Batch Mean": -0.1176004409790039, "accuracy": 0.8359375, "epoch": 0.5725, "step": 229 }, { "epoch": 0.575, "grad_norm": 8.387956619262695, "learning_rate": 1.3421052631578947e-06, "loss": 0.35, "step": 230 }, { "Batch Mean": 0.7041282653808594, "accuracy": 0.8125, "epoch": 0.575, "step": 230 }, { "epoch": 0.5775, "grad_norm": 10.400925636291504, "learning_rate": 1.3342105263157895e-06, "loss": 0.4074, "step": 231 }, { "Batch Mean": 0.24665021896362305, "accuracy": 0.8203125, "epoch": 0.5775, "step": 231 }, { "epoch": 0.58, "grad_norm": 8.198482513427734, "learning_rate": 1.3263157894736842e-06, "loss": 0.3618, "step": 232 }, { "Batch Mean": 0.3715553283691406, "accuracy": 0.859375, "epoch": 0.58, "step": 232 }, { "epoch": 0.5825, "grad_norm": 7.664952754974365, "learning_rate": 1.318421052631579e-06, "loss": 0.3373, "step": 233 }, { "Batch Mean": 0.2685070037841797, "accuracy": 0.8671875, "epoch": 0.5825, "step": 233 }, { "epoch": 0.585, "grad_norm": 7.603297233581543, "learning_rate": 1.3105263157894737e-06, "loss": 0.2974, "step": 234 }, { "Batch Mean": 0.21946507692337036, "accuracy": 0.828125, "epoch": 0.585, "step": 234 }, { "epoch": 0.5875, "grad_norm": 9.086359977722168, "learning_rate": 1.3026315789473685e-06, "loss": 0.3765, "step": 235 }, { "Batch Mean": 0.08822351694107056, "accuracy": 0.8671875, "epoch": 0.5875, "step": 235 }, { "epoch": 0.59, "grad_norm": 8.170425415039062, "learning_rate": 1.2947368421052632e-06, "loss": 0.3658, "step": 236 }, { "Batch Mean": -0.35591793060302734, "accuracy": 0.8515625, "epoch": 0.59, "step": 236 }, { "epoch": 0.5925, "grad_norm": 7.828787803649902, "learning_rate": 1.286842105263158e-06, "loss": 0.3196, "step": 237 }, { "Batch Mean": -0.9328911304473877, "accuracy": 0.828125, "epoch": 0.5925, "step": 237 }, { "epoch": 0.595, "grad_norm": 12.014172554016113, "learning_rate": 1.2789473684210527e-06, "loss": 0.3751, "step": 238 }, { "Batch Mean": 0.0694279670715332, "accuracy": 0.8359375, "epoch": 0.595, "step": 238 }, { "epoch": 0.5975, "grad_norm": 8.917220115661621, "learning_rate": 1.2710526315789474e-06, "loss": 0.3316, "step": 239 }, { "Batch Mean": 0.46890783309936523, "accuracy": 0.84375, "epoch": 0.5975, "step": 239 }, { "epoch": 0.6, "grad_norm": 10.03316879272461, "learning_rate": 1.263157894736842e-06, "loss": 0.3364, "step": 240 }, { "Batch Mean": 0.13347864151000977, "accuracy": 0.84375, "epoch": 0.6, "step": 240 }, { "epoch": 0.6025, "grad_norm": 10.248523712158203, "learning_rate": 1.255263157894737e-06, "loss": 0.4, "step": 241 }, { "Batch Mean": 0.41477489471435547, "accuracy": 0.875, "epoch": 0.6025, "step": 241 }, { "epoch": 0.605, "grad_norm": 9.51279067993164, "learning_rate": 1.2473684210526317e-06, "loss": 0.3409, "step": 242 }, { "Batch Mean": -0.3279252052307129, "accuracy": 0.8984375, "epoch": 0.605, "step": 242 }, { "epoch": 0.6075, "grad_norm": 9.355228424072266, "learning_rate": 1.2394736842105264e-06, "loss": 0.2851, "step": 243 }, { "Batch Mean": 0.20540761947631836, "accuracy": 0.8359375, "epoch": 0.6075, "step": 243 }, { "epoch": 0.61, "grad_norm": 12.000129699707031, "learning_rate": 1.2315789473684212e-06, "loss": 0.4203, "step": 244 }, { "Batch Mean": -0.2519039511680603, "accuracy": 0.8125, "epoch": 0.61, "step": 244 }, { "epoch": 0.6125, "grad_norm": 9.294373512268066, "learning_rate": 1.2236842105263157e-06, "loss": 0.3313, "step": 245 }, { "Batch Mean": -0.26664018630981445, "accuracy": 0.7734375, "epoch": 0.6125, "step": 245 }, { "epoch": 0.615, "grad_norm": 13.824014663696289, "learning_rate": 1.2157894736842105e-06, "loss": 0.565, "step": 246 }, { "Batch Mean": -0.39514851570129395, "accuracy": 0.8359375, "epoch": 0.615, "step": 246 }, { "epoch": 0.6175, "grad_norm": 9.212977409362793, "learning_rate": 1.2078947368421052e-06, "loss": 0.3392, "step": 247 }, { "Batch Mean": -0.19669628143310547, "accuracy": 0.859375, "epoch": 0.6175, "step": 247 }, { "epoch": 0.62, "grad_norm": 9.146504402160645, "learning_rate": 1.2000000000000002e-06, "loss": 0.3738, "step": 248 }, { "Batch Mean": 0.0451512336730957, "accuracy": 0.8046875, "epoch": 0.62, "step": 248 }, { "epoch": 0.6225, "grad_norm": 10.29110336303711, "learning_rate": 1.192105263157895e-06, "loss": 0.4081, "step": 249 }, { "Batch Mean": 0.10025668144226074, "accuracy": 0.8515625, "epoch": 0.6225, "step": 249 }, { "epoch": 0.625, "grad_norm": 8.873616218566895, "learning_rate": 1.1842105263157894e-06, "loss": 0.3559, "step": 250 }, { "Batch Mean": -0.04506206512451172, "accuracy": 0.859375, "epoch": 0.625, "step": 250 }, { "epoch": 0.6275, "grad_norm": 8.159873962402344, "learning_rate": 1.1763157894736842e-06, "loss": 0.3022, "step": 251 }, { "Batch Mean": 0.3932800889015198, "accuracy": 0.796875, "epoch": 0.6275, "step": 251 }, { "epoch": 0.63, "grad_norm": 9.627471923828125, "learning_rate": 1.168421052631579e-06, "loss": 0.3727, "step": 252 }, { "Batch Mean": 0.5422868728637695, "accuracy": 0.8359375, "epoch": 0.63, "step": 252 }, { "epoch": 0.6325, "grad_norm": 9.851725578308105, "learning_rate": 1.1605263157894737e-06, "loss": 0.3625, "step": 253 }, { "Batch Mean": 0.18495893478393555, "accuracy": 0.8671875, "epoch": 0.6325, "step": 253 }, { "epoch": 0.635, "grad_norm": 8.526684761047363, "learning_rate": 1.1526315789473684e-06, "loss": 0.3445, "step": 254 }, { "Batch Mean": -0.14566564559936523, "accuracy": 0.75, "epoch": 0.635, "step": 254 }, { "epoch": 0.6375, "grad_norm": 10.76711368560791, "learning_rate": 1.1447368421052632e-06, "loss": 0.4964, "step": 255 }, { "Batch Mean": -0.34235867857933044, "accuracy": 0.8125, "epoch": 0.6375, "step": 255 }, { "epoch": 0.64, "grad_norm": 8.614067077636719, "learning_rate": 1.136842105263158e-06, "loss": 0.3615, "step": 256 }, { "Batch Mean": -0.2911210060119629, "accuracy": 0.8359375, "epoch": 0.64, "step": 256 }, { "epoch": 0.6425, "grad_norm": 9.370655059814453, "learning_rate": 1.1289473684210527e-06, "loss": 0.4009, "step": 257 }, { "Batch Mean": -0.46470344066619873, "accuracy": 0.828125, "epoch": 0.6425, "step": 257 }, { "epoch": 0.645, "grad_norm": 9.790703773498535, "learning_rate": 1.1210526315789474e-06, "loss": 0.3881, "step": 258 }, { "Batch Mean": -0.6390800476074219, "accuracy": 0.8203125, "epoch": 0.645, "step": 258 }, { "epoch": 0.6475, "grad_norm": 9.37238597869873, "learning_rate": 1.1131578947368421e-06, "loss": 0.3805, "step": 259 }, { "Batch Mean": 0.1065974235534668, "accuracy": 0.828125, "epoch": 0.6475, "step": 259 }, { "epoch": 0.65, "grad_norm": 7.998692512512207, "learning_rate": 1.1052631578947369e-06, "loss": 0.3688, "step": 260 }, { "Batch Mean": 0.022962749004364014, "accuracy": 0.875, "epoch": 0.65, "step": 260 }, { "epoch": 0.6525, "grad_norm": 7.9993977546691895, "learning_rate": 1.0973684210526316e-06, "loss": 0.3377, "step": 261 }, { "Batch Mean": 0.5953633785247803, "accuracy": 0.828125, "epoch": 0.6525, "step": 261 }, { "epoch": 0.655, "grad_norm": 10.421438217163086, "learning_rate": 1.0894736842105264e-06, "loss": 0.3699, "step": 262 }, { "Batch Mean": -0.10953497886657715, "accuracy": 0.84375, "epoch": 0.655, "step": 262 }, { "epoch": 0.6575, "grad_norm": 9.006104469299316, "learning_rate": 1.0815789473684211e-06, "loss": 0.3919, "step": 263 }, { "Batch Mean": -0.07210604846477509, "accuracy": 0.84375, "epoch": 0.6575, "step": 263 }, { "epoch": 0.66, "grad_norm": 7.708876609802246, "learning_rate": 1.0736842105263159e-06, "loss": 0.3634, "step": 264 }, { "Batch Mean": 0.09765318781137466, "accuracy": 0.8515625, "epoch": 0.66, "step": 264 }, { "epoch": 0.6625, "grad_norm": 7.834005832672119, "learning_rate": 1.0657894736842106e-06, "loss": 0.3461, "step": 265 }, { "Batch Mean": 0.5027589797973633, "accuracy": 0.8125, "epoch": 0.6625, "step": 265 }, { "epoch": 0.665, "grad_norm": 11.38192367553711, "learning_rate": 1.0578947368421052e-06, "loss": 0.4183, "step": 266 }, { "Batch Mean": 0.35455894470214844, "accuracy": 0.8984375, "epoch": 0.665, "step": 266 }, { "epoch": 0.6675, "grad_norm": 7.649405002593994, "learning_rate": 1.05e-06, "loss": 0.3053, "step": 267 }, { "Batch Mean": -0.2621840238571167, "accuracy": 0.8046875, "epoch": 0.6675, "step": 267 }, { "epoch": 0.67, "grad_norm": 8.894427299499512, "learning_rate": 1.0421052631578949e-06, "loss": 0.3942, "step": 268 }, { "Batch Mean": 0.395050048828125, "accuracy": 0.84375, "epoch": 0.67, "step": 268 }, { "epoch": 0.6725, "grad_norm": 8.56412410736084, "learning_rate": 1.0342105263157896e-06, "loss": 0.344, "step": 269 }, { "Batch Mean": 0.18659591674804688, "accuracy": 0.84375, "epoch": 0.6725, "step": 269 }, { "epoch": 0.675, "grad_norm": 8.3667631149292, "learning_rate": 1.0263157894736843e-06, "loss": 0.3481, "step": 270 }, { "Batch Mean": -0.2180713415145874, "accuracy": 0.84375, "epoch": 0.675, "step": 270 }, { "epoch": 0.6775, "grad_norm": 8.841838836669922, "learning_rate": 1.0184210526315789e-06, "loss": 0.3658, "step": 271 }, { "Batch Mean": -0.1882772445678711, "accuracy": 0.8515625, "epoch": 0.6775, "step": 271 }, { "epoch": 0.68, "grad_norm": 9.811952590942383, "learning_rate": 1.0105263157894736e-06, "loss": 0.3432, "step": 272 }, { "Batch Mean": -0.0435866117477417, "accuracy": 0.8203125, "epoch": 0.68, "step": 272 }, { "epoch": 0.6825, "grad_norm": 9.91102123260498, "learning_rate": 1.0026315789473684e-06, "loss": 0.3464, "step": 273 }, { "Batch Mean": -0.2130718231201172, "accuracy": 0.8515625, "epoch": 0.6825, "step": 273 }, { "epoch": 0.685, "grad_norm": 8.993069648742676, "learning_rate": 9.947368421052631e-07, "loss": 0.3687, "step": 274 }, { "Batch Mean": 0.06511643528938293, "accuracy": 0.84375, "epoch": 0.685, "step": 274 }, { "epoch": 0.6875, "grad_norm": 9.224372863769531, "learning_rate": 9.86842105263158e-07, "loss": 0.3275, "step": 275 }, { "Batch Mean": 0.19634652137756348, "accuracy": 0.8203125, "epoch": 0.6875, "step": 275 }, { "epoch": 0.69, "grad_norm": 10.563430786132812, "learning_rate": 9.789473684210526e-07, "loss": 0.4621, "step": 276 }, { "Batch Mean": 0.09274458885192871, "accuracy": 0.7734375, "epoch": 0.69, "step": 276 }, { "epoch": 0.6925, "grad_norm": 9.659343719482422, "learning_rate": 9.710526315789474e-07, "loss": 0.436, "step": 277 }, { "Batch Mean": 0.0857553482055664, "accuracy": 0.828125, "epoch": 0.6925, "step": 277 }, { "epoch": 0.695, "grad_norm": 10.164700508117676, "learning_rate": 9.63157894736842e-07, "loss": 0.3198, "step": 278 }, { "Batch Mean": 0.0673593282699585, "accuracy": 0.8359375, "epoch": 0.695, "step": 278 }, { "epoch": 0.6975, "grad_norm": 9.253751754760742, "learning_rate": 9.552631578947368e-07, "loss": 0.3667, "step": 279 }, { "Batch Mean": -0.1906418800354004, "accuracy": 0.875, "epoch": 0.6975, "step": 279 }, { "epoch": 0.7, "grad_norm": 8.38836669921875, "learning_rate": 9.473684210526316e-07, "loss": 0.2834, "step": 280 }, { "Batch Mean": -0.2817869186401367, "accuracy": 0.7734375, "epoch": 0.7, "step": 280 }, { "epoch": 0.7025, "grad_norm": 11.11916446685791, "learning_rate": 9.394736842105262e-07, "loss": 0.4159, "step": 281 }, { "Batch Mean": 0.1480579376220703, "accuracy": 0.8359375, "epoch": 0.7025, "step": 281 }, { "epoch": 0.705, "grad_norm": 10.571281433105469, "learning_rate": 9.315789473684212e-07, "loss": 0.3385, "step": 282 }, { "Batch Mean": 0.17013701796531677, "accuracy": 0.78125, "epoch": 0.705, "step": 282 }, { "epoch": 0.7075, "grad_norm": 12.997551918029785, "learning_rate": 9.236842105263158e-07, "loss": 0.4569, "step": 283 }, { "Batch Mean": 0.1344117522239685, "accuracy": 0.8515625, "epoch": 0.7075, "step": 283 }, { "epoch": 0.71, "grad_norm": 8.290603637695312, "learning_rate": 9.157894736842106e-07, "loss": 0.3057, "step": 284 }, { "Batch Mean": -0.1876235008239746, "accuracy": 0.859375, "epoch": 0.71, "step": 284 }, { "epoch": 0.7125, "grad_norm": 10.66235637664795, "learning_rate": 9.078947368421053e-07, "loss": 0.3139, "step": 285 }, { "Batch Mean": -0.3286571502685547, "accuracy": 0.8515625, "epoch": 0.7125, "step": 285 }, { "epoch": 0.715, "grad_norm": 9.821268081665039, "learning_rate": 9e-07, "loss": 0.3358, "step": 286 }, { "Batch Mean": 0.005966871976852417, "accuracy": 0.859375, "epoch": 0.715, "step": 286 }, { "epoch": 0.7175, "grad_norm": 9.407478332519531, "learning_rate": 8.921052631578947e-07, "loss": 0.3341, "step": 287 }, { "Batch Mean": -0.436631977558136, "accuracy": 0.8671875, "epoch": 0.7175, "step": 287 }, { "epoch": 0.72, "grad_norm": 11.42169189453125, "learning_rate": 8.842105263157895e-07, "loss": 0.4293, "step": 288 }, { "Batch Mean": 0.09899795055389404, "accuracy": 0.890625, "epoch": 0.72, "step": 288 }, { "epoch": 0.7225, "grad_norm": 8.493339538574219, "learning_rate": 8.763157894736843e-07, "loss": 0.2619, "step": 289 }, { "Batch Mean": 0.21198153495788574, "accuracy": 0.8125, "epoch": 0.7225, "step": 289 }, { "epoch": 0.725, "grad_norm": 10.834942817687988, "learning_rate": 8.68421052631579e-07, "loss": 0.3986, "step": 290 }, { "Batch Mean": 0.5926527976989746, "accuracy": 0.828125, "epoch": 0.725, "step": 290 }, { "epoch": 0.7275, "grad_norm": 10.650278091430664, "learning_rate": 8.605263157894737e-07, "loss": 0.3638, "step": 291 }, { "Batch Mean": 0.023035049438476562, "accuracy": 0.859375, "epoch": 0.7275, "step": 291 }, { "epoch": 0.73, "grad_norm": 10.30225658416748, "learning_rate": 8.526315789473684e-07, "loss": 0.3584, "step": 292 }, { "Batch Mean": -0.41120100021362305, "accuracy": 0.8203125, "epoch": 0.73, "step": 292 }, { "epoch": 0.7325, "grad_norm": 9.968331336975098, "learning_rate": 8.447368421052632e-07, "loss": 0.4338, "step": 293 }, { "Batch Mean": 0.1086917519569397, "accuracy": 0.8828125, "epoch": 0.7325, "step": 293 }, { "epoch": 0.735, "grad_norm": 8.88227653503418, "learning_rate": 8.368421052631578e-07, "loss": 0.3174, "step": 294 }, { "Batch Mean": 0.06357598304748535, "accuracy": 0.828125, "epoch": 0.735, "step": 294 }, { "epoch": 0.7375, "grad_norm": 10.419845581054688, "learning_rate": 8.289473684210528e-07, "loss": 0.3307, "step": 295 }, { "Batch Mean": -0.16071414947509766, "accuracy": 0.8203125, "epoch": 0.7375, "step": 295 }, { "epoch": 0.74, "grad_norm": 9.523018836975098, "learning_rate": 8.210526315789474e-07, "loss": 0.3388, "step": 296 }, { "Batch Mean": 0.2232537567615509, "accuracy": 0.890625, "epoch": 0.74, "step": 296 }, { "epoch": 0.7425, "grad_norm": 9.414478302001953, "learning_rate": 8.131578947368422e-07, "loss": 0.3035, "step": 297 }, { "Batch Mean": -0.037161171436309814, "accuracy": 0.828125, "epoch": 0.7425, "step": 297 }, { "epoch": 0.745, "grad_norm": 9.156777381896973, "learning_rate": 8.052631578947369e-07, "loss": 0.3558, "step": 298 }, { "Batch Mean": 0.2239832878112793, "accuracy": 0.84375, "epoch": 0.745, "step": 298 }, { "epoch": 0.7475, "grad_norm": 9.189536094665527, "learning_rate": 7.973684210526315e-07, "loss": 0.3335, "step": 299 }, { "Batch Mean": 0.21716433763504028, "accuracy": 0.8671875, "epoch": 0.7475, "step": 299 }, { "epoch": 0.75, "grad_norm": 9.07275104522705, "learning_rate": 7.894736842105263e-07, "loss": 0.3226, "step": 300 } ], "logging_steps": 1, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }