{
  "best_metric": 0.6652334928512573,
  "best_model_checkpoint": "/l/users/visionlanguage/mostafa_ciai/hf_checkpoints_code_ciai_gemma2/checkpoint-1700",
  "epoch": 5.994075260208167,
  "eval_steps": 50,
  "global_step": 1752,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006832132372564718,
      "grad_norm": 93.82548522949219,
      "learning_rate": 2.777777777777778e-06,
      "loss": 208.4052,
      "step": 2
    },
    {
      "epoch": 0.013664264745129436,
      "grad_norm": 65.51689147949219,
      "learning_rate": 5.555555555555556e-06,
      "loss": 194.4831,
      "step": 4
    },
    {
      "epoch": 0.020496397117694156,
      "grad_norm": 30.816993713378906,
      "learning_rate": 8.333333333333334e-06,
      "loss": 159.6516,
      "step": 6
    },
    {
      "epoch": 0.027328529490258872,
      "grad_norm": 30.113662719726562,
      "learning_rate": 1.1111111111111112e-05,
      "loss": 145.5557,
      "step": 8
    },
    {
      "epoch": 0.03416066186282359,
      "grad_norm": 22.37295150756836,
      "learning_rate": 1.388888888888889e-05,
      "loss": 128.5444,
      "step": 10
    },
    {
      "epoch": 0.04099279423538831,
      "grad_norm": 22.287870407104492,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 116.2723,
      "step": 12
    },
    {
      "epoch": 0.04782492660795303,
      "grad_norm": 16.027904510498047,
      "learning_rate": 1.9444444444444445e-05,
      "loss": 107.5451,
      "step": 14
    },
    {
      "epoch": 0.054657058980517745,
      "grad_norm": 17.97212791442871,
      "learning_rate": 2.2222222222222223e-05,
      "loss": 100.7136,
      "step": 16
    },
    {
      "epoch": 0.061489191353082465,
      "grad_norm": 15.427449226379395,
      "learning_rate": 2.5e-05,
      "loss": 96.4422,
      "step": 18
    },
    {
      "epoch": 0.06832132372564718,
      "grad_norm": 11.836018562316895,
      "learning_rate": 2.777777777777778e-05,
      "loss": 89.9874,
      "step": 20
    },
    {
      "epoch": 0.0751534560982119,
      "grad_norm": 13.170073509216309,
      "learning_rate": 3.055555555555556e-05,
      "loss": 90.5263,
      "step": 22
    },
    {
      "epoch": 0.08198558847077662,
      "grad_norm": 12.781464576721191,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 87.3144,
      "step": 24
    },
    {
      "epoch": 0.08881772084334134,
      "grad_norm": 11.460458755493164,
      "learning_rate": 3.611111111111111e-05,
      "loss": 85.6209,
      "step": 26
    },
    {
      "epoch": 0.09564985321590606,
      "grad_norm": 10.382000923156738,
      "learning_rate": 3.888888888888889e-05,
      "loss": 88.2803,
      "step": 28
    },
    {
      "epoch": 0.10248198558847077,
      "grad_norm": 10.578895568847656,
      "learning_rate": 4.166666666666667e-05,
      "loss": 80.589,
      "step": 30
    },
    {
      "epoch": 0.10931411796103549,
      "grad_norm": 10.231274604797363,
      "learning_rate": 4.4444444444444447e-05,
      "loss": 83.0791,
      "step": 32
    },
    {
      "epoch": 0.11614625033360021,
      "grad_norm": 13.121459007263184,
      "learning_rate": 4.722222222222222e-05,
      "loss": 81.0775,
      "step": 34
    },
    {
      "epoch": 0.12297838270616493,
      "grad_norm": 11.594988822937012,
      "learning_rate": 5e-05,
      "loss": 79.3985,
      "step": 36
    },
    {
      "epoch": 0.12981051507872965,
      "grad_norm": 10.554534912109375,
      "learning_rate": 4.9999832415172185e-05,
      "loss": 78.9732,
      "step": 38
    },
    {
      "epoch": 0.13664264745129437,
      "grad_norm": 9.661481857299805,
      "learning_rate": 4.9999329662935534e-05,
      "loss": 77.5229,
      "step": 40
    },
    {
      "epoch": 0.1434747798238591,
      "grad_norm": 11.10251235961914,
      "learning_rate": 4.9998491750030315e-05,
      "loss": 77.7747,
      "step": 42
    },
    {
      "epoch": 0.1503069121964238,
      "grad_norm": 9.058899879455566,
      "learning_rate": 4.999731868769027e-05,
      "loss": 79.2141,
      "step": 44
    },
    {
      "epoch": 0.15713904456898853,
      "grad_norm": 9.254643440246582,
      "learning_rate": 4.999581049164237e-05,
      "loss": 77.5962,
      "step": 46
    },
    {
      "epoch": 0.16397117694155325,
      "grad_norm": 10.37578010559082,
      "learning_rate": 4.99939671821067e-05,
      "loss": 76.6356,
      "step": 48
    },
    {
      "epoch": 0.17080330931411797,
      "grad_norm": 9.983922004699707,
      "learning_rate": 4.999178878379611e-05,
      "loss": 76.0763,
      "step": 50
    },
    {
      "epoch": 0.17080330931411797,
      "eval_loss": 1.20554518699646,
      "eval_runtime": 119.3115,
      "eval_samples_per_second": 33.065,
      "eval_steps_per_second": 8.272,
      "step": 50
    },
    {
      "epoch": 0.1776354416866827,
      "grad_norm": 9.109485626220703,
      "learning_rate": 4.998927532591592e-05,
      "loss": 75.2524,
      "step": 52
    },
    {
      "epoch": 0.1844675740592474,
      "grad_norm": 8.939992904663086,
      "learning_rate": 4.9986426842163515e-05,
      "loss": 75.8614,
      "step": 54
    },
    {
      "epoch": 0.19129970643181213,
      "grad_norm": 8.342733383178711,
      "learning_rate": 4.9983243370727914e-05,
      "loss": 72.864,
      "step": 56
    },
    {
      "epoch": 0.19813183880437685,
      "grad_norm": 7.625518321990967,
      "learning_rate": 4.9979724954289244e-05,
      "loss": 75.7165,
      "step": 58
    },
    {
      "epoch": 0.20496397117694154,
      "grad_norm": 6.545467853546143,
      "learning_rate": 4.9975871640018154e-05,
      "loss": 72.337,
      "step": 60
    },
    {
      "epoch": 0.21179610354950626,
      "grad_norm": 8.73936939239502,
      "learning_rate": 4.99716834795752e-05,
      "loss": 73.0804,
      "step": 62
    },
    {
      "epoch": 0.21862823592207098,
      "grad_norm": 7.599481105804443,
      "learning_rate": 4.996716052911017e-05,
      "loss": 71.3494,
      "step": 64
    },
    {
      "epoch": 0.2254603682946357,
      "grad_norm": 8.88508415222168,
      "learning_rate": 4.996230284926128e-05,
      "loss": 73.4886,
      "step": 66
    },
    {
      "epoch": 0.23229250066720042,
      "grad_norm": 7.141696453094482,
      "learning_rate": 4.99571105051544e-05,
      "loss": 73.0934,
      "step": 68
    },
    {
      "epoch": 0.23912463303976514,
      "grad_norm": 8.946745872497559,
      "learning_rate": 4.99515835664022e-05,
      "loss": 70.5761,
      "step": 70
    },
    {
      "epoch": 0.24595676541232986,
      "grad_norm": 7.428682804107666,
      "learning_rate": 4.994572210710315e-05,
      "loss": 69.8488,
      "step": 72
    },
    {
      "epoch": 0.2527888977848946,
      "grad_norm": 10.490913391113281,
      "learning_rate": 4.993952620584058e-05,
      "loss": 72.1602,
      "step": 74
    },
    {
      "epoch": 0.2596210301574593,
      "grad_norm": 6.010617733001709,
      "learning_rate": 4.993299594568163e-05,
      "loss": 70.0962,
      "step": 76
    },
    {
      "epoch": 0.26645316253002405,
      "grad_norm": 5.207183361053467,
      "learning_rate": 4.992613141417608e-05,
      "loss": 70.6436,
      "step": 78
    },
    {
      "epoch": 0.27328529490258874,
      "grad_norm": 7.816757678985596,
      "learning_rate": 4.9918932703355256e-05,
      "loss": 68.9464,
      "step": 80
    },
    {
      "epoch": 0.28011742727515343,
      "grad_norm": 6.2263383865356445,
      "learning_rate": 4.9911399909730714e-05,
      "loss": 68.8249,
      "step": 82
    },
    {
      "epoch": 0.2869495596477182,
      "grad_norm": 6.726258754730225,
      "learning_rate": 4.990353313429303e-05,
      "loss": 68.7637,
      "step": 84
    },
    {
      "epoch": 0.29378169202028287,
      "grad_norm": 5.4038543701171875,
      "learning_rate": 4.989533248251037e-05,
      "loss": 68.7726,
      "step": 86
    },
    {
      "epoch": 0.3006138243928476,
      "grad_norm": 9.256815910339355,
      "learning_rate": 4.988679806432712e-05,
      "loss": 68.2967,
      "step": 88
    },
    {
      "epoch": 0.3074459567654123,
      "grad_norm": 7.765486717224121,
      "learning_rate": 4.98779299941624e-05,
      "loss": 70.6181,
      "step": 90
    },
    {
      "epoch": 0.31427808913797706,
      "grad_norm": 7.625786304473877,
      "learning_rate": 4.9868728390908526e-05,
      "loss": 68.5738,
      "step": 92
    },
    {
      "epoch": 0.32111022151054175,
      "grad_norm": 7.776100158691406,
      "learning_rate": 4.985919337792944e-05,
      "loss": 65.0074,
      "step": 94
    },
    {
      "epoch": 0.3279423538831065,
      "grad_norm": 6.496335029602051,
      "learning_rate": 4.9849325083059e-05,
      "loss": 66.7343,
      "step": 96
    },
    {
      "epoch": 0.3347744862556712,
      "grad_norm": 6.616697311401367,
      "learning_rate": 4.983912363859935e-05,
      "loss": 69.292,
      "step": 98
    },
    {
      "epoch": 0.34160661862823594,
      "grad_norm": 7.259242057800293,
      "learning_rate": 4.982858918131906e-05,
      "loss": 66.8941,
      "step": 100
    },
    {
      "epoch": 0.34160661862823594,
      "eval_loss": 1.0700218677520752,
      "eval_runtime": 119.6843,
      "eval_samples_per_second": 32.962,
      "eval_steps_per_second": 8.247,
      "step": 100
    },
    {
      "epoch": 0.34843875100080063,
      "grad_norm": 7.206521987915039,
      "learning_rate": 4.981772185245135e-05,
      "loss": 68.3145,
      "step": 102
    },
    {
      "epoch": 0.3552708833733654,
      "grad_norm": 6.332549095153809,
      "learning_rate": 4.980652179769218e-05,
      "loss": 67.5062,
      "step": 104
    },
    {
      "epoch": 0.36210301574593007,
      "grad_norm": 8.422966957092285,
      "learning_rate": 4.979498916719828e-05,
      "loss": 69.0426,
      "step": 106
    },
    {
      "epoch": 0.3689351481184948,
      "grad_norm": 4.5074357986450195,
      "learning_rate": 4.978312411558518e-05,
      "loss": 66.0764,
      "step": 108
    },
    {
      "epoch": 0.3757672804910595,
      "grad_norm": 6.847994327545166,
      "learning_rate": 4.977092680192507e-05,
      "loss": 68.0597,
      "step": 110
    },
    {
      "epoch": 0.38259941286362426,
      "grad_norm": 9.010295867919922,
      "learning_rate": 4.9758397389744734e-05,
      "loss": 66.7856,
      "step": 112
    },
    {
      "epoch": 0.38943154523618895,
      "grad_norm": 8.793087005615234,
      "learning_rate": 4.9745536047023324e-05,
      "loss": 66.6415,
      "step": 114
    },
    {
      "epoch": 0.3962636776087537,
      "grad_norm": 6.820159912109375,
      "learning_rate": 4.973234294619011e-05,
      "loss": 66.8668,
      "step": 116
    },
    {
      "epoch": 0.4030958099813184,
      "grad_norm": 10.739355087280273,
      "learning_rate": 4.971881826412218e-05,
      "loss": 64.5842,
      "step": 118
    },
    {
      "epoch": 0.4099279423538831,
      "grad_norm": 6.451905727386475,
      "learning_rate": 4.9704962182142044e-05,
      "loss": 64.2948,
      "step": 120
    },
    {
      "epoch": 0.4167600747264478,
      "grad_norm": 6.998046398162842,
      "learning_rate": 4.9690774886015244e-05,
      "loss": 66.095,
      "step": 122
    },
    {
      "epoch": 0.4235922070990125,
      "grad_norm": 6.946700096130371,
      "learning_rate": 4.967625656594782e-05,
      "loss": 66.6205,
      "step": 124
    },
    {
      "epoch": 0.43042433947157727,
      "grad_norm": 7.656089782714844,
      "learning_rate": 4.966140741658379e-05,
      "loss": 65.2253,
      "step": 126
    },
    {
      "epoch": 0.43725647184414196,
      "grad_norm": 8.242254257202148,
      "learning_rate": 4.9646227637002515e-05,
      "loss": 65.4466,
      "step": 128
    },
    {
      "epoch": 0.4440886042167067,
      "grad_norm": 6.5599894523620605,
      "learning_rate": 4.963071743071607e-05,
      "loss": 64.5302,
      "step": 130
    },
    {
      "epoch": 0.4509207365892714,
      "grad_norm": 5.671536922454834,
      "learning_rate": 4.961487700566646e-05,
      "loss": 64.9711,
      "step": 132
    },
    {
      "epoch": 0.45775286896183615,
      "grad_norm": 6.317226886749268,
      "learning_rate": 4.9598706574222886e-05,
      "loss": 66.1428,
      "step": 134
    },
    {
      "epoch": 0.46458500133440084,
      "grad_norm": 7.731470584869385,
      "learning_rate": 4.958220635317886e-05,
      "loss": 65.6398,
      "step": 136
    },
    {
      "epoch": 0.4714171337069656,
      "grad_norm": 7.070956230163574,
      "learning_rate": 4.956537656374933e-05,
      "loss": 64.027,
      "step": 138
    },
    {
      "epoch": 0.4782492660795303,
      "grad_norm": 5.216205596923828,
      "learning_rate": 4.9548217431567665e-05,
      "loss": 64.9929,
      "step": 140
    },
    {
      "epoch": 0.485081398452095,
      "grad_norm": 6.5882344245910645,
      "learning_rate": 4.95307291866827e-05,
      "loss": 66.2789,
      "step": 142
    },
    {
      "epoch": 0.4919135308246597,
      "grad_norm": 5.5962934494018555,
      "learning_rate": 4.95129120635556e-05,
      "loss": 65.4516,
      "step": 144
    },
    {
      "epoch": 0.49874566319722446,
      "grad_norm": 7.341054916381836,
      "learning_rate": 4.949476630105669e-05,
      "loss": 64.339,
      "step": 146
    },
    {
      "epoch": 0.5055777955697892,
      "grad_norm": 7.5083441734313965,
      "learning_rate": 4.9476292142462374e-05,
      "loss": 62.7076,
      "step": 148
    },
    {
      "epoch": 0.5124099279423538,
      "grad_norm": 5.081834316253662,
      "learning_rate": 4.945748983545172e-05,
      "loss": 64.2066,
      "step": 150
    },
    {
      "epoch": 0.5124099279423538,
      "eval_loss": 0.9920685291290283,
      "eval_runtime": 120.1858,
      "eval_samples_per_second": 32.824,
      "eval_steps_per_second": 8.212,
      "step": 150
    },
    {
      "epoch": 0.5192420603149186,
      "grad_norm": 6.279696464538574,
      "learning_rate": 4.943835963210324e-05,
      "loss": 63.3412,
      "step": 152
    },
    {
      "epoch": 0.5260741926874833,
      "grad_norm": 6.806802749633789,
      "learning_rate": 4.941890178889149e-05,
      "loss": 63.2038,
      "step": 154
    },
    {
      "epoch": 0.5329063250600481,
      "grad_norm": 8.012312889099121,
      "learning_rate": 4.939911656668361e-05,
      "loss": 63.4725,
      "step": 156
    },
    {
      "epoch": 0.5397384574326127,
      "grad_norm": 6.68613338470459,
      "learning_rate": 4.937900423073585e-05,
      "loss": 62.8267,
      "step": 158
    },
    {
      "epoch": 0.5465705898051775,
      "grad_norm": 6.391062259674072,
      "learning_rate": 4.9358565050689985e-05,
      "loss": 63.4099,
      "step": 160
    },
    {
      "epoch": 0.5534027221777422,
      "grad_norm": 6.4117817878723145,
      "learning_rate": 4.933779930056975e-05,
      "loss": 62.475,
      "step": 162
    },
    {
      "epoch": 0.5602348545503069,
      "grad_norm": 10.238900184631348,
      "learning_rate": 4.93167072587771e-05,
      "loss": 62.3929,
      "step": 164
    },
    {
      "epoch": 0.5670669869228716,
      "grad_norm": 6.800478935241699,
      "learning_rate": 4.929528920808854e-05,
      "loss": 63.4465,
      "step": 166
    },
    {
      "epoch": 0.5738991192954364,
      "grad_norm": 6.688059329986572,
      "learning_rate": 4.92735454356513e-05,
      "loss": 62.3017,
      "step": 168
    },
    {
      "epoch": 0.5807312516680011,
      "grad_norm": 5.010741710662842,
      "learning_rate": 4.925147623297949e-05,
      "loss": 61.5306,
      "step": 170
    },
    {
      "epoch": 0.5875633840405657,
      "grad_norm": 6.061219215393066,
      "learning_rate": 4.922908189595018e-05,
      "loss": 63.5529,
      "step": 172
    },
    {
      "epoch": 0.5943955164131305,
      "grad_norm": 7.6835126876831055,
      "learning_rate": 4.920636272479946e-05,
      "loss": 64.4077,
      "step": 174
    },
    {
      "epoch": 0.6012276487856952,
      "grad_norm": 5.945671558380127,
      "learning_rate": 4.9183319024118415e-05,
      "loss": 64.3411,
      "step": 176
    },
    {
      "epoch": 0.60805978115826,
      "grad_norm": 4.983694076538086,
      "learning_rate": 4.915995110284901e-05,
      "loss": 63.5529,
      "step": 178
    },
    {
      "epoch": 0.6148919135308246,
      "grad_norm": 5.736062049865723,
      "learning_rate": 4.9136259274279955e-05,
      "loss": 63.7282,
      "step": 180
    },
    {
      "epoch": 0.6217240459033894,
      "grad_norm": 6.8453545570373535,
      "learning_rate": 4.911224385604255e-05,
      "loss": 63.5027,
      "step": 182
    },
    {
      "epoch": 0.6285561782759541,
      "grad_norm": 5.9253668785095215,
      "learning_rate": 4.908790517010636e-05,
      "loss": 60.5142,
      "step": 184
    },
    {
      "epoch": 0.6353883106485189,
      "grad_norm": 5.743585586547852,
      "learning_rate": 4.906324354277495e-05,
      "loss": 62.4935,
      "step": 186
    },
    {
      "epoch": 0.6422204430210835,
      "grad_norm": 4.686921119689941,
      "learning_rate": 4.903825930468149e-05,
      "loss": 60.8045,
      "step": 188
    },
    {
      "epoch": 0.6490525753936482,
      "grad_norm": 5.350888729095459,
      "learning_rate": 4.901295279078431e-05,
      "loss": 62.3775,
      "step": 190
    },
    {
      "epoch": 0.655884707766213,
      "grad_norm": 5.417562961578369,
      "learning_rate": 4.898732434036244e-05,
      "loss": 60.1095,
      "step": 192
    },
    {
      "epoch": 0.6627168401387777,
      "grad_norm": 5.238453388214111,
      "learning_rate": 4.896137429701102e-05,
      "loss": 62.8943,
      "step": 194
    },
    {
      "epoch": 0.6695489725113424,
      "grad_norm": 6.252527713775635,
      "learning_rate": 4.893510300863676e-05,
      "loss": 61.1666,
      "step": 196
    },
    {
      "epoch": 0.6763811048839071,
      "grad_norm": 5.860842704772949,
      "learning_rate": 4.890851082745319e-05,
      "loss": 62.6643,
      "step": 198
    },
    {
      "epoch": 0.6832132372564719,
      "grad_norm": 6.3946099281311035,
      "learning_rate": 4.8881598109976004e-05,
      "loss": 61.939,
      "step": 200
    },
    {
      "epoch": 0.6832132372564719,
      "eval_loss": 0.9664058685302734,
      "eval_runtime": 119.3157,
      "eval_samples_per_second": 33.064,
      "eval_steps_per_second": 8.272,
      "step": 200
    },
    {
      "epoch": 0.6900453696290365,
      "grad_norm": 5.909948825836182,
      "learning_rate": 4.885436521701824e-05,
      "loss": 63.9172,
      "step": 202
    },
    {
      "epoch": 0.6968775020016013,
      "grad_norm": 6.600235462188721,
      "learning_rate": 4.8826812513685487e-05,
      "loss": 60.6396,
      "step": 204
    },
    {
      "epoch": 0.703709634374166,
      "grad_norm": 5.97224235534668,
      "learning_rate": 4.8798940369370944e-05,
      "loss": 61.1365,
      "step": 206
    },
    {
      "epoch": 0.7105417667467308,
      "grad_norm": 5.521954536437988,
      "learning_rate": 4.877074915775049e-05,
      "loss": 61.9178,
      "step": 208
    },
    {
      "epoch": 0.7173738991192954,
      "grad_norm": 4.756962299346924,
      "learning_rate": 4.8742239256777674e-05,
      "loss": 60.0003,
      "step": 210
    },
    {
      "epoch": 0.7242060314918601,
      "grad_norm": 7.966216564178467,
      "learning_rate": 4.8713411048678635e-05,
      "loss": 60.3937,
      "step": 212
    },
    {
      "epoch": 0.7310381638644249,
      "grad_norm": 5.864863872528076,
      "learning_rate": 4.868426491994702e-05,
      "loss": 60.5208,
      "step": 214
    },
    {
      "epoch": 0.7378702962369896,
      "grad_norm": 4.952422142028809,
      "learning_rate": 4.865480126133872e-05,
      "loss": 61.4458,
      "step": 216
    },
    {
      "epoch": 0.7447024286095543,
      "grad_norm": 4.522135257720947,
      "learning_rate": 4.862502046786671e-05,
      "loss": 62.5035,
      "step": 218
    },
    {
      "epoch": 0.751534560982119,
      "grad_norm": 4.29464054107666,
      "learning_rate": 4.859492293879574e-05,
      "loss": 61.5825,
      "step": 220
    },
    {
      "epoch": 0.7583666933546838,
      "grad_norm": 5.789974212646484,
      "learning_rate": 4.856450907763693e-05,
      "loss": 59.9352,
      "step": 222
    },
    {
      "epoch": 0.7651988257272485,
      "grad_norm": 6.44216251373291,
      "learning_rate": 4.853377929214243e-05,
      "loss": 59.1637,
      "step": 224
    },
    {
      "epoch": 0.7720309580998131,
      "grad_norm": 4.520390033721924,
      "learning_rate": 4.85027339942999e-05,
      "loss": 60.4813,
      "step": 226
    },
    {
      "epoch": 0.7788630904723779,
      "grad_norm": 6.058870315551758,
      "learning_rate": 4.8471373600326996e-05,
      "loss": 60.2968,
      "step": 228
    },
    {
      "epoch": 0.7856952228449426,
      "grad_norm": 5.945502281188965,
      "learning_rate": 4.843969853066584e-05,
      "loss": 58.2098,
      "step": 230
    },
    {
      "epoch": 0.7925273552175074,
      "grad_norm": 4.318876266479492,
      "learning_rate": 4.8407709209977305e-05,
      "loss": 58.4711,
      "step": 232
    },
    {
      "epoch": 0.799359487590072,
      "grad_norm": 5.385821342468262,
      "learning_rate": 4.837540606713538e-05,
      "loss": 59.5379,
      "step": 234
    },
    {
      "epoch": 0.8061916199626368,
      "grad_norm": 6.59214973449707,
      "learning_rate": 4.834278953522138e-05,
      "loss": 58.4163,
      "step": 236
    },
    {
      "epoch": 0.8130237523352015,
      "grad_norm": 5.087238311767578,
      "learning_rate": 4.8309860051518204e-05,
      "loss": 60.5546,
      "step": 238
    },
    {
      "epoch": 0.8198558847077662,
      "grad_norm": 6.804642200469971,
      "learning_rate": 4.8276618057504376e-05,
      "loss": 59.0874,
      "step": 240
    },
    {
      "epoch": 0.8266880170803309,
      "grad_norm": 5.035391330718994,
      "learning_rate": 4.824306399884822e-05,
      "loss": 59.9545,
      "step": 242
    },
    {
      "epoch": 0.8335201494528957,
      "grad_norm": 5.837290287017822,
      "learning_rate": 4.8209198325401815e-05,
      "loss": 59.5963,
      "step": 244
    },
    {
      "epoch": 0.8403522818254604,
      "grad_norm": 4.17293643951416,
      "learning_rate": 4.817502149119502e-05,
      "loss": 59.7065,
      "step": 246
    },
    {
      "epoch": 0.847184414198025,
      "grad_norm": 4.964944362640381,
      "learning_rate": 4.8140533954429327e-05,
      "loss": 59.5358,
      "step": 248
    },
    {
      "epoch": 0.8540165465705898,
      "grad_norm": 6.021297931671143,
      "learning_rate": 4.810573617747178e-05,
      "loss": 60.6391,
      "step": 250
    },
    {
      "epoch": 0.8540165465705898,
      "eval_loss": 0.9407148361206055,
      "eval_runtime": 119.9595,
      "eval_samples_per_second": 32.886,
      "eval_steps_per_second": 8.228,
      "step": 250
    },
    {
      "epoch": 0.8608486789431545,
      "grad_norm": 5.707021713256836,
      "learning_rate": 4.8070628626848735e-05,
      "loss": 61.5872,
      "step": 252
    },
    {
      "epoch": 0.8676808113157193,
      "grad_norm": 4.725375652313232,
      "learning_rate": 4.803521177323962e-05,
      "loss": 59.2192,
      "step": 254
    },
    {
      "epoch": 0.8745129436882839,
      "grad_norm": 23.445714950561523,
      "learning_rate": 4.799948609147061e-05,
      "loss": 60.1762,
      "step": 256
    },
    {
      "epoch": 0.8813450760608487,
      "grad_norm": 5.503020286560059,
      "learning_rate": 4.796345206050829e-05,
      "loss": 62.2226,
      "step": 258
    },
    {
      "epoch": 0.8881772084334134,
      "grad_norm": 6.558228015899658,
      "learning_rate": 4.792711016345321e-05,
      "loss": 62.089,
      "step": 260
    },
    {
      "epoch": 0.8950093408059782,
      "grad_norm": 8.109895706176758,
      "learning_rate": 4.7890460887533417e-05,
      "loss": 60.7872,
      "step": 262
    },
    {
      "epoch": 0.9018414731785428,
      "grad_norm": 5.230234622955322,
      "learning_rate": 4.785350472409792e-05,
      "loss": 57.9312,
      "step": 264
    },
    {
      "epoch": 0.9086736055511075,
      "grad_norm": 6.669562339782715,
      "learning_rate": 4.7816242168610093e-05,
      "loss": 61.7966,
      "step": 266
    },
    {
      "epoch": 0.9155057379236723,
      "grad_norm": 5.428192615509033,
      "learning_rate": 4.777867372064105e-05,
      "loss": 58.4551,
      "step": 268
    },
    {
      "epoch": 0.922337870296237,
      "grad_norm": 5.6168131828308105,
      "learning_rate": 4.774079988386296e-05,
      "loss": 59.9015,
      "step": 270
    },
    {
      "epoch": 0.9291700026688017,
      "grad_norm": 5.785460948944092,
      "learning_rate": 4.770262116604224e-05,
      "loss": 59.723,
      "step": 272
    },
    {
      "epoch": 0.9360021350413664,
      "grad_norm": 8.77035140991211,
      "learning_rate": 4.76641380790328e-05,
      "loss": 60.8996,
      "step": 274
    },
    {
      "epoch": 0.9428342674139312,
      "grad_norm": 4.000178813934326,
      "learning_rate": 4.762535113876917e-05,
      "loss": 59.2908,
      "step": 276
    },
    {
      "epoch": 0.9496663997864959,
      "grad_norm": 5.8565826416015625,
      "learning_rate": 4.758626086525956e-05,
      "loss": 59.296,
      "step": 278
    },
    {
      "epoch": 0.9564985321590606,
      "grad_norm": 6.792466163635254,
      "learning_rate": 4.754686778257891e-05,
      "loss": 58.351,
      "step": 280
    },
    {
      "epoch": 0.9633306645316253,
      "grad_norm": 6.484628677368164,
      "learning_rate": 4.750717241886185e-05,
      "loss": 58.46,
      "step": 282
    },
    {
      "epoch": 0.97016279690419,
      "grad_norm": 5.421430587768555,
      "learning_rate": 4.7467175306295655e-05,
      "loss": 59.0205,
      "step": 284
    },
    {
      "epoch": 0.9769949292767547,
      "grad_norm": 4.550335884094238,
      "learning_rate": 4.7426876981113044e-05,
      "loss": 60.8234,
      "step": 286
    },
    {
      "epoch": 0.9838270616493194,
      "grad_norm": 5.412383079528809,
      "learning_rate": 4.738627798358506e-05,
      "loss": 57.3651,
      "step": 288
    },
    {
      "epoch": 0.9906591940218842,
      "grad_norm": 5.225856781005859,
      "learning_rate": 4.7345378858013776e-05,
      "loss": 58.8522,
      "step": 290
    },
    {
      "epoch": 0.9974913263944489,
      "grad_norm": 3.856189250946045,
      "learning_rate": 4.730418015272503e-05,
      "loss": 59.7945,
      "step": 292
    },
    {
      "epoch": 1.0034160661862823,
      "grad_norm": 6.19010066986084,
      "learning_rate": 4.726268242006106e-05,
      "loss": 50.2722,
      "step": 294
    },
    {
      "epoch": 1.0102481985588472,
      "grad_norm": 5.333181858062744,
      "learning_rate": 4.722088621637309e-05,
      "loss": 58.7285,
      "step": 296
    },
    {
      "epoch": 1.0170803309314118,
      "grad_norm": 5.93973970413208,
      "learning_rate": 4.717879210201389e-05,
      "loss": 57.2823,
      "step": 298
    },
    {
      "epoch": 1.0239124633039765,
      "grad_norm": 4.59360408782959,
      "learning_rate": 4.713640064133025e-05,
      "loss": 58.4687,
      "step": 300
    },
    {
      "epoch": 1.0239124633039765,
      "eval_loss": 0.9195547699928284,
      "eval_runtime": 119.3076,
      "eval_samples_per_second": 33.066,
      "eval_steps_per_second": 8.273,
      "step": 300
    },
    {
      "epoch": 1.0307445956765413,
      "grad_norm": 5.437332630157471,
      "learning_rate": 4.7093712402655427e-05,
      "loss": 57.7491,
      "step": 302
    },
    {
      "epoch": 1.037576728049106,
      "grad_norm": 4.938009738922119,
      "learning_rate": 4.7050727958301506e-05,
      "loss": 58.2642,
      "step": 304
    },
    {
      "epoch": 1.0444088604216706,
      "grad_norm": 5.104777812957764,
      "learning_rate": 4.7007447884551745e-05,
      "loss": 56.1312,
      "step": 306
    },
    {
      "epoch": 1.0512409927942354,
      "grad_norm": 5.78248405456543,
      "learning_rate": 4.6963872761652835e-05,
      "loss": 56.9488,
      "step": 308
    },
    {
      "epoch": 1.0580731251668,
      "grad_norm": 4.8224287033081055,
      "learning_rate": 4.692000317380715e-05,
      "loss": 56.6993,
      "step": 310
    },
    {
      "epoch": 1.064905257539365,
      "grad_norm": 4.517540454864502,
      "learning_rate": 4.687583970916487e-05,
      "loss": 58.8636,
      "step": 312
    },
    {
      "epoch": 1.0717373899119296,
      "grad_norm": 5.353949069976807,
      "learning_rate": 4.683138295981611e-05,
      "loss": 58.6762,
      "step": 314
    },
    {
      "epoch": 1.0785695222844942,
      "grad_norm": 6.164919376373291,
      "learning_rate": 4.678663352178301e-05,
      "loss": 57.9218,
      "step": 316
    },
    {
      "epoch": 1.085401654657059,
      "grad_norm": 4.577470302581787,
      "learning_rate": 4.674159199501173e-05,
      "loss": 58.1644,
      "step": 318
    },
    {
      "epoch": 1.0922337870296237,
      "grad_norm": 6.5861592292785645,
      "learning_rate": 4.6696258983364385e-05,
      "loss": 57.3447,
      "step": 320
    },
    {
      "epoch": 1.0990659194021883,
      "grad_norm": 4.327467918395996,
      "learning_rate": 4.665063509461097e-05,
      "loss": 57.2627,
      "step": 322
    },
    {
      "epoch": 1.1058980517747532,
      "grad_norm": 7.534716606140137,
      "learning_rate": 4.660472094042121e-05,
      "loss": 57.2099,
      "step": 324
    },
    {
      "epoch": 1.1127301841473178,
      "grad_norm": 5.549008369445801,
      "learning_rate": 4.655851713635635e-05,
      "loss": 58.4564,
      "step": 326
    },
    {
      "epoch": 1.1195623165198825,
      "grad_norm": 4.385070323944092,
      "learning_rate": 4.651202430186092e-05,
      "loss": 57.0019,
      "step": 328
    },
    {
      "epoch": 1.1263944488924473,
      "grad_norm": 4.763044357299805,
      "learning_rate": 4.6465243060254415e-05,
      "loss": 55.7849,
      "step": 330
    },
    {
      "epoch": 1.133226581265012,
      "grad_norm": 3.9461379051208496,
      "learning_rate": 4.641817403872293e-05,
      "loss": 56.2399,
      "step": 332
    },
    {
      "epoch": 1.1400587136375768,
      "grad_norm": 4.946137428283691,
      "learning_rate": 4.637081786831079e-05,
      "loss": 56.7089,
      "step": 334
    },
    {
      "epoch": 1.1468908460101415,
      "grad_norm": 5.664731025695801,
      "learning_rate": 4.6323175183912024e-05,
      "loss": 57.1022,
      "step": 336
    },
    {
      "epoch": 1.153722978382706,
      "grad_norm": 5.261230945587158,
      "learning_rate": 4.627524662426194e-05,
      "loss": 56.3552,
      "step": 338
    },
    {
      "epoch": 1.160555110755271,
      "grad_norm": 4.166741847991943,
      "learning_rate": 4.6227032831928484e-05,
      "loss": 56.888,
      "step": 340
    },
    {
      "epoch": 1.1673872431278356,
      "grad_norm": 6.015218734741211,
      "learning_rate": 4.6178534453303666e-05,
      "loss": 57.3006,
      "step": 342
    },
    {
      "epoch": 1.1742193755004002,
      "grad_norm": 6.349710941314697,
      "learning_rate": 4.6129752138594874e-05,
      "loss": 57.0208,
      "step": 344
    },
    {
      "epoch": 1.181051507872965,
      "grad_norm": 5.403022766113281,
      "learning_rate": 4.608068654181617e-05,
      "loss": 57.0645,
      "step": 346
    },
    {
      "epoch": 1.1878836402455297,
      "grad_norm": 6.523670673370361,
      "learning_rate": 4.6031338320779534e-05,
      "loss": 58.2164,
      "step": 348
    },
    {
      "epoch": 1.1947157726180944,
      "grad_norm": 6.369359970092773,
      "learning_rate": 4.5981708137086e-05,
      "loss": 56.7965,
      "step": 350
    },
    {
      "epoch": 1.1947157726180944,
      "eval_loss": 0.8986765146255493,
      "eval_runtime": 119.0222,
      "eval_samples_per_second": 33.145,
      "eval_steps_per_second": 8.293,
      "step": 350
    },
    {
      "epoch": 1.2015479049906592,
      "grad_norm": 5.050749778747559,
      "learning_rate": 4.5931796656116846e-05,
      "loss": 56.7828,
      "step": 352
    },
    {
      "epoch": 1.2083800373632239,
      "grad_norm": 5.341484069824219,
      "learning_rate": 4.588160454702462e-05,
      "loss": 57.4058,
      "step": 354
    },
    {
      "epoch": 1.2152121697357887,
      "grad_norm": 4.554074287414551,
      "learning_rate": 4.5831132482724195e-05,
      "loss": 57.6257,
      "step": 356
    },
    {
      "epoch": 1.2220443021083534,
      "grad_norm": 4.951889514923096,
      "learning_rate": 4.578038113988376e-05,
      "loss": 56.0608,
      "step": 358
    },
    {
      "epoch": 1.228876434480918,
      "grad_norm": 4.2526421546936035,
      "learning_rate": 4.572935119891571e-05,
      "loss": 55.8586,
      "step": 360
    },
    {
      "epoch": 1.2357085668534828,
      "grad_norm": 4.805353164672852,
      "learning_rate": 4.5678043343967554e-05,
      "loss": 59.2427,
      "step": 362
    },
    {
      "epoch": 1.2425406992260475,
      "grad_norm": 4.9927978515625,
      "learning_rate": 4.5626458262912745e-05,
      "loss": 55.1494,
      "step": 364
    },
    {
      "epoch": 1.2493728315986123,
      "grad_norm": 5.778275012969971,
      "learning_rate": 4.557459664734141e-05,
      "loss": 55.9791,
      "step": 366
    },
    {
      "epoch": 1.256204963971177,
      "grad_norm": 4.41555643081665,
      "learning_rate": 4.552245919255117e-05,
      "loss": 57.3123,
      "step": 368
    },
    {
      "epoch": 1.2630370963437416,
      "grad_norm": 5.230330944061279,
      "learning_rate": 4.5470046597537735e-05,
      "loss": 55.9031,
      "step": 370
    },
    {
      "epoch": 1.2698692287163063,
      "grad_norm": 3.9548189640045166,
      "learning_rate": 4.541735956498554e-05,
      "loss": 56.6997,
      "step": 372
    },
    {
      "epoch": 1.2767013610888711,
      "grad_norm": 5.017361640930176,
      "learning_rate": 4.5364398801258396e-05,
      "loss": 57.3268,
      "step": 374
    },
    {
      "epoch": 1.2835334934614357,
      "grad_norm": 5.562941074371338,
      "learning_rate": 4.5311165016389916e-05,
      "loss": 55.6271,
      "step": 376
    },
    {
      "epoch": 1.2903656258340006,
      "grad_norm": 6.675297737121582,
      "learning_rate": 4.525765892407409e-05,
      "loss": 55.9593,
      "step": 378
    },
    {
      "epoch": 1.2971977582065652,
      "grad_norm": 6.47582483291626,
      "learning_rate": 4.5203881241655644e-05,
      "loss": 57.0788,
      "step": 380
    },
    {
      "epoch": 1.3040298905791299,
      "grad_norm": 5.157675743103027,
      "learning_rate": 4.514983269012049e-05,
      "loss": 56.3623,
      "step": 382
    },
    {
      "epoch": 1.3108620229516947,
      "grad_norm": 8.075702667236328,
      "learning_rate": 4.509551399408598e-05,
      "loss": 55.6531,
      "step": 384
    },
    {
      "epoch": 1.3176941553242594,
      "grad_norm": 3.849310874938965,
      "learning_rate": 4.504092588179128e-05,
      "loss": 58.7546,
      "step": 386
    },
    {
      "epoch": 1.3245262876968242,
      "grad_norm": 3.6027579307556152,
      "learning_rate": 4.498606908508754e-05,
      "loss": 57.7153,
      "step": 388
    },
    {
      "epoch": 1.3313584200693889,
      "grad_norm": 5.139729976654053,
      "learning_rate": 4.4930944339428085e-05,
      "loss": 56.4532,
      "step": 390
    },
    {
      "epoch": 1.3381905524419535,
      "grad_norm": 5.337704181671143,
      "learning_rate": 4.487555238385862e-05,
      "loss": 54.2958,
      "step": 392
    },
    {
      "epoch": 1.3450226848145181,
      "grad_norm": 3.3229618072509766,
      "learning_rate": 4.481989396100724e-05,
      "loss": 54.2046,
      "step": 394
    },
    {
      "epoch": 1.351854817187083,
      "grad_norm": 5.2183074951171875,
      "learning_rate": 4.476396981707453e-05,
      "loss": 56.0147,
      "step": 396
    },
    {
      "epoch": 1.3586869495596476,
      "grad_norm": 5.028941631317139,
      "learning_rate": 4.470778070182353e-05,
      "loss": 54.3446,
      "step": 398
    },
    {
      "epoch": 1.3655190819322125,
      "grad_norm": 6.347212791442871,
      "learning_rate": 4.465132736856969e-05,
      "loss": 56.7659,
      "step": 400
    },
    {
      "epoch": 1.3655190819322125,
      "eval_loss": 0.8771227598190308,
      "eval_runtime": 118.9477,
      "eval_samples_per_second": 33.166,
      "eval_steps_per_second": 8.298,
      "step": 400
    },
    {
      "epoch": 1.3723512143047771,
      "grad_norm": 9.381309509277344,
      "learning_rate": 4.459461057417078e-05,
      "loss": 56.8099,
      "step": 402
    },
    {
      "epoch": 1.3791833466773418,
      "grad_norm": 5.657813549041748,
      "learning_rate": 4.453763107901675e-05,
      "loss": 56.3326,
      "step": 404
    },
    {
      "epoch": 1.3860154790499066,
      "grad_norm": 4.476396083831787,
      "learning_rate": 4.4480389647019505e-05,
      "loss": 57.3978,
      "step": 406
    },
    {
      "epoch": 1.3928476114224713,
      "grad_norm": 5.402798652648926,
      "learning_rate": 4.442288704560268e-05,
      "loss": 55.7143,
      "step": 408
    },
    {
      "epoch": 1.3996797437950361,
      "grad_norm": 4.367002010345459,
      "learning_rate": 4.436512404569136e-05,
      "loss": 55.7044,
      "step": 410
    },
    {
      "epoch": 1.4065118761676008,
      "grad_norm": 5.653073310852051,
      "learning_rate": 4.430710142170176e-05,
      "loss": 55.7266,
      "step": 412
    },
    {
      "epoch": 1.4133440085401654,
      "grad_norm": 7.221829414367676,
      "learning_rate": 4.424881995153076e-05,
      "loss": 56.4174,
      "step": 414
    },
    {
      "epoch": 1.4201761409127303,
      "grad_norm": 5.465057373046875,
      "learning_rate": 4.419028041654559e-05,
      "loss": 56.9093,
      "step": 416
    },
    {
      "epoch": 1.427008273285295,
      "grad_norm": 8.383552551269531,
      "learning_rate": 4.4131483601573285e-05,
      "loss": 56.0841,
      "step": 418
    },
    {
      "epoch": 1.4338404056578598,
      "grad_norm": 4.208652973175049,
      "learning_rate": 4.4072430294890174e-05,
      "loss": 57.5786,
      "step": 420
    },
    {
      "epoch": 1.4406725380304244,
      "grad_norm": 5.773376941680908,
      "learning_rate": 4.4013121288211307e-05,
      "loss": 55.8851,
      "step": 422
    },
    {
      "epoch": 1.447504670402989,
      "grad_norm": 5.354812145233154,
      "learning_rate": 4.3953557376679856e-05,
      "loss": 55.1571,
      "step": 424
    },
    {
      "epoch": 1.4543368027755537,
      "grad_norm": 4.6360039710998535,
      "learning_rate": 4.389373935885646e-05,
      "loss": 54.0095,
      "step": 426
    },
    {
      "epoch": 1.4611689351481185,
      "grad_norm": 7.125521183013916,
      "learning_rate": 4.383366803670849e-05,
      "loss": 56.645,
      "step": 428
    },
    {
      "epoch": 1.4680010675206832,
      "grad_norm": 6.071737766265869,
      "learning_rate": 4.377334421559932e-05,
      "loss": 55.3209,
      "step": 430
    },
    {
      "epoch": 1.474833199893248,
      "grad_norm": 4.569766998291016,
      "learning_rate": 4.371276870427753e-05,
      "loss": 54.6604,
      "step": 432
    },
    {
      "epoch": 1.4816653322658127,
      "grad_norm": 5.426764965057373,
      "learning_rate": 4.365194231486604e-05,
      "loss": 56.4116,
      "step": 434
    },
    {
      "epoch": 1.4884974646383773,
      "grad_norm": 5.6092023849487305,
      "learning_rate": 4.359086586285127e-05,
      "loss": 56.0268,
      "step": 436
    },
    {
      "epoch": 1.4953295970109421,
      "grad_norm": 6.140939712524414,
      "learning_rate": 4.3529540167072126e-05,
      "loss": 54.886,
      "step": 438
    },
    {
      "epoch": 1.5021617293835068,
      "grad_norm": 4.043739318847656,
      "learning_rate": 4.346796604970912e-05,
      "loss": 56.6431,
      "step": 440
    },
    {
      "epoch": 1.5089938617560716,
      "grad_norm": 3.8898212909698486,
      "learning_rate": 4.340614433627328e-05,
      "loss": 55.6492,
      "step": 442
    },
    {
      "epoch": 1.5158259941286363,
      "grad_norm": 6.158950328826904,
      "learning_rate": 4.3344075855595104e-05,
      "loss": 55.6869,
      "step": 444
    },
    {
      "epoch": 1.522658126501201,
      "grad_norm": 3.874180316925049,
      "learning_rate": 4.328176143981343e-05,
      "loss": 53.7981,
      "step": 446
    },
    {
      "epoch": 1.5294902588737656,
      "grad_norm": 4.068581581115723,
      "learning_rate": 4.321920192436433e-05,
      "loss": 54.6618,
      "step": 448
    },
    {
      "epoch": 1.5363223912463304,
      "grad_norm": 4.552149295806885,
      "learning_rate": 4.315639814796983e-05,
      "loss": 55.1642,
      "step": 450
    },
    {
      "epoch": 1.5363223912463304,
      "eval_loss": 0.8704175353050232,
      "eval_runtime": 119.5049,
      "eval_samples_per_second": 33.011,
      "eval_steps_per_second": 8.259,
      "step": 450
    },
    {
      "epoch": 1.5431545236188953,
      "grad_norm": 4.1831374168396,
      "learning_rate": 4.309335095262676e-05,
      "loss": 53.2926,
      "step": 452
    },
    {
      "epoch": 1.54998665599146,
      "grad_norm": 4.456052780151367,
      "learning_rate": 4.303006118359537e-05,
      "loss": 53.6038,
      "step": 454
    },
    {
      "epoch": 1.5568187883640245,
      "grad_norm": 17.7099609375,
      "learning_rate": 4.296652968938807e-05,
      "loss": 54.9325,
      "step": 456
    },
    {
      "epoch": 1.5636509207365892,
      "grad_norm": 8.005233764648438,
      "learning_rate": 4.2902757321758016e-05,
      "loss": 53.7884,
      "step": 458
    },
    {
      "epoch": 1.570483053109154,
      "grad_norm": 5.034004211425781,
      "learning_rate": 4.283874493568772e-05,
      "loss": 53.2575,
      "step": 460
    },
    {
      "epoch": 1.5773151854817187,
      "grad_norm": 4.005930423736572,
      "learning_rate": 4.2774493389377545e-05,
      "loss": 55.4554,
      "step": 462
    },
    {
      "epoch": 1.5841473178542835,
      "grad_norm": 5.812296390533447,
      "learning_rate": 4.271000354423426e-05,
      "loss": 56.7008,
      "step": 464
    },
    {
      "epoch": 1.5909794502268482,
      "grad_norm": 6.425695896148682,
      "learning_rate": 4.2645276264859394e-05,
      "loss": 56.8804,
      "step": 466
    },
    {
      "epoch": 1.5978115825994128,
      "grad_norm": 4.44102144241333,
      "learning_rate": 4.258031241903778e-05,
      "loss": 54.2011,
      "step": 468
    },
    {
      "epoch": 1.6046437149719774,
      "grad_norm": 4.444553852081299,
      "learning_rate": 4.251511287772579e-05,
      "loss": 54.9826,
      "step": 470
    },
    {
      "epoch": 1.6114758473445423,
      "grad_norm": 3.8157808780670166,
      "learning_rate": 4.2449678515039747e-05,
      "loss": 55.2601,
      "step": 472
    },
    {
      "epoch": 1.6183079797171072,
      "grad_norm": 6.47904634475708,
      "learning_rate": 4.238401020824416e-05,
      "loss": 54.5978,
      "step": 474
    },
    {
      "epoch": 1.6251401120896718,
      "grad_norm": 5.010526180267334,
      "learning_rate": 4.231810883773999e-05,
      "loss": 56.0995,
      "step": 476
    },
    {
      "epoch": 1.6319722444622364,
      "grad_norm": 5.843505382537842,
      "learning_rate": 4.2251975287052804e-05,
      "loss": 54.0241,
      "step": 478
    },
    {
      "epoch": 1.638804376834801,
      "grad_norm": 4.549996852874756,
      "learning_rate": 4.218561044282099e-05,
      "loss": 56.3071,
      "step": 480
    },
    {
      "epoch": 1.645636509207366,
      "grad_norm": 4.20985221862793,
      "learning_rate": 4.211901519478382e-05,
      "loss": 54.3977,
      "step": 482
    },
    {
      "epoch": 1.6524686415799306,
      "grad_norm": 5.491010665893555,
      "learning_rate": 4.2052190435769554e-05,
      "loss": 53.1375,
      "step": 484
    },
    {
      "epoch": 1.6593007739524954,
      "grad_norm": 4.417302131652832,
      "learning_rate": 4.198513706168345e-05,
      "loss": 53.959,
      "step": 486
    },
    {
      "epoch": 1.66613290632506,
      "grad_norm": 5.39029598236084,
      "learning_rate": 4.191785597149577e-05,
      "loss": 54.5638,
      "step": 488
    },
    {
      "epoch": 1.6729650386976247,
      "grad_norm": 4.233526229858398,
      "learning_rate": 4.1850348067229696e-05,
      "loss": 54.6384,
      "step": 490
    },
    {
      "epoch": 1.6797971710701893,
      "grad_norm": 6.301634311676025,
      "learning_rate": 4.178261425394926e-05,
      "loss": 55.1738,
      "step": 492
    },
    {
      "epoch": 1.6866293034427542,
      "grad_norm": 5.9507246017456055,
      "learning_rate": 4.171465543974723e-05,
      "loss": 54.7009,
      "step": 494
    },
    {
      "epoch": 1.693461435815319,
      "grad_norm": 5.033243656158447,
      "learning_rate": 4.1646472535732895e-05,
      "loss": 54.3154,
      "step": 496
    },
    {
      "epoch": 1.7002935681878837,
      "grad_norm": 4.675721168518066,
      "learning_rate": 4.157806645601988e-05,
      "loss": 54.1507,
      "step": 498
    },
    {
      "epoch": 1.7071257005604483,
      "grad_norm": 3.5945537090301514,
      "learning_rate": 4.1509438117713866e-05,
      "loss": 52.2103,
      "step": 500
    },
    {
      "epoch": 1.7071257005604483,
      "eval_loss": 0.8516557216644287,
      "eval_runtime": 119.4754,
      "eval_samples_per_second": 33.019,
      "eval_steps_per_second": 8.261,
      "step": 500
    },
    {
      "epoch": 1.713957832933013,
      "grad_norm": 4.187085151672363,
      "learning_rate": 4.144058844090032e-05,
      "loss": 54.1474,
      "step": 502
    },
    {
      "epoch": 1.7207899653055778,
      "grad_norm": 3.818648099899292,
      "learning_rate": 4.137151834863213e-05,
      "loss": 55.5711,
      "step": 504
    },
    {
      "epoch": 1.7276220976781427,
      "grad_norm": 5.919620513916016,
      "learning_rate": 4.130222876691726e-05,
      "loss": 54.3803,
      "step": 506
    },
    {
      "epoch": 1.7344542300507073,
      "grad_norm": 5.772305011749268,
      "learning_rate": 4.123272062470633e-05,
      "loss": 53.9454,
      "step": 508
    },
    {
      "epoch": 1.741286362423272,
      "grad_norm": 4.569563865661621,
      "learning_rate": 4.116299485388014e-05,
      "loss": 53.5009,
      "step": 510
    },
    {
      "epoch": 1.7481184947958366,
      "grad_norm": 4.183293342590332,
      "learning_rate": 4.109305238923718e-05,
      "loss": 52.9927,
      "step": 512
    },
    {
      "epoch": 1.7549506271684012,
      "grad_norm": 4.4316301345825195,
      "learning_rate": 4.102289416848114e-05,
      "loss": 54.5023,
      "step": 514
    },
    {
      "epoch": 1.761782759540966,
      "grad_norm": 14.234251976013184,
      "learning_rate": 4.095252113220827e-05,
      "loss": 53.1473,
      "step": 516
    },
    {
      "epoch": 1.768614891913531,
      "grad_norm": 4.889795780181885,
      "learning_rate": 4.088193422389484e-05,
      "loss": 53.7265,
      "step": 518
    },
    {
      "epoch": 1.7754470242860956,
      "grad_norm": 3.02785325050354,
      "learning_rate": 4.0811134389884433e-05,
      "loss": 52.5917,
      "step": 520
    },
    {
      "epoch": 1.7822791566586602,
      "grad_norm": 5.794788360595703,
      "learning_rate": 4.0740122579375286e-05,
      "loss": 55.4619,
      "step": 522
    },
    {
      "epoch": 1.7891112890312248,
      "grad_norm": 4.442338466644287,
      "learning_rate": 4.066889974440757e-05,
      "loss": 53.7709,
      "step": 524
    },
    {
      "epoch": 1.7959434214037897,
      "grad_norm": 4.7714715003967285,
      "learning_rate": 4.0597466839850595e-05,
      "loss": 54.16,
      "step": 526
    },
    {
      "epoch": 1.8027755537763546,
      "grad_norm": 4.7263569831848145,
      "learning_rate": 4.0525824823390045e-05,
      "loss": 55.9749,
      "step": 528
    },
    {
      "epoch": 1.8096076861489192,
      "grad_norm": 4.258271217346191,
      "learning_rate": 4.045397465551513e-05,
      "loss": 52.5445,
      "step": 530
    },
    {
      "epoch": 1.8164398185214838,
      "grad_norm": 4.56829309463501,
      "learning_rate": 4.038191729950569e-05,
      "loss": 53.8703,
      "step": 532
    },
    {
      "epoch": 1.8232719508940485,
      "grad_norm": 8.888167381286621,
      "learning_rate": 4.030965372141927e-05,
      "loss": 52.7209,
      "step": 534
    },
    {
      "epoch": 1.8301040832666133,
      "grad_norm": 4.5087175369262695,
      "learning_rate": 4.0237184890078245e-05,
      "loss": 54.591,
      "step": 536
    },
    {
      "epoch": 1.836936215639178,
      "grad_norm": 4.460638523101807,
      "learning_rate": 4.0164511777056725e-05,
      "loss": 54.8662,
      "step": 538
    },
    {
      "epoch": 1.8437683480117428,
      "grad_norm": 3.5958664417266846,
      "learning_rate": 4.009163535666761e-05,
      "loss": 53.423,
      "step": 540
    },
    {
      "epoch": 1.8506004803843075,
      "grad_norm": 4.3935418128967285,
      "learning_rate": 4.001855660594948e-05,
      "loss": 53.9048,
      "step": 542
    },
    {
      "epoch": 1.857432612756872,
      "grad_norm": 5.473939895629883,
      "learning_rate": 3.994527650465352e-05,
      "loss": 52.9295,
      "step": 544
    },
    {
      "epoch": 1.8642647451294367,
      "grad_norm": 4.8625922203063965,
      "learning_rate": 3.98717960352304e-05,
      "loss": 51.8002,
      "step": 546
    },
    {
      "epoch": 1.8710968775020016,
      "grad_norm": 4.244052886962891,
      "learning_rate": 3.979811618281706e-05,
      "loss": 53.6904,
      "step": 548
    },
    {
      "epoch": 1.8779290098745665,
      "grad_norm": 4.050732612609863,
      "learning_rate": 3.972423793522352e-05,
      "loss": 54.7441,
      "step": 550
    },
    {
      "epoch": 1.8779290098745665,
      "eval_loss": 0.8419561982154846,
      "eval_runtime": 119.6757,
      "eval_samples_per_second": 32.964,
      "eval_steps_per_second": 8.247,
      "step": 550
    },
    {
      "epoch": 1.884761142247131,
      "grad_norm": 5.255309104919434,
      "learning_rate": 3.9650162282919655e-05,
      "loss": 53.6842,
      "step": 552
    },
    {
      "epoch": 1.8915932746196957,
      "grad_norm": 5.483623504638672,
      "learning_rate": 3.957589021902191e-05,
      "loss": 54.0004,
      "step": 554
    },
    {
      "epoch": 1.8984254069922604,
      "grad_norm": 4.224212169647217,
      "learning_rate": 3.9501422739279956e-05,
      "loss": 51.7289,
      "step": 556
    },
    {
      "epoch": 1.9052575393648252,
      "grad_norm": 5.061962127685547,
      "learning_rate": 3.942676084206338e-05,
      "loss": 53.4457,
      "step": 558
    },
    {
      "epoch": 1.9120896717373899,
      "grad_norm": 3.8694398403167725,
      "learning_rate": 3.9351905528348285e-05,
      "loss": 51.8595,
      "step": 560
    },
    {
      "epoch": 1.9189218041099547,
      "grad_norm": 4.149620056152344,
      "learning_rate": 3.927685780170385e-05,
      "loss": 51.8196,
      "step": 562
    },
    {
      "epoch": 1.9257539364825194,
      "grad_norm": 6.877647399902344,
      "learning_rate": 3.920161866827889e-05,
      "loss": 52.7279,
      "step": 564
    },
    {
      "epoch": 1.932586068855084,
      "grad_norm": 4.069815635681152,
      "learning_rate": 3.9126189136788416e-05,
      "loss": 51.1502,
      "step": 566
    },
    {
      "epoch": 1.9394182012276486,
      "grad_norm": 6.629972457885742,
      "learning_rate": 3.90505702185e-05,
      "loss": 52.6793,
      "step": 568
    },
    {
      "epoch": 1.9462503336002135,
      "grad_norm": 4.475677013397217,
      "learning_rate": 3.897476292722034e-05,
      "loss": 51.4329,
      "step": 570
    },
    {
      "epoch": 1.9530824659727783,
      "grad_norm": 5.370522499084473,
      "learning_rate": 3.889876827928156e-05,
      "loss": 53.1101,
      "step": 572
    },
    {
      "epoch": 1.959914598345343,
      "grad_norm": 5.481414794921875,
      "learning_rate": 3.882258729352768e-05,
      "loss": 53.3684,
      "step": 574
    },
    {
      "epoch": 1.9667467307179076,
      "grad_norm": 6.393594741821289,
      "learning_rate": 3.874622099130087e-05,
      "loss": 52.7341,
      "step": 576
    },
    {
      "epoch": 1.9735788630904723,
      "grad_norm": 3.9178807735443115,
      "learning_rate": 3.866967039642784e-05,
      "loss": 51.5249,
      "step": 578
    },
    {
      "epoch": 1.9804109954630371,
      "grad_norm": 9.721770286560059,
      "learning_rate": 3.859293653520604e-05,
      "loss": 51.2705,
      "step": 580
    },
    {
      "epoch": 1.987243127835602,
      "grad_norm": 4.619483470916748,
      "learning_rate": 3.851602043638994e-05,
      "loss": 51.7596,
      "step": 582
    },
    {
      "epoch": 1.9940752602081666,
      "grad_norm": 4.899592399597168,
      "learning_rate": 3.843892313117724e-05,
      "loss": 54.7586,
      "step": 584
    },
    {
      "epoch": 2.0,
      "grad_norm": 3.8423385620117188,
      "learning_rate": 3.8361645653195026e-05,
      "loss": 44.9497,
      "step": 586
    },
    {
      "epoch": 2.0068321323725646,
      "grad_norm": 4.93556022644043,
      "learning_rate": 3.8284189038485936e-05,
      "loss": 53.1383,
      "step": 588
    },
    {
      "epoch": 2.0136642647451293,
      "grad_norm": 6.575899124145508,
      "learning_rate": 3.8206554325494225e-05,
      "loss": 52.1373,
      "step": 590
    },
    {
      "epoch": 2.0204963971176944,
      "grad_norm": 3.5134201049804688,
      "learning_rate": 3.812874255505191e-05,
      "loss": 50.8711,
      "step": 592
    },
    {
      "epoch": 2.027328529490259,
      "grad_norm": 4.761475086212158,
      "learning_rate": 3.805075477036476e-05,
      "loss": 52.0756,
      "step": 594
    },
    {
      "epoch": 2.0341606618628236,
      "grad_norm": 3.7381017208099365,
      "learning_rate": 3.797259201699833e-05,
      "loss": 51.0594,
      "step": 596
    },
    {
      "epoch": 2.0409927942353883,
      "grad_norm": 5.102145671844482,
      "learning_rate": 3.789425534286394e-05,
      "loss": 52.1454,
      "step": 598
    },
    {
      "epoch": 2.047824926607953,
      "grad_norm": 4.762547969818115,
      "learning_rate": 3.781574579820464e-05,
      "loss": 50.3373,
      "step": 600
    },
    {
      "epoch": 2.047824926607953,
      "eval_loss": 0.8283991813659668,
      "eval_runtime": 119.5704,
      "eval_samples_per_second": 32.993,
      "eval_steps_per_second": 8.255,
      "step": 600
    },
    {
      "epoch": 2.0546570589805175,
      "grad_norm": 4.646745681762695,
      "learning_rate": 3.773706443558111e-05,
      "loss": 51.0792,
      "step": 602
    },
    {
      "epoch": 2.0614891913530826,
      "grad_norm": 5.648324012756348,
      "learning_rate": 3.765821230985758e-05,
      "loss": 50.6017,
      "step": 604
    },
    {
      "epoch": 2.0683213237256473,
      "grad_norm": 4.703359603881836,
      "learning_rate": 3.75791904781876e-05,
      "loss": 52.4212,
      "step": 606
    },
    {
      "epoch": 2.075153456098212,
      "grad_norm": 4.082385540008545,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 51.9666,
      "step": 608
    },
    {
      "epoch": 2.0819855884707765,
      "grad_norm": 4.6461687088012695,
      "learning_rate": 3.74206419369846e-05,
      "loss": 51.6205,
      "step": 610
    },
    {
      "epoch": 2.088817720843341,
      "grad_norm": 3.9972918033599854,
      "learning_rate": 3.7341117353077966e-05,
      "loss": 52.6521,
      "step": 612
    },
    {
      "epoch": 2.0956498532159062,
      "grad_norm": 5.636791229248047,
      "learning_rate": 3.726142731444921e-05,
      "loss": 52.6811,
      "step": 614
    },
    {
      "epoch": 2.102481985588471,
      "grad_norm": 6.055325508117676,
      "learning_rate": 3.718157288948563e-05,
      "loss": 51.2952,
      "step": 616
    },
    {
      "epoch": 2.1093141179610355,
      "grad_norm": 5.317610740661621,
      "learning_rate": 3.710155514877844e-05,
      "loss": 52.4443,
      "step": 618
    },
    {
      "epoch": 2.1161462503336,
      "grad_norm": 4.979522705078125,
      "learning_rate": 3.702137516510838e-05,
      "loss": 51.3593,
      "step": 620
    },
    {
      "epoch": 2.122978382706165,
      "grad_norm": 7.410902500152588,
      "learning_rate": 3.694103401343136e-05,
      "loss": 51.5919,
      "step": 622
    },
    {
      "epoch": 2.12981051507873,
      "grad_norm": 4.962103366851807,
      "learning_rate": 3.686053277086401e-05,
      "loss": 51.272,
      "step": 624
    },
    {
      "epoch": 2.1366426474512945,
      "grad_norm": 4.0044426918029785,
      "learning_rate": 3.6779872516669295e-05,
      "loss": 51.6362,
      "step": 626
    },
    {
      "epoch": 2.143474779823859,
      "grad_norm": 5.016703128814697,
      "learning_rate": 3.669905433224199e-05,
      "loss": 51.7369,
      "step": 628
    },
    {
      "epoch": 2.150306912196424,
      "grad_norm": 4.700343132019043,
      "learning_rate": 3.6618079301094216e-05,
      "loss": 50.9454,
      "step": 630
    },
    {
      "epoch": 2.1571390445689884,
      "grad_norm": 8.11246395111084,
      "learning_rate": 3.653694850884091e-05,
      "loss": 50.4605,
      "step": 632
    },
    {
      "epoch": 2.163971176941553,
      "grad_norm": 3.8724536895751953,
      "learning_rate": 3.645566304318526e-05,
      "loss": 52.4849,
      "step": 634
    },
    {
      "epoch": 2.170803309314118,
      "grad_norm": 3.699873208999634,
      "learning_rate": 3.637422399390413e-05,
      "loss": 49.8017,
      "step": 636
    },
    {
      "epoch": 2.1776354416866828,
      "grad_norm": 4.757104873657227,
      "learning_rate": 3.6292632452833436e-05,
      "loss": 52.0966,
      "step": 638
    },
    {
      "epoch": 2.1844675740592474,
      "grad_norm": 5.273576736450195,
      "learning_rate": 3.621088951385353e-05,
      "loss": 49.5201,
      "step": 640
    },
    {
      "epoch": 2.191299706431812,
      "grad_norm": 4.152122497558594,
      "learning_rate": 3.612899627287452e-05,
      "loss": 51.121,
      "step": 642
    },
    {
      "epoch": 2.1981318388043767,
      "grad_norm": 4.448339939117432,
      "learning_rate": 3.604695382782159e-05,
      "loss": 51.5833,
      "step": 644
    },
    {
      "epoch": 2.2049639711769418,
      "grad_norm": 3.272676706314087,
      "learning_rate": 3.596476327862024e-05,
      "loss": 50.4036,
      "step": 646
    },
    {
      "epoch": 2.2117961035495064,
      "grad_norm": 4.293691158294678,
      "learning_rate": 3.588242572718162e-05,
      "loss": 50.4138,
      "step": 648
    },
    {
      "epoch": 2.218628235922071,
      "grad_norm": 6.384798049926758,
      "learning_rate": 3.579994227738767e-05,
| "loss": 49.0042, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 2.218628235922071, | |
| "eval_loss": 0.8110712170600891, | |
| "eval_runtime": 119.0744, | |
| "eval_samples_per_second": 33.131, | |
| "eval_steps_per_second": 8.289, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 2.2254603682946357, | |
| "grad_norm": 4.501573085784912, | |
| "learning_rate": 3.5717314035076355e-05, | |
| "loss": 49.7713, | |
| "step": 652 | |
| }, | |
| { | |
| "epoch": 2.2322925006672003, | |
| "grad_norm": 4.808114051818848, | |
| "learning_rate": 3.5634542108026876e-05, | |
| "loss": 50.6265, | |
| "step": 654 | |
| }, | |
| { | |
| "epoch": 2.239124633039765, | |
| "grad_norm": 5.616351127624512, | |
| "learning_rate": 3.5551627605944745e-05, | |
| "loss": 52.1332, | |
| "step": 656 | |
| }, | |
| { | |
| "epoch": 2.24595676541233, | |
| "grad_norm": 7.0716071128845215, | |
| "learning_rate": 3.5468571640446994e-05, | |
| "loss": 50.7825, | |
| "step": 658 | |
| }, | |
| { | |
| "epoch": 2.2527888977848947, | |
| "grad_norm": 4.64641809463501, | |
| "learning_rate": 3.5385375325047166e-05, | |
| "loss": 50.3092, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 2.2596210301574593, | |
| "grad_norm": 4.058784008026123, | |
| "learning_rate": 3.5302039775140486e-05, | |
| "loss": 51.7827, | |
| "step": 662 | |
| }, | |
| { | |
| "epoch": 2.266453162530024, | |
| "grad_norm": 4.011864185333252, | |
| "learning_rate": 3.521856610798887e-05, | |
| "loss": 51.4194, | |
| "step": 664 | |
| }, | |
| { | |
| "epoch": 2.2732852949025886, | |
| "grad_norm": 3.89857816696167, | |
| "learning_rate": 3.513495544270592e-05, | |
| "loss": 50.7032, | |
| "step": 666 | |
| }, | |
| { | |
| "epoch": 2.2801174272751537, | |
| "grad_norm": 4.966712951660156, | |
| "learning_rate": 3.505120890024195e-05, | |
| "loss": 49.925, | |
| "step": 668 | |
| }, | |
| { | |
| "epoch": 2.2869495596477183, | |
| "grad_norm": 4.181141376495361, | |
| "learning_rate": 3.496732760336895e-05, | |
| "loss": 49.5112, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 2.293781692020283, | |
| "grad_norm": 4.761594772338867, | |
| "learning_rate": 3.4883312676665536e-05, | |
| "loss": 49.6545, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 2.3006138243928476, | |
| "grad_norm": 3.97501802444458, | |
| "learning_rate": 3.479916524650188e-05, | |
| "loss": 51.1862, | |
| "step": 674 | |
| }, | |
| { | |
| "epoch": 2.307445956765412, | |
| "grad_norm": 5.200672149658203, | |
| "learning_rate": 3.4714886441024574e-05, | |
| "loss": 49.9163, | |
| "step": 676 | |
| }, | |
| { | |
| "epoch": 2.314278089137977, | |
| "grad_norm": 4.147047519683838, | |
| "learning_rate": 3.4630477390141556e-05, | |
| "loss": 48.6138, | |
| "step": 678 | |
| }, | |
| { | |
| "epoch": 2.321110221510542, | |
| "grad_norm": 4.9791693687438965, | |
| "learning_rate": 3.4545939225506934e-05, | |
| "loss": 51.4538, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 2.3279423538831066, | |
| "grad_norm": 4.929348945617676, | |
| "learning_rate": 3.4461273080505793e-05, | |
| "loss": 51.2735, | |
| "step": 682 | |
| }, | |
| { | |
| "epoch": 2.334774486255671, | |
| "grad_norm": 4.98499059677124, | |
| "learning_rate": 3.437648009023905e-05, | |
| "loss": 48.5889, | |
| "step": 684 | |
| }, | |
| { | |
| "epoch": 2.341606618628236, | |
| "grad_norm": 4.354183673858643, | |
| "learning_rate": 3.4291561391508185e-05, | |
| "loss": 51.7768, | |
| "step": 686 | |
| }, | |
| { | |
| "epoch": 2.3484387510008005, | |
| "grad_norm": 3.482697010040283, | |
| "learning_rate": 3.420651812280006e-05, | |
| "loss": 48.9966, | |
| "step": 688 | |
| }, | |
| { | |
| "epoch": 2.3552708833733655, | |
| "grad_norm": 4.613458156585693, | |
| "learning_rate": 3.4121351424271594e-05, | |
| "loss": 50.8534, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 2.36210301574593, | |
| "grad_norm": 3.93235182762146, | |
| "learning_rate": 3.4036062437734484e-05, | |
| "loss": 50.9164, | |
| "step": 692 | |
| }, | |
| { | |
| "epoch": 2.368935148118495, | |
| "grad_norm": 5.348623275756836, | |
| "learning_rate": 3.395065230663996e-05, | |
| "loss": 49.6679, | |
| "step": 694 | |
| }, | |
| { | |
| "epoch": 2.3757672804910595, | |
| "grad_norm": 5.050134181976318, | |
| "learning_rate": 3.386512217606339e-05, | |
| "loss": 48.0534, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 2.382599412863624, | |
| "grad_norm": 3.7587573528289795, | |
| "learning_rate": 3.3779473192688954e-05, | |
| "loss": 50.3013, | |
| "step": 698 | |
| }, | |
| { | |
| "epoch": 2.3894315452361887, | |
| "grad_norm": 5.177303314208984, | |
| "learning_rate": 3.369370650479425e-05, | |
| "loss": 48.8704, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 2.3894315452361887, | |
| "eval_loss": 0.7940448522567749, | |
| "eval_runtime": 119.8708, | |
| "eval_samples_per_second": 32.91, | |
| "eval_steps_per_second": 8.234, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 2.396263677608754, | |
| "grad_norm": 4.268886089324951, | |
| "learning_rate": 3.360782326223493e-05, | |
| "loss": 50.0788, | |
| "step": 702 | |
| }, | |
| { | |
| "epoch": 2.4030958099813184, | |
| "grad_norm": 4.847851276397705, | |
| "learning_rate": 3.3521824616429285e-05, | |
| "loss": 50.5298, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 2.409927942353883, | |
| "grad_norm": 4.221863746643066, | |
| "learning_rate": 3.3435711720342764e-05, | |
| "loss": 51.0571, | |
| "step": 706 | |
| }, | |
| { | |
| "epoch": 2.4167600747264477, | |
| "grad_norm": 5.5122528076171875, | |
| "learning_rate": 3.3349485728472535e-05, | |
| "loss": 48.3266, | |
| "step": 708 | |
| }, | |
| { | |
| "epoch": 2.4235922070990124, | |
| "grad_norm": 3.7766902446746826, | |
| "learning_rate": 3.326314779683207e-05, | |
| "loss": 49.9334, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 2.4304243394715774, | |
| "grad_norm": 4.093820571899414, | |
| "learning_rate": 3.3176699082935545e-05, | |
| "loss": 48.4746, | |
| "step": 712 | |
| }, | |
| { | |
| "epoch": 2.437256471844142, | |
| "grad_norm": 4.116121292114258, | |
| "learning_rate": 3.3090140745782396e-05, | |
| "loss": 48.5131, | |
| "step": 714 | |
| }, | |
| { | |
| "epoch": 2.4440886042167067, | |
| "grad_norm": 5.181516647338867, | |
| "learning_rate": 3.300347394584172e-05, | |
| "loss": 50.4981, | |
| "step": 716 | |
| }, | |
| { | |
| "epoch": 2.4509207365892713, | |
| "grad_norm": 4.464053630828857, | |
| "learning_rate": 3.2916699845036816e-05, | |
| "loss": 50.2301, | |
| "step": 718 | |
| }, | |
| { | |
| "epoch": 2.457752868961836, | |
| "grad_norm": 4.229206562042236, | |
| "learning_rate": 3.282981960672948e-05, | |
| "loss": 50.1858, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 2.4645850013344006, | |
| "grad_norm": 3.8356049060821533, | |
| "learning_rate": 3.2742834395704486e-05, | |
| "loss": 48.9147, | |
| "step": 722 | |
| }, | |
| { | |
| "epoch": 2.4714171337069657, | |
| "grad_norm": 3.9584670066833496, | |
| "learning_rate": 3.265574537815398e-05, | |
| "loss": 48.6574, | |
| "step": 724 | |
| }, | |
| { | |
| "epoch": 2.4782492660795303, | |
| "grad_norm": 4.802350997924805, | |
| "learning_rate": 3.25685537216618e-05, | |
| "loss": 48.9724, | |
| "step": 726 | |
| }, | |
| { | |
| "epoch": 2.485081398452095, | |
| "grad_norm": 4.078526020050049, | |
| "learning_rate": 3.248126059518785e-05, | |
| "loss": 47.7639, | |
| "step": 728 | |
| }, | |
| { | |
| "epoch": 2.4919135308246596, | |
| "grad_norm": 3.8187856674194336, | |
| "learning_rate": 3.2393867169052385e-05, | |
| "loss": 48.2195, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 2.4987456631972247, | |
| "grad_norm": 5.273796081542969, | |
| "learning_rate": 3.230637461492043e-05, | |
| "loss": 49.7512, | |
| "step": 732 | |
| }, | |
| { | |
| "epoch": 2.5055777955697893, | |
| "grad_norm": 4.126491069793701, | |
| "learning_rate": 3.221878410578593e-05, | |
| "loss": 49.0844, | |
| "step": 734 | |
| }, | |
| { | |
| "epoch": 2.512409927942354, | |
| "grad_norm": 4.665433406829834, | |
| "learning_rate": 3.213109681595612e-05, | |
| "loss": 48.7829, | |
| "step": 736 | |
| }, | |
| { | |
| "epoch": 2.5192420603149186, | |
| "grad_norm": 4.897470951080322, | |
| "learning_rate": 3.2043313921035743e-05, | |
| "loss": 49.5252, | |
| "step": 738 | |
| }, | |
| { | |
| "epoch": 2.5260741926874832, | |
| "grad_norm": 5.257498264312744, | |
| "learning_rate": 3.195543659791132e-05, | |
| "loss": 50.4767, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 2.532906325060048, | |
| "grad_norm": 3.754957914352417, | |
| "learning_rate": 3.186746602473533e-05, | |
| "loss": 49.4055, | |
| "step": 742 | |
| }, | |
| { | |
| "epoch": 2.5397384574326125, | |
| "grad_norm": 3.994774341583252, | |
| "learning_rate": 3.177940338091043e-05, | |
| "loss": 49.3039, | |
| "step": 744 | |
| }, | |
| { | |
| "epoch": 2.5465705898051776, | |
| "grad_norm": 4.923650741577148, | |
| "learning_rate": 3.169124984707367e-05, | |
| "loss": 48.6568, | |
| "step": 746 | |
| }, | |
| { | |
| "epoch": 2.5534027221777422, | |
| "grad_norm": 6.377063274383545, | |
| "learning_rate": 3.160300660508064e-05, | |
| "loss": 48.7655, | |
| "step": 748 | |
| }, | |
| { | |
| "epoch": 2.560234854550307, | |
| "grad_norm": 3.7124524116516113, | |
| "learning_rate": 3.151467483798961e-05, | |
| "loss": 48.0997, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.560234854550307, | |
| "eval_loss": 0.7798339128494263, | |
| "eval_runtime": 119.2173, | |
| "eval_samples_per_second": 33.091, | |
| "eval_steps_per_second": 8.279, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.5670669869228715, | |
| "grad_norm": 4.752464294433594, | |
| "learning_rate": 3.14262557300457e-05, | |
| "loss": 48.422, | |
| "step": 752 | |
| }, | |
| { | |
| "epoch": 2.5738991192954366, | |
| "grad_norm": 4.635769844055176, | |
| "learning_rate": 3.1337750466665e-05, | |
| "loss": 48.9177, | |
| "step": 754 | |
| }, | |
| { | |
| "epoch": 2.580731251668001, | |
| "grad_norm": 4.357526779174805, | |
| "learning_rate": 3.124916023441865e-05, | |
| "loss": 49.4801, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 2.587563384040566, | |
| "grad_norm": 16.189651489257812, | |
| "learning_rate": 3.116048622101694e-05, | |
| "loss": 49.275, | |
| "step": 758 | |
| }, | |
| { | |
| "epoch": 2.5943955164131305, | |
| "grad_norm": 3.983285903930664, | |
| "learning_rate": 3.107172961529343e-05, | |
| "loss": 47.968, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 2.601227648785695, | |
| "grad_norm": 4.357701301574707, | |
| "learning_rate": 3.098289160718895e-05, | |
| "loss": 47.8592, | |
| "step": 762 | |
| }, | |
| { | |
| "epoch": 2.6080597811582598, | |
| "grad_norm": 3.9686052799224854, | |
| "learning_rate": 3.0893973387735687e-05, | |
| "loss": 49.5191, | |
| "step": 764 | |
| }, | |
| { | |
| "epoch": 2.6148919135308244, | |
| "grad_norm": 3.9062581062316895, | |
| "learning_rate": 3.0804976149041195e-05, | |
| "loss": 48.5485, | |
| "step": 766 | |
| }, | |
| { | |
| "epoch": 2.6217240459033895, | |
| "grad_norm": 4.7290143966674805, | |
| "learning_rate": 3.071590108427244e-05, | |
| "loss": 49.2073, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 2.628556178275954, | |
| "grad_norm": 4.57703161239624, | |
| "learning_rate": 3.062674938763976e-05, | |
| "loss": 49.7624, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 2.6353883106485188, | |
| "grad_norm": 4.4061737060546875, | |
| "learning_rate": 3.0537522254380905e-05, | |
| "loss": 49.0566, | |
| "step": 772 | |
| }, | |
| { | |
| "epoch": 2.6422204430210834, | |
| "grad_norm": 4.166697978973389, | |
| "learning_rate": 3.044822088074496e-05, | |
| "loss": 49.3193, | |
| "step": 774 | |
| }, | |
| { | |
| "epoch": 2.6490525753936485, | |
| "grad_norm": 3.5513172149658203, | |
| "learning_rate": 3.0358846463976372e-05, | |
| "loss": 48.9675, | |
| "step": 776 | |
| }, | |
| { | |
| "epoch": 2.655884707766213, | |
| "grad_norm": 4.9701995849609375, | |
| "learning_rate": 3.026940020229882e-05, | |
| "loss": 49.6229, | |
| "step": 778 | |
| }, | |
| { | |
| "epoch": 2.6627168401387777, | |
| "grad_norm": 4.223094463348389, | |
| "learning_rate": 3.017988329489923e-05, | |
| "loss": 47.1613, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 2.6695489725113424, | |
| "grad_norm": 4.849906921386719, | |
| "learning_rate": 3.0090296941911633e-05, | |
| "loss": 47.5764, | |
| "step": 782 | |
| }, | |
| { | |
| "epoch": 2.676381104883907, | |
| "grad_norm": 3.507953643798828, | |
| "learning_rate": 3.0000642344401113e-05, | |
| "loss": 47.1944, | |
| "step": 784 | |
| }, | |
| { | |
| "epoch": 2.6832132372564717, | |
| "grad_norm": 4.040694713592529, | |
| "learning_rate": 2.9910920704347696e-05, | |
| "loss": 48.6472, | |
| "step": 786 | |
| }, | |
| { | |
| "epoch": 2.6900453696290363, | |
| "grad_norm": 5.141117095947266, | |
| "learning_rate": 2.9821133224630226e-05, | |
| "loss": 47.177, | |
| "step": 788 | |
| }, | |
| { | |
| "epoch": 2.6968775020016014, | |
| "grad_norm": 4.463181018829346, | |
| "learning_rate": 2.9731281109010256e-05, | |
| "loss": 47.4283, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 2.703709634374166, | |
| "grad_norm": 3.586456060409546, | |
| "learning_rate": 2.9641365562115887e-05, | |
| "loss": 48.9784, | |
| "step": 792 | |
| }, | |
| { | |
| "epoch": 2.7105417667467306, | |
| "grad_norm": 3.9780969619750977, | |
| "learning_rate": 2.9551387789425638e-05, | |
| "loss": 48.601, | |
| "step": 794 | |
| }, | |
| { | |
| "epoch": 2.7173738991192953, | |
| "grad_norm": 4.445759296417236, | |
| "learning_rate": 2.9461348997252265e-05, | |
| "loss": 49.9106, | |
| "step": 796 | |
| }, | |
| { | |
| "epoch": 2.7242060314918604, | |
| "grad_norm": 4.416858673095703, | |
| "learning_rate": 2.9371250392726614e-05, | |
| "loss": 48.3298, | |
| "step": 798 | |
| }, | |
| { | |
| "epoch": 2.731038163864425, | |
| "grad_norm": 4.36728572845459, | |
| "learning_rate": 2.9281093183781403e-05, | |
| "loss": 48.6063, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.731038163864425, | |
| "eval_loss": 0.7699871063232422, | |
| "eval_runtime": 119.5951, | |
| "eval_samples_per_second": 32.986, | |
| "eval_steps_per_second": 8.253, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.7378702962369896, | |
| "grad_norm": 5.540378570556641, | |
| "learning_rate": 2.919087857913508e-05, | |
| "loss": 49.4323, | |
| "step": 802 | |
| }, | |
| { | |
| "epoch": 2.7447024286095543, | |
| "grad_norm": 3.73681640625, | |
| "learning_rate": 2.9100607788275545e-05, | |
| "loss": 49.0439, | |
| "step": 804 | |
| }, | |
| { | |
| "epoch": 2.751534560982119, | |
| "grad_norm": 4.437684535980225, | |
| "learning_rate": 2.9010282021444008e-05, | |
| "loss": 48.8682, | |
| "step": 806 | |
| }, | |
| { | |
| "epoch": 2.7583666933546835, | |
| "grad_norm": 4.933871746063232, | |
| "learning_rate": 2.891990248961871e-05, | |
| "loss": 48.0791, | |
| "step": 808 | |
| }, | |
| { | |
| "epoch": 2.7651988257272486, | |
| "grad_norm": 4.351380825042725, | |
| "learning_rate": 2.8829470404498697e-05, | |
| "loss": 47.0584, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 2.7720309580998133, | |
| "grad_norm": 4.953640937805176, | |
| "learning_rate": 2.8738986978487625e-05, | |
| "loss": 50.0531, | |
| "step": 812 | |
| }, | |
| { | |
| "epoch": 2.778863090472378, | |
| "grad_norm": 3.676950216293335, | |
| "learning_rate": 2.8648453424677434e-05, | |
| "loss": 46.9994, | |
| "step": 814 | |
| }, | |
| { | |
| "epoch": 2.7856952228449425, | |
| "grad_norm": 4.177380084991455, | |
| "learning_rate": 2.8557870956832132e-05, | |
| "loss": 48.3932, | |
| "step": 816 | |
| }, | |
| { | |
| "epoch": 2.7925273552175076, | |
| "grad_norm": 4.177119731903076, | |
| "learning_rate": 2.846724078937149e-05, | |
| "loss": 48.2385, | |
| "step": 818 | |
| }, | |
| { | |
| "epoch": 2.7993594875900722, | |
| "grad_norm": 4.261831283569336, | |
| "learning_rate": 2.8376564137354795e-05, | |
| "loss": 48.813, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 2.806191619962637, | |
| "grad_norm": 3.7779037952423096, | |
| "learning_rate": 2.8285842216464543e-05, | |
| "loss": 48.801, | |
| "step": 822 | |
| }, | |
| { | |
| "epoch": 2.8130237523352015, | |
| "grad_norm": 5.378250598907471, | |
| "learning_rate": 2.8195076242990122e-05, | |
| "loss": 45.9584, | |
| "step": 824 | |
| }, | |
| { | |
| "epoch": 2.819855884707766, | |
| "grad_norm": 3.5369153022766113, | |
| "learning_rate": 2.8104267433811533e-05, | |
| "loss": 46.97, | |
| "step": 826 | |
| }, | |
| { | |
| "epoch": 2.826688017080331, | |
| "grad_norm": 3.493602991104126, | |
| "learning_rate": 2.8013417006383076e-05, | |
| "loss": 46.7352, | |
| "step": 828 | |
| }, | |
| { | |
| "epoch": 2.8335201494528954, | |
| "grad_norm": 5.41981840133667, | |
| "learning_rate": 2.7922526178717017e-05, | |
| "loss": 48.4586, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 2.8403522818254605, | |
| "grad_norm": 4.6053948402404785, | |
| "learning_rate": 2.783159616936723e-05, | |
| "loss": 46.5008, | |
| "step": 832 | |
| }, | |
| { | |
| "epoch": 2.847184414198025, | |
| "grad_norm": 4.136333465576172, | |
| "learning_rate": 2.774062819741293e-05, | |
| "loss": 47.3448, | |
| "step": 834 | |
| }, | |
| { | |
| "epoch": 2.85401654657059, | |
| "grad_norm": 3.927877187728882, | |
| "learning_rate": 2.764962348244228e-05, | |
| "loss": 46.7369, | |
| "step": 836 | |
| }, | |
| { | |
| "epoch": 2.8608486789431544, | |
| "grad_norm": 4.283491611480713, | |
| "learning_rate": 2.7558583244536007e-05, | |
| "loss": 48.098, | |
| "step": 838 | |
| }, | |
| { | |
| "epoch": 2.8676808113157195, | |
| "grad_norm": 3.802030563354492, | |
| "learning_rate": 2.7467508704251137e-05, | |
| "loss": 48.2908, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 2.874512943688284, | |
| "grad_norm": 5.212815761566162, | |
| "learning_rate": 2.7376401082604564e-05, | |
| "loss": 47.8921, | |
| "step": 842 | |
| }, | |
| { | |
| "epoch": 2.8813450760608488, | |
| "grad_norm": 4.39296293258667, | |
| "learning_rate": 2.7285261601056698e-05, | |
| "loss": 48.2491, | |
| "step": 844 | |
| }, | |
| { | |
| "epoch": 2.8881772084334134, | |
| "grad_norm": 5.428844928741455, | |
| "learning_rate": 2.7194091481495076e-05, | |
| "loss": 49.1209, | |
| "step": 846 | |
| }, | |
| { | |
| "epoch": 2.895009340805978, | |
| "grad_norm": 3.9836559295654297, | |
| "learning_rate": 2.7102891946217994e-05, | |
| "loss": 47.0515, | |
| "step": 848 | |
| }, | |
| { | |
| "epoch": 2.9018414731785427, | |
| "grad_norm": 3.1067824363708496, | |
| "learning_rate": 2.7011664217918154e-05, | |
| "loss": 46.0087, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.9018414731785427, | |
| "eval_loss": 0.760260820388794, | |
| "eval_runtime": 119.6698, | |
| "eval_samples_per_second": 32.966, | |
| "eval_steps_per_second": 8.248, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.9086736055511073, | |
| "grad_norm": 4.688024997711182, | |
| "learning_rate": 2.6920409519666174e-05, | |
| "loss": 47.0489, | |
| "step": 852 | |
| }, | |
| { | |
| "epoch": 2.9155057379236724, | |
| "grad_norm": 4.777935981750488, | |
| "learning_rate": 2.6829129074894304e-05, | |
| "loss": 48.1153, | |
| "step": 854 | |
| }, | |
| { | |
| "epoch": 2.922337870296237, | |
| "grad_norm": 4.912516117095947, | |
| "learning_rate": 2.6737824107379948e-05, | |
| "loss": 48.0798, | |
| "step": 856 | |
| }, | |
| { | |
| "epoch": 2.9291700026688017, | |
| "grad_norm": 4.066973686218262, | |
| "learning_rate": 2.6646495841229287e-05, | |
| "loss": 46.9194, | |
| "step": 858 | |
| }, | |
| { | |
| "epoch": 2.9360021350413663, | |
| "grad_norm": 4.499208927154541, | |
| "learning_rate": 2.655514550086086e-05, | |
| "loss": 48.3087, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 2.9428342674139314, | |
| "grad_norm": 4.891952991485596, | |
| "learning_rate": 2.6463774310989154e-05, | |
| "loss": 46.8565, | |
| "step": 862 | |
| }, | |
| { | |
| "epoch": 2.949666399786496, | |
| "grad_norm": 3.8262720108032227, | |
| "learning_rate": 2.637238349660819e-05, | |
| "loss": 46.7596, | |
| "step": 864 | |
| }, | |
| { | |
| "epoch": 2.9564985321590607, | |
| "grad_norm": 5.6072492599487305, | |
| "learning_rate": 2.6280974282975063e-05, | |
| "loss": 45.254, | |
| "step": 866 | |
| }, | |
| { | |
| "epoch": 2.9633306645316253, | |
| "grad_norm": 3.9889800548553467, | |
| "learning_rate": 2.6189547895593562e-05, | |
| "loss": 46.754, | |
| "step": 868 | |
| }, | |
| { | |
| "epoch": 2.97016279690419, | |
| "grad_norm": 3.7260525226593018, | |
| "learning_rate": 2.6098105560197722e-05, | |
| "loss": 46.6516, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 2.9769949292767546, | |
| "grad_norm": 4.090394973754883, | |
| "learning_rate": 2.600664850273538e-05, | |
| "loss": 47.2404, | |
| "step": 872 | |
| }, | |
| { | |
| "epoch": 2.983827061649319, | |
| "grad_norm": 3.6287267208099365, | |
| "learning_rate": 2.5915177949351765e-05, | |
| "loss": 46.3821, | |
| "step": 874 | |
| }, | |
| { | |
| "epoch": 2.9906591940218843, | |
| "grad_norm": 3.5229976177215576, | |
| "learning_rate": 2.582369512637302e-05, | |
| "loss": 46.8471, | |
| "step": 876 | |
| }, | |
| { | |
| "epoch": 2.997491326394449, | |
| "grad_norm": 3.532615900039673, | |
| "learning_rate": 2.5732201260289806e-05, | |
| "loss": 47.0364, | |
| "step": 878 | |
| }, | |
| { | |
| "epoch": 3.0034160661862823, | |
| "grad_norm": 3.482403039932251, | |
| "learning_rate": 2.564069757774082e-05, | |
| "loss": 40.3241, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 3.010248198558847, | |
| "grad_norm": 3.94649600982666, | |
| "learning_rate": 2.554918530549637e-05, | |
| "loss": 46.7226, | |
| "step": 882 | |
| }, | |
| { | |
| "epoch": 3.0170803309314116, | |
| "grad_norm": 4.395301818847656, | |
| "learning_rate": 2.545766567044194e-05, | |
| "loss": 45.266, | |
| "step": 884 | |
| }, | |
| { | |
| "epoch": 3.0239124633039767, | |
| "grad_norm": 4.813998699188232, | |
| "learning_rate": 2.5366139899561696e-05, | |
| "loss": 46.8651, | |
| "step": 886 | |
| }, | |
| { | |
| "epoch": 3.0307445956765413, | |
| "grad_norm": 5.5799174308776855, | |
| "learning_rate": 2.527460921992209e-05, | |
| "loss": 46.5727, | |
| "step": 888 | |
| }, | |
| { | |
| "epoch": 3.037576728049106, | |
| "grad_norm": 6.693199634552002, | |
| "learning_rate": 2.518307485865538e-05, | |
| "loss": 47.987, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 3.0444088604216706, | |
| "grad_norm": 6.33953332901001, | |
| "learning_rate": 2.509153804294318e-05, | |
| "loss": 45.7221, | |
| "step": 892 | |
| }, | |
| { | |
| "epoch": 3.051240992794235, | |
| "grad_norm": 4.887784957885742, | |
| "learning_rate": 2.5e-05, | |
| "loss": 44.5186, | |
| "step": 894 | |
| }, | |
| { | |
| "epoch": 3.0580731251668003, | |
| "grad_norm": 4.337290287017822, | |
| "learning_rate": 2.490846195705683e-05, | |
| "loss": 46.394, | |
| "step": 896 | |
| }, | |
| { | |
| "epoch": 3.064905257539365, | |
| "grad_norm": 3.7094030380249023, | |
| "learning_rate": 2.4816925141344623e-05, | |
| "loss": 45.122, | |
| "step": 898 | |
| }, | |
| { | |
| "epoch": 3.0717373899119296, | |
| "grad_norm": 3.71903920173645, | |
| "learning_rate": 2.4725390780077908e-05, | |
| "loss": 44.7121, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 3.0717373899119296, | |
| "eval_loss": 0.7495905160903931, | |
| "eval_runtime": 119.7503, | |
| "eval_samples_per_second": 32.944, | |
| "eval_steps_per_second": 8.242, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 3.078569522284494, | |
| "grad_norm": 4.690406799316406, | |
| "learning_rate": 2.4633860100438316e-05, | |
| "loss": 45.6299, | |
| "step": 902 | |
| }, | |
| { | |
| "epoch": 3.085401654657059, | |
| "grad_norm": 4.29756498336792, | |
| "learning_rate": 2.4542334329558077e-05, | |
| "loss": 48.2504, | |
| "step": 904 | |
| }, | |
| { | |
| "epoch": 3.092233787029624, | |
| "grad_norm": 5.62404727935791, | |
| "learning_rate": 2.4450814694503636e-05, | |
| "loss": 47.6091, | |
| "step": 906 | |
| }, | |
| { | |
| "epoch": 3.0990659194021886, | |
| "grad_norm": 3.726529836654663, | |
| "learning_rate": 2.435930242225919e-05, | |
| "loss": 46.4755, | |
| "step": 908 | |
| }, | |
| { | |
| "epoch": 3.105898051774753, | |
| "grad_norm": 6.04416036605835, | |
| "learning_rate": 2.4267798739710203e-05, | |
| "loss": 46.9715, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 3.112730184147318, | |
| "grad_norm": 3.8375885486602783, | |
| "learning_rate": 2.4176304873626985e-05, | |
| "loss": 47.9794, | |
| "step": 912 | |
| }, | |
| { | |
| "epoch": 3.1195623165198825, | |
| "grad_norm": 3.296687602996826, | |
| "learning_rate": 2.4084822050648237e-05, | |
| "loss": 45.0776, | |
| "step": 914 | |
| }, | |
| { | |
| "epoch": 3.126394448892447, | |
| "grad_norm": 3.546963930130005, | |
| "learning_rate": 2.399335149726463e-05, | |
| "loss": 44.6584, | |
| "step": 916 | |
| }, | |
| { | |
| "epoch": 3.133226581265012, | |
| "grad_norm": 3.896601676940918, | |
| "learning_rate": 2.390189443980229e-05, | |
| "loss": 47.0284, | |
| "step": 918 | |
| }, | |
| { | |
| "epoch": 3.140058713637577, | |
| "grad_norm": 3.570570468902588, | |
| "learning_rate": 2.3810452104406444e-05, | |
| "loss": 46.4413, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 3.1468908460101415, | |
| "grad_norm": 4.160488605499268, | |
| "learning_rate": 2.3719025717024946e-05, | |
| "loss": 47.1564, | |
| "step": 922 | |
| }, | |
| { | |
| "epoch": 3.153722978382706, | |
| "grad_norm": 5.714613914489746, | |
| "learning_rate": 2.3627616503391814e-05, | |
| "loss": 48.2275, | |
| "step": 924 | |
| }, | |
| { | |
| "epoch": 3.1605551107552707, | |
| "grad_norm": 4.362124919891357, | |
| "learning_rate": 2.3536225689010845e-05, | |
| "loss": 47.0592, | |
| "step": 926 | |
| }, | |
| { | |
| "epoch": 3.167387243127836, | |
| "grad_norm": 6.478647708892822, | |
| "learning_rate": 2.3444854499139142e-05, | |
| "loss": 47.4139, | |
| "step": 928 | |
| }, | |
| { | |
| "epoch": 3.1742193755004005, | |
| "grad_norm": 3.713979721069336, | |
| "learning_rate": 2.3353504158770722e-05, | |
| "loss": 47.7301, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 3.181051507872965, | |
| "grad_norm": 3.875537872314453, | |
| "learning_rate": 2.3262175892620065e-05, | |
| "loss": 45.6112, | |
| "step": 932 | |
| }, | |
| { | |
| "epoch": 3.1878836402455297, | |
| "grad_norm": 5.328731536865234, | |
| "learning_rate": 2.3170870925105702e-05, | |
| "loss": 46.6125, | |
| "step": 934 | |
| }, | |
| { | |
| "epoch": 3.1947157726180944, | |
| "grad_norm": 5.152383327484131, | |
| "learning_rate": 2.307959048033383e-05, | |
| "loss": 45.6076, | |
| "step": 936 | |
| }, | |
| { | |
| "epoch": 3.201547904990659, | |
| "grad_norm": 4.689112186431885, | |
| "learning_rate": 2.2988335782081855e-05, | |
| "loss": 45.648, | |
| "step": 938 | |
| }, | |
| { | |
| "epoch": 3.208380037363224, | |
| "grad_norm": 3.3412325382232666, | |
| "learning_rate": 2.2897108053782e-05, | |
| "loss": 44.4993, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 3.2152121697357887, | |
| "grad_norm": 11.583976745605469, | |
| "learning_rate": 2.280590851850493e-05, | |
| "loss": 46.3174, | |
| "step": 942 | |
| }, | |
| { | |
| "epoch": 3.2220443021083534, | |
| "grad_norm": 4.012174606323242, | |
| "learning_rate": 2.271473839894331e-05, | |
| "loss": 46.3054, | |
| "step": 944 | |
| }, | |
| { | |
| "epoch": 3.228876434480918, | |
| "grad_norm": 6.315187931060791, | |
| "learning_rate": 2.2623598917395438e-05, | |
| "loss": 44.3273, | |
| "step": 946 | |
| }, | |
| { | |
| "epoch": 3.2357085668534826, | |
| "grad_norm": 5.612927436828613, | |
| "learning_rate": 2.253249129574887e-05, | |
| "loss": 46.8669, | |
| "step": 948 | |
| }, | |
| { | |
| "epoch": 3.2425406992260477, | |
| "grad_norm": 3.7026705741882324, | |
| "learning_rate": 2.2441416755463995e-05, | |
| "loss": 46.4012, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 3.2425406992260477, | |
| "eval_loss": 0.7383518218994141, | |
| "eval_runtime": 118.6959, | |
| "eval_samples_per_second": 33.236, | |
| "eval_steps_per_second": 8.315, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 3.2493728315986123, | |
| "grad_norm": 4.251457214355469, | |
| "learning_rate": 2.2350376517557727e-05, | |
| "loss": 47.1319, | |
| "step": 952 | |
| }, | |
| { | |
| "epoch": 3.256204963971177, | |
| "grad_norm": 4.500071048736572, | |
| "learning_rate": 2.2259371802587068e-05, | |
| "loss": 47.0883, | |
| "step": 954 | |
| }, | |
| { | |
| "epoch": 3.2630370963437416, | |
| "grad_norm": 4.684493064880371, | |
| "learning_rate": 2.216840383063277e-05, | |
| "loss": 45.0587, | |
| "step": 956 | |
| }, | |
| { | |
| "epoch": 3.2698692287163063, | |
| "grad_norm": 3.853529453277588, | |
| "learning_rate": 2.2077473821282996e-05, | |
| "loss": 46.3262, | |
| "step": 958 | |
| }, | |
| { | |
| "epoch": 3.276701361088871, | |
| "grad_norm": 5.501523971557617, | |
| "learning_rate": 2.1986582993616926e-05, | |
| "loss": 44.8375, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 3.283533493461436, | |
| "grad_norm": 15.540706634521484, | |
| "learning_rate": 2.1895732566188476e-05, | |
| "loss": 45.117, | |
| "step": 962 | |
| }, | |
| { | |
| "epoch": 3.2903656258340006, | |
| "grad_norm": 2.6855862140655518, | |
| "learning_rate": 2.1804923757009884e-05, | |
| "loss": 45.9567, | |
| "step": 964 | |
| }, | |
| { | |
| "epoch": 3.2971977582065652, | |
| "grad_norm": 4.529240131378174, | |
| "learning_rate": 2.1714157783535463e-05, | |
| "loss": 44.7532, | |
| "step": 966 | |
| }, | |
| { | |
| "epoch": 3.30402989057913, | |
| "grad_norm": 4.690282344818115, | |
| "learning_rate": 2.1623435862645204e-05, | |
| "loss": 45.8376, | |
| "step": 968 | |
| }, | |
| { | |
| "epoch": 3.3108620229516945, | |
| "grad_norm": 5.309507846832275, | |
| "learning_rate": 2.153275921062851e-05, | |
| "loss": 46.1757, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 3.3176941553242596, | |
| "grad_norm": 4.278385639190674, | |
| "learning_rate": 2.1442129043167874e-05, | |
| "loss": 46.6388, | |
| "step": 972 | |
| }, | |
| { | |
| "epoch": 3.3245262876968242, | |
| "grad_norm": 4.2424516677856445, | |
| "learning_rate": 2.1351546575322572e-05, | |
| "loss": 45.1695, | |
| "step": 974 | |
| }, | |
| { | |
| "epoch": 3.331358420069389, | |
| "grad_norm": 3.695155143737793, | |
| "learning_rate": 2.126101302151238e-05, | |
| "loss": 45.9417, | |
| "step": 976 | |
| }, | |
| { | |
| "epoch": 3.3381905524419535, | |
| "grad_norm": 4.2003374099731445, | |
| "learning_rate": 2.1170529595501305e-05, | |
| "loss": 44.4002, | |
| "step": 978 | |
| }, | |
| { | |
| "epoch": 3.345022684814518, | |
| "grad_norm": 4.378734588623047, | |
| "learning_rate": 2.1080097510381298e-05, | |
| "loss": 45.4517, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 3.351854817187083, | |
| "grad_norm": 3.96730637550354, | |
| "learning_rate": 2.098971797855599e-05, | |
| "loss": 43.9996, | |
| "step": 982 | |
| }, | |
| { | |
| "epoch": 3.358686949559648, | |
| "grad_norm": 3.6162188053131104, | |
| "learning_rate": 2.089939221172446e-05, | |
| "loss": 43.9178, | |
| "step": 984 | |
| }, | |
| { | |
| "epoch": 3.3655190819322125, | |
| "grad_norm": 4.3834099769592285, | |
| "learning_rate": 2.0809121420864923e-05, | |
| "loss": 46.2701, | |
| "step": 986 | |
| }, | |
| { | |
| "epoch": 3.372351214304777, | |
| "grad_norm": 4.271561145782471, | |
| "learning_rate": 2.07189068162186e-05, | |
| "loss": 45.7546, | |
| "step": 988 | |
| }, | |
| { | |
| "epoch": 3.3791833466773418, | |
| "grad_norm": 3.5791757106781006, | |
| "learning_rate": 2.0628749607273396e-05, | |
| "loss": 45.3079, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 3.3860154790499064, | |
| "grad_norm": 4.5101318359375, | |
| "learning_rate": 2.0538651002747744e-05, | |
| "loss": 46.5476, | |
| "step": 992 | |
| }, | |
| { | |
| "epoch": 3.3928476114224715, | |
| "grad_norm": 5.944687366485596, | |
| "learning_rate": 2.0448612210574365e-05, | |
| "loss": 44.0355, | |
| "step": 994 | |
| }, | |
| { | |
| "epoch": 3.399679743795036, | |
| "grad_norm": 4.936254501342773, | |
| "learning_rate": 2.0358634437884112e-05, | |
| "loss": 46.0717, | |
| "step": 996 | |
| }, | |
| { | |
| "epoch": 3.4065118761676008, | |
| "grad_norm": 4.114757537841797, | |
| "learning_rate": 2.0268718890989753e-05, | |
| "loss": 44.5295, | |
| "step": 998 | |
| }, | |
| { | |
| "epoch": 3.4133440085401654, | |
| "grad_norm": 8.12585735321045, | |
| "learning_rate": 2.0178866775369777e-05, | |
| "loss": 45.0747, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 3.4133440085401654, | |
| "eval_loss": 0.7275528907775879, | |
| "eval_runtime": 119.5885, | |
| "eval_samples_per_second": 32.988, | |
| "eval_steps_per_second": 8.253, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 3.4304243394715774, | |
| "grad_norm": 4.9336113929748535, | |
| "learning_rate": 2.0089079295652306e-05, | |
| "loss": 45.5736, | |
| "step": 1002 | |
| }, | |
| { | |
| "epoch": 3.437256471844142, | |
| "grad_norm": 5.042412757873535, | |
| "learning_rate": 1.9999357655598893e-05, | |
| "loss": 45.6651, | |
| "step": 1004 | |
| }, | |
| { | |
| "epoch": 3.4440886042167067, | |
| "grad_norm": 3.9377660751342773, | |
| "learning_rate": 1.9909703058088376e-05, | |
| "loss": 44.5559, | |
| "step": 1006 | |
| }, | |
| { | |
| "epoch": 3.4509207365892713, | |
| "grad_norm": 4.054321765899658, | |
| "learning_rate": 1.9820116705100777e-05, | |
| "loss": 45.1868, | |
| "step": 1008 | |
| }, | |
| { | |
| "epoch": 3.457752868961836, | |
| "grad_norm": 4.860738277435303, | |
| "learning_rate": 1.9730599797701177e-05, | |
| "loss": 44.6737, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 3.4645850013344006, | |
| "grad_norm": 3.950925827026367, | |
| "learning_rate": 1.9641153536023644e-05, | |
| "loss": 43.7733, | |
| "step": 1012 | |
| }, | |
| { | |
| "epoch": 3.4714171337069657, | |
| "grad_norm": 3.831669569015503, | |
| "learning_rate": 1.9551779119255043e-05, | |
| "loss": 43.7403, | |
| "step": 1014 | |
| }, | |
| { | |
| "epoch": 3.4782492660795303, | |
| "grad_norm": 4.114947319030762, | |
| "learning_rate": 1.9462477745619108e-05, | |
| "loss": 45.5074, | |
| "step": 1016 | |
| }, | |
| { | |
| "epoch": 3.485081398452095, | |
| "grad_norm": 3.405243158340454, | |
| "learning_rate": 1.9373250612360246e-05, | |
| "loss": 46.4417, | |
| "step": 1018 | |
| }, | |
| { | |
| "epoch": 3.4919135308246596, | |
| "grad_norm": 4.80495023727417, | |
| "learning_rate": 1.928409891572757e-05, | |
| "loss": 44.9758, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 3.4987456631972247, | |
| "grad_norm": 4.239831447601318, | |
| "learning_rate": 1.919502385095881e-05, | |
| "loss": 44.6174, | |
| "step": 1022 | |
| }, | |
| { | |
| "epoch": 3.5055777955697893, | |
| "grad_norm": 4.724026203155518, | |
| "learning_rate": 1.9106026612264316e-05, | |
| "loss": 44.7325, | |
| "step": 1024 | |
| }, | |
| { | |
| "epoch": 3.512409927942354, | |
| "grad_norm": 3.4634554386138916, | |
| "learning_rate": 1.9017108392811065e-05, | |
| "loss": 43.7796, | |
| "step": 1026 | |
| }, | |
| { | |
| "epoch": 3.5192420603149186, | |
| "grad_norm": 4.715716361999512, | |
| "learning_rate": 1.8928270384706584e-05, | |
| "loss": 45.2777, | |
| "step": 1028 | |
| }, | |
| { | |
| "epoch": 3.5260741926874832, | |
| "grad_norm": 5.100541114807129, | |
| "learning_rate": 1.8839513778983066e-05, | |
| "loss": 46.4359, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 3.532906325060048, | |
| "grad_norm": 4.475189685821533, | |
| "learning_rate": 1.875083976558136e-05, | |
| "loss": 44.0298, | |
| "step": 1032 | |
| }, | |
| { | |
| "epoch": 3.5397384574326125, | |
| "grad_norm": 4.431650161743164, | |
| "learning_rate": 1.8662249533335003e-05, | |
| "loss": 44.2631, | |
| "step": 1034 | |
| }, | |
| { | |
| "epoch": 3.5465705898051776, | |
| "grad_norm": 4.561038970947266, | |
| "learning_rate": 1.8573744269954298e-05, | |
| "loss": 43.9968, | |
| "step": 1036 | |
| }, | |
| { | |
| "epoch": 3.5534027221777422, | |
| "grad_norm": 3.4181675910949707, | |
| "learning_rate": 1.848532516201039e-05, | |
| "loss": 43.372, | |
| "step": 1038 | |
| }, | |
| { | |
| "epoch": 3.560234854550307, | |
| "grad_norm": 4.05961799621582, | |
| "learning_rate": 1.8396993394919372e-05, | |
| "loss": 43.5887, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 3.5670669869228715, | |
| "grad_norm": 4.183586597442627, | |
| "learning_rate": 1.8308750152926337e-05, | |
| "loss": 43.1976, | |
| "step": 1042 | |
| }, | |
| { | |
| "epoch": 3.5738991192954366, | |
| "grad_norm": 4.6883745193481445, | |
| "learning_rate": 1.8220596619089576e-05, | |
| "loss": 44.4463, | |
| "step": 1044 | |
| }, | |
| { | |
| "epoch": 3.580731251668001, | |
| "grad_norm": 4.490588665008545, | |
| "learning_rate": 1.8132533975264682e-05, | |
| "loss": 44.3332, | |
| "step": 1046 | |
| }, | |
| { | |
| "epoch": 3.587563384040566, | |
| "grad_norm": 4.937854766845703, | |
| "learning_rate": 1.8044563402088684e-05, | |
| "loss": 45.1199, | |
| "step": 1048 | |
| }, | |
| { | |
| "epoch": 3.5943955164131305, | |
| "grad_norm": 3.8182907104492188, | |
| "learning_rate": 1.795668607896426e-05, | |
| "loss": 45.2035, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 3.5943955164131305, | |
| "eval_loss": 0.7135393619537354, | |
| "eval_runtime": 130.7813, | |
| "eval_samples_per_second": 30.165, | |
| "eval_steps_per_second": 7.547, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 3.601227648785695, | |
| "grad_norm": 3.3739826679229736, | |
| "learning_rate": 1.7868903184043887e-05, | |
| "loss": 43.5257, | |
| "step": 1052 | |
| }, | |
| { | |
| "epoch": 3.6080597811582598, | |
| "grad_norm": 3.8119192123413086, | |
| "learning_rate": 1.7781215894214078e-05, | |
| "loss": 44.9718, | |
| "step": 1054 | |
| }, | |
| { | |
| "epoch": 3.6148919135308244, | |
| "grad_norm": 3.6780483722686768, | |
| "learning_rate": 1.7693625385079577e-05, | |
| "loss": 44.496, | |
| "step": 1056 | |
| }, | |
| { | |
| "epoch": 3.6217240459033895, | |
| "grad_norm": 4.625596523284912, | |
| "learning_rate": 1.7606132830947614e-05, | |
| "loss": 43.6496, | |
| "step": 1058 | |
| }, | |
| { | |
| "epoch": 3.628556178275954, | |
| "grad_norm": 5.467988967895508, | |
| "learning_rate": 1.7518739404812155e-05, | |
| "loss": 45.3773, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 3.6353883106485188, | |
| "grad_norm": 3.7848103046417236, | |
| "learning_rate": 1.7431446278338197e-05, | |
| "loss": 43.6622, | |
| "step": 1062 | |
| }, | |
| { | |
| "epoch": 3.6422204430210834, | |
| "grad_norm": 6.2495222091674805, | |
| "learning_rate": 1.7344254621846016e-05, | |
| "loss": 44.7325, | |
| "step": 1064 | |
| }, | |
| { | |
| "epoch": 3.6490525753936485, | |
| "grad_norm": 4.541433811187744, | |
| "learning_rate": 1.7257165604295513e-05, | |
| "loss": 45.7111, | |
| "step": 1066 | |
| }, | |
| { | |
| "epoch": 3.655884707766213, | |
| "grad_norm": 3.6900789737701416, | |
| "learning_rate": 1.7170180393270532e-05, | |
| "loss": 46.2799, | |
| "step": 1068 | |
| }, | |
| { | |
| "epoch": 3.6627168401387777, | |
| "grad_norm": 3.999112129211426, | |
| "learning_rate": 1.7083300154963193e-05, | |
| "loss": 44.9348, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 3.6695489725113424, | |
| "grad_norm": 4.940526008605957, | |
| "learning_rate": 1.699652605415828e-05, | |
| "loss": 45.9208, | |
| "step": 1072 | |
| }, | |
| { | |
| "epoch": 3.676381104883907, | |
| "grad_norm": 3.8536486625671387, | |
| "learning_rate": 1.6909859254217613e-05, | |
| "loss": 45.3559, | |
| "step": 1074 | |
| }, | |
| { | |
| "epoch": 3.6832132372564717, | |
| "grad_norm": 5.941255569458008, | |
| "learning_rate": 1.682330091706446e-05, | |
| "loss": 44.2183, | |
| "step": 1076 | |
| }, | |
| { | |
| "epoch": 3.6900453696290363, | |
| "grad_norm": 4.6851091384887695, | |
| "learning_rate": 1.6736852203167935e-05, | |
| "loss": 45.0132, | |
| "step": 1078 | |
| }, | |
| { | |
| "epoch": 3.6968775020016014, | |
| "grad_norm": 6.338913917541504, | |
| "learning_rate": 1.6650514271527468e-05, | |
| "loss": 44.5087, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 3.703709634374166, | |
| "grad_norm": 6.134509086608887, | |
| "learning_rate": 1.6564288279657252e-05, | |
| "loss": 44.5929, | |
| "step": 1082 | |
| }, | |
| { | |
| "epoch": 3.7105417667467306, | |
| "grad_norm": 3.0185976028442383, | |
| "learning_rate": 1.647817538357072e-05, | |
| "loss": 44.4708, | |
| "step": 1084 | |
| }, | |
| { | |
| "epoch": 3.7173738991192953, | |
| "grad_norm": 4.479791641235352, | |
| "learning_rate": 1.639217673776507e-05, | |
| "loss": 44.4799, | |
| "step": 1086 | |
| }, | |
| { | |
| "epoch": 3.7242060314918604, | |
| "grad_norm": 3.9354395866394043, | |
| "learning_rate": 1.630629349520576e-05, | |
| "loss": 43.3393, | |
| "step": 1088 | |
| }, | |
| { | |
| "epoch": 3.731038163864425, | |
| "grad_norm": 4.530430316925049, | |
| "learning_rate": 1.622052680731105e-05, | |
| "loss": 43.1996, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 3.7378702962369896, | |
| "grad_norm": 4.594604015350342, | |
| "learning_rate": 1.613487782393661e-05, | |
| "loss": 43.6473, | |
| "step": 1092 | |
| }, | |
| { | |
| "epoch": 3.7447024286095543, | |
| "grad_norm": 4.38798713684082, | |
| "learning_rate": 1.604934769336004e-05, | |
| "loss": 43.1229, | |
| "step": 1094 | |
| }, | |
| { | |
| "epoch": 3.751534560982119, | |
| "grad_norm": 4.350236415863037, | |
| "learning_rate": 1.5963937562265525e-05, | |
| "loss": 44.7883, | |
| "step": 1096 | |
| }, | |
| { | |
| "epoch": 3.7583666933546835, | |
| "grad_norm": 4.064984321594238, | |
| "learning_rate": 1.587864857572842e-05, | |
| "loss": 44.1865, | |
| "step": 1098 | |
| }, | |
| { | |
| "epoch": 3.7651988257272486, | |
| "grad_norm": 4.607226848602295, | |
| "learning_rate": 1.5793481877199946e-05, | |
| "loss": 44.6176, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 3.7651988257272486, | |
| "eval_loss": 0.7090520858764648, | |
| "eval_runtime": 136.3013, | |
| "eval_samples_per_second": 28.943, | |
| "eval_steps_per_second": 7.241, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 3.7720309580998133, | |
| "grad_norm": 4.4557719230651855, | |
| "learning_rate": 1.5708438608491814e-05, | |
| "loss": 42.0453, | |
| "step": 1102 | |
| }, | |
| { | |
| "epoch": 3.778863090472378, | |
| "grad_norm": 5.199422359466553, | |
| "learning_rate": 1.5623519909760954e-05, | |
| "loss": 42.589, | |
| "step": 1104 | |
| }, | |
| { | |
| "epoch": 3.7856952228449425, | |
| "grad_norm": 3.632471799850464, | |
| "learning_rate": 1.5538726919494206e-05, | |
| "loss": 43.7924, | |
| "step": 1106 | |
| }, | |
| { | |
| "epoch": 3.7925273552175076, | |
| "grad_norm": 4.203450679779053, | |
| "learning_rate": 1.5454060774493068e-05, | |
| "loss": 45.02, | |
| "step": 1108 | |
| }, | |
| { | |
| "epoch": 3.7993594875900722, | |
| "grad_norm": 5.149316310882568, | |
| "learning_rate": 1.5369522609858446e-05, | |
| "loss": 44.2724, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 3.806191619962637, | |
| "grad_norm": 3.5306341648101807, | |
| "learning_rate": 1.528511355897543e-05, | |
| "loss": 44.2268, | |
| "step": 1112 | |
| }, | |
| { | |
| "epoch": 3.8130237523352015, | |
| "grad_norm": 4.296536445617676, | |
| "learning_rate": 1.5200834753498128e-05, | |
| "loss": 44.0479, | |
| "step": 1114 | |
| }, | |
| { | |
| "epoch": 3.819855884707766, | |
| "grad_norm": 2.969525098800659, | |
| "learning_rate": 1.5116687323334467e-05, | |
| "loss": 43.5543, | |
| "step": 1116 | |
| }, | |
| { | |
| "epoch": 3.826688017080331, | |
| "grad_norm": 4.044551849365234, | |
| "learning_rate": 1.5032672396631056e-05, | |
| "loss": 45.7925, | |
| "step": 1118 | |
| }, | |
| { | |
| "epoch": 3.8335201494528954, | |
| "grad_norm": 5.003629207611084, | |
| "learning_rate": 1.4948791099758052e-05, | |
| "loss": 44.2037, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 3.8403522818254605, | |
| "grad_norm": 3.4248318672180176, | |
| "learning_rate": 1.486504455729408e-05, | |
| "loss": 43.9243, | |
| "step": 1122 | |
| }, | |
| { | |
| "epoch": 3.847184414198025, | |
| "grad_norm": 4.228148937225342, | |
| "learning_rate": 1.4781433892011131e-05, | |
| "loss": 44.7779, | |
| "step": 1124 | |
| }, | |
| { | |
| "epoch": 3.85401654657059, | |
| "grad_norm": 4.345002174377441, | |
| "learning_rate": 1.4697960224859513e-05, | |
| "loss": 43.0617, | |
| "step": 1126 | |
| }, | |
| { | |
| "epoch": 3.8608486789431544, | |
| "grad_norm": 4.824610233306885, | |
| "learning_rate": 1.4614624674952842e-05, | |
| "loss": 43.2687, | |
| "step": 1128 | |
| }, | |
| { | |
| "epoch": 3.8676808113157195, | |
| "grad_norm": 5.528540134429932, | |
| "learning_rate": 1.4531428359553017e-05, | |
| "loss": 43.5145, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 3.874512943688284, | |
| "grad_norm": 3.7578537464141846, | |
| "learning_rate": 1.4448372394055249e-05, | |
| "loss": 43.2377, | |
| "step": 1132 | |
| }, | |
| { | |
| "epoch": 3.8813450760608488, | |
| "grad_norm": 3.191563367843628, | |
| "learning_rate": 1.436545789197313e-05, | |
| "loss": 43.493, | |
| "step": 1134 | |
| }, | |
| { | |
| "epoch": 3.8881772084334134, | |
| "grad_norm": 3.1072089672088623, | |
| "learning_rate": 1.4282685964923642e-05, | |
| "loss": 44.5567, | |
| "step": 1136 | |
| }, | |
| { | |
| "epoch": 3.895009340805978, | |
| "grad_norm": 4.651160717010498, | |
| "learning_rate": 1.4200057722612336e-05, | |
| "loss": 42.7739, | |
| "step": 1138 | |
| }, | |
| { | |
| "epoch": 3.9018414731785427, | |
| "grad_norm": 3.203441858291626, | |
| "learning_rate": 1.4117574272818388e-05, | |
| "loss": 43.1438, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 3.9086736055511073, | |
| "grad_norm": 4.5728349685668945, | |
| "learning_rate": 1.4035236721379757e-05, | |
| "loss": 44.305, | |
| "step": 1142 | |
| }, | |
| { | |
| "epoch": 3.9155057379236724, | |
| "grad_norm": 6.874294757843018, | |
| "learning_rate": 1.3953046172178414e-05, | |
| "loss": 42.8162, | |
| "step": 1144 | |
| }, | |
| { | |
| "epoch": 3.922337870296237, | |
| "grad_norm": 5.198761463165283, | |
| "learning_rate": 1.387100372712548e-05, | |
| "loss": 44.2441, | |
| "step": 1146 | |
| }, | |
| { | |
| "epoch": 3.9291700026688017, | |
| "grad_norm": 3.9007508754730225, | |
| "learning_rate": 1.378911048614647e-05, | |
| "loss": 43.0147, | |
| "step": 1148 | |
| }, | |
| { | |
| "epoch": 3.9360021350413663, | |
| "grad_norm": 3.7035725116729736, | |
| "learning_rate": 1.3707367547166569e-05, | |
| "loss": 45.0733, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 3.9360021350413663, | |
| "eval_loss": 0.7048025131225586, | |
| "eval_runtime": 132.7997, | |
| "eval_samples_per_second": 29.706, | |
| "eval_steps_per_second": 7.432, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 3.9428342674139314, | |
| "grad_norm": 5.101466655731201, | |
| "learning_rate": 1.3625776006095881e-05, | |
| "loss": 42.4982, | |
| "step": 1152 | |
| }, | |
| { | |
| "epoch": 3.949666399786496, | |
| "grad_norm": 4.983183860778809, | |
| "learning_rate": 1.354433695681474e-05, | |
| "loss": 43.3568, | |
| "step": 1154 | |
| }, | |
| { | |
| "epoch": 3.9564985321590607, | |
| "grad_norm": 3.6875593662261963, | |
| "learning_rate": 1.3463051491159096e-05, | |
| "loss": 45.16, | |
| "step": 1156 | |
| }, | |
| { | |
| "epoch": 3.9633306645316253, | |
| "grad_norm": 4.482807636260986, | |
| "learning_rate": 1.3381920698905787e-05, | |
| "loss": 42.8545, | |
| "step": 1158 | |
| }, | |
| { | |
| "epoch": 3.97016279690419, | |
| "grad_norm": 3.858903646469116, | |
| "learning_rate": 1.3300945667758014e-05, | |
| "loss": 42.5779, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 3.9769949292767546, | |
| "grad_norm": 5.07602596282959, | |
| "learning_rate": 1.3220127483330713e-05, | |
| "loss": 43.8678, | |
| "step": 1162 | |
| }, | |
| { | |
| "epoch": 3.983827061649319, | |
| "grad_norm": 5.183884620666504, | |
| "learning_rate": 1.3139467229135999e-05, | |
| "loss": 44.2575, | |
| "step": 1164 | |
| }, | |
| { | |
| "epoch": 3.9906591940218843, | |
| "grad_norm": 5.44564962387085, | |
| "learning_rate": 1.3058965986568648e-05, | |
| "loss": 42.0898, | |
| "step": 1166 | |
| }, | |
| { | |
| "epoch": 3.997491326394449, | |
| "grad_norm": 3.4175875186920166, | |
| "learning_rate": 1.2978624834891628e-05, | |
| "loss": 43.526, | |
| "step": 1168 | |
| }, | |
| { | |
| "epoch": 4.006832132372565, | |
| "grad_norm": 5.1483588218688965, | |
| "learning_rate": 1.2898444851221565e-05, | |
| "loss": 60.1634, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 4.013664264745129, | |
| "grad_norm": 4.452287673950195, | |
| "learning_rate": 1.281842711051438e-05, | |
| "loss": 41.7569, | |
| "step": 1172 | |
| }, | |
| { | |
| "epoch": 4.020496397117694, | |
| "grad_norm": 4.024214267730713, | |
| "learning_rate": 1.2738572685550799e-05, | |
| "loss": 44.7667, | |
| "step": 1174 | |
| }, | |
| { | |
| "epoch": 4.0273285294902585, | |
| "grad_norm": 5.533107757568359, | |
| "learning_rate": 1.2658882646922034e-05, | |
| "loss": 43.7144, | |
| "step": 1176 | |
| }, | |
| { | |
| "epoch": 4.034160661862823, | |
| "grad_norm": 4.520675182342529, | |
| "learning_rate": 1.2579358063015418e-05, | |
| "loss": 43.3862, | |
| "step": 1178 | |
| }, | |
| { | |
| "epoch": 4.040992794235389, | |
| "grad_norm": 4.086079120635986, | |
| "learning_rate": 1.2500000000000006e-05, | |
| "loss": 44.268, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 4.047824926607953, | |
| "grad_norm": 3.335569381713867, | |
| "learning_rate": 1.2420809521812404e-05, | |
| "loss": 43.1871, | |
| "step": 1182 | |
| }, | |
| { | |
| "epoch": 4.054657058980518, | |
| "grad_norm": 4.651849746704102, | |
| "learning_rate": 1.2341787690142437e-05, | |
| "loss": 43.4785, | |
| "step": 1184 | |
| }, | |
| { | |
| "epoch": 4.061489191353083, | |
| "grad_norm": 3.9412457942962646, | |
| "learning_rate": 1.2262935564418886e-05, | |
| "loss": 42.1075, | |
| "step": 1186 | |
| }, | |
| { | |
| "epoch": 4.068321323725647, | |
| "grad_norm": 5.621413230895996, | |
| "learning_rate": 1.2184254201795365e-05, | |
| "loss": 44.5849, | |
| "step": 1188 | |
| }, | |
| { | |
| "epoch": 4.075153456098212, | |
| "grad_norm": 4.291881084442139, | |
| "learning_rate": 1.2105744657136064e-05, | |
| "loss": 42.9562, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 4.0819855884707765, | |
| "grad_norm": 3.730132818222046, | |
| "learning_rate": 1.2027407983001681e-05, | |
| "loss": 44.0838, | |
| "step": 1192 | |
| }, | |
| { | |
| "epoch": 4.088817720843341, | |
| "grad_norm": 3.540987968444824, | |
| "learning_rate": 1.1949245229635245e-05, | |
| "loss": 43.4705, | |
| "step": 1194 | |
| }, | |
| { | |
| "epoch": 4.095649853215906, | |
| "grad_norm": 3.0649805068969727, | |
| "learning_rate": 1.1871257444948098e-05, | |
| "loss": 43.0996, | |
| "step": 1196 | |
| }, | |
| { | |
| "epoch": 4.10248198558847, | |
| "grad_norm": 3.2024762630462646, | |
| "learning_rate": 1.1793445674505776e-05, | |
| "loss": 42.772, | |
| "step": 1198 | |
| }, | |
| { | |
| "epoch": 4.109314117961035, | |
| "grad_norm": 3.462251663208008, | |
| "learning_rate": 1.1715810961514073e-05, | |
| "loss": 43.2502, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 4.109314117961035, | |
| "eval_loss": 0.7009151577949524, | |
| "eval_runtime": 133.1765, | |
| "eval_samples_per_second": 29.622, | |
| "eval_steps_per_second": 7.411, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 4.116146250333601, | |
| "grad_norm": 4.633735656738281, | |
| "learning_rate": 1.1638354346804971e-05, | |
| "loss": 42.8239, | |
| "step": 1202 | |
| }, | |
| { | |
| "epoch": 4.122978382706165, | |
| "grad_norm": 3.758700132369995, | |
| "learning_rate": 1.1561076868822756e-05, | |
| "loss": 43.3475, | |
| "step": 1204 | |
| }, | |
| { | |
| "epoch": 4.12981051507873, | |
| "grad_norm": 4.143715858459473, | |
| "learning_rate": 1.148397956361007e-05, | |
| "loss": 44.0, | |
| "step": 1206 | |
| }, | |
| { | |
| "epoch": 4.1366426474512945, | |
| "grad_norm": 5.201571941375732, | |
| "learning_rate": 1.1407063464793966e-05, | |
| "loss": 42.5036, | |
| "step": 1208 | |
| }, | |
| { | |
| "epoch": 4.143474779823859, | |
| "grad_norm": 3.4282047748565674, | |
| "learning_rate": 1.133032960357216e-05, | |
| "loss": 43.0577, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 4.150306912196424, | |
| "grad_norm": 4.114802837371826, | |
| "learning_rate": 1.1253779008699131e-05, | |
| "loss": 43.3517, | |
| "step": 1212 | |
| }, | |
| { | |
| "epoch": 4.157139044568988, | |
| "grad_norm": 3.979163408279419, | |
| "learning_rate": 1.1177412706472321e-05, | |
| "loss": 42.5044, | |
| "step": 1214 | |
| }, | |
| { | |
| "epoch": 4.163971176941553, | |
| "grad_norm": 4.363109588623047, | |
| "learning_rate": 1.1101231720718442e-05, | |
| "loss": 43.8954, | |
| "step": 1216 | |
| }, | |
| { | |
| "epoch": 4.170803309314118, | |
| "grad_norm": 4.6219401359558105, | |
| "learning_rate": 1.1025237072779663e-05, | |
| "loss": 43.413, | |
| "step": 1218 | |
| }, | |
| { | |
| "epoch": 4.177635441686682, | |
| "grad_norm": 4.945540904998779, | |
| "learning_rate": 1.09494297815e-05, | |
| "loss": 43.9628, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 4.184467574059248, | |
| "grad_norm": 4.4585747718811035, | |
| "learning_rate": 1.0873810863211595e-05, | |
| "loss": 42.6454, | |
| "step": 1222 | |
| }, | |
| { | |
| "epoch": 4.1912997064318125, | |
| "grad_norm": 4.659883499145508, | |
| "learning_rate": 1.0798381331721109e-05, | |
| "loss": 42.5656, | |
| "step": 1224 | |
| }, | |
| { | |
| "epoch": 4.198131838804377, | |
| "grad_norm": 4.411434650421143, | |
| "learning_rate": 1.0723142198296155e-05, | |
| "loss": 41.2252, | |
| "step": 1226 | |
| }, | |
| { | |
| "epoch": 4.204963971176942, | |
| "grad_norm": 4.985414028167725, | |
| "learning_rate": 1.0648094471651724e-05, | |
| "loss": 42.05, | |
| "step": 1228 | |
| }, | |
| { | |
| "epoch": 4.211796103549506, | |
| "grad_norm": 5.09487771987915, | |
| "learning_rate": 1.0573239157936619e-05, | |
| "loss": 42.9917, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 4.218628235922071, | |
| "grad_norm": 4.299539089202881, | |
| "learning_rate": 1.049857726072005e-05, | |
| "loss": 42.7934, | |
| "step": 1232 | |
| }, | |
| { | |
| "epoch": 4.225460368294636, | |
| "grad_norm": 4.075766086578369, | |
| "learning_rate": 1.0424109780978103e-05, | |
| "loss": 41.0067, | |
| "step": 1234 | |
| }, | |
| { | |
| "epoch": 4.2322925006672, | |
| "grad_norm": 4.9132232666015625, | |
| "learning_rate": 1.034983771708035e-05, | |
| "loss": 43.6556, | |
| "step": 1236 | |
| }, | |
| { | |
| "epoch": 4.239124633039765, | |
| "grad_norm": 4.45914888381958, | |
| "learning_rate": 1.0275762064776492e-05, | |
| "loss": 42.588, | |
| "step": 1238 | |
| }, | |
| { | |
| "epoch": 4.24595676541233, | |
| "grad_norm": 3.7621419429779053, | |
| "learning_rate": 1.020188381718295e-05, | |
| "loss": 41.7435, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 4.252788897784894, | |
| "grad_norm": 2.9593658447265625, | |
| "learning_rate": 1.0128203964769601e-05, | |
| "loss": 43.7138, | |
| "step": 1242 | |
| }, | |
| { | |
| "epoch": 4.25962103015746, | |
| "grad_norm": 4.333788871765137, | |
| "learning_rate": 1.0054723495346482e-05, | |
| "loss": 42.7332, | |
| "step": 1244 | |
| }, | |
| { | |
| "epoch": 4.266453162530024, | |
| "grad_norm": 4.040637493133545, | |
| "learning_rate": 9.981443394050525e-06, | |
| "loss": 43.0547, | |
| "step": 1246 | |
| }, | |
| { | |
| "epoch": 4.273285294902589, | |
| "grad_norm": 5.255796432495117, | |
| "learning_rate": 9.908364643332399e-06, | |
| "loss": 42.1078, | |
| "step": 1248 | |
| }, | |
| { | |
| "epoch": 4.280117427275154, | |
| "grad_norm": 3.434884786605835, | |
| "learning_rate": 9.835488222943285e-06, | |
| "loss": 42.6684, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 4.280117427275154, | |
| "eval_loss": 0.6948874592781067, | |
| "eval_runtime": 138.5111, | |
| "eval_samples_per_second": 28.481, | |
| "eval_steps_per_second": 7.126, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 4.286949559647718, | |
| "grad_norm": 4.761016368865967, | |
| "learning_rate": 9.762815109921761e-06, | |
| "loss": 43.8, | |
| "step": 1252 | |
| }, | |
| { | |
| "epoch": 4.293781692020283, | |
| "grad_norm": 5.999067783355713, | |
| "learning_rate": 9.690346278580726e-06, | |
| "loss": 42.8654, | |
| "step": 1254 | |
| }, | |
| { | |
| "epoch": 4.300613824392848, | |
| "grad_norm": 4.777903079986572, | |
| "learning_rate": 9.618082700494319e-06, | |
| "loss": 42.3409, | |
| "step": 1256 | |
| }, | |
| { | |
| "epoch": 4.307445956765412, | |
| "grad_norm": 4.543084144592285, | |
| "learning_rate": 9.546025344484869e-06, | |
| "loss": 43.6205, | |
| "step": 1258 | |
| }, | |
| { | |
| "epoch": 4.314278089137977, | |
| "grad_norm": 3.6853065490722656, | |
| "learning_rate": 9.474175176609956e-06, | |
| "loss": 43.9045, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 4.3211102215105415, | |
| "grad_norm": 4.3578338623046875, | |
| "learning_rate": 9.402533160149416e-06, | |
| "loss": 41.781, | |
| "step": 1262 | |
| }, | |
| { | |
| "epoch": 4.327942353883106, | |
| "grad_norm": 4.191073894500732, | |
| "learning_rate": 9.331100255592437e-06, | |
| "loss": 42.5713, | |
| "step": 1264 | |
| }, | |
| { | |
| "epoch": 4.334774486255672, | |
| "grad_norm": 5.591835021972656, | |
| "learning_rate": 9.259877420624721e-06, | |
| "loss": 42.9316, | |
| "step": 1266 | |
| }, | |
| { | |
| "epoch": 4.341606618628236, | |
| "grad_norm": 4.916292667388916, | |
| "learning_rate": 9.18886561011557e-06, | |
| "loss": 42.9316, | |
| "step": 1268 | |
| }, | |
| { | |
| "epoch": 4.348438751000801, | |
| "grad_norm": 3.4310858249664307, | |
| "learning_rate": 9.118065776105159e-06, | |
| "loss": 42.0445, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 4.3552708833733655, | |
| "grad_norm": 3.6645348072052, | |
| "learning_rate": 9.047478867791732e-06, | |
| "loss": 41.5698, | |
| "step": 1272 | |
| }, | |
| { | |
| "epoch": 4.36210301574593, | |
| "grad_norm": 4.118466854095459, | |
| "learning_rate": 8.977105831518864e-06, | |
| "loss": 41.7493, | |
| "step": 1274 | |
| }, | |
| { | |
| "epoch": 4.368935148118495, | |
| "grad_norm": 4.731881141662598, | |
| "learning_rate": 8.906947610762825e-06, | |
| "loss": 41.2277, | |
| "step": 1276 | |
| }, | |
| { | |
| "epoch": 4.3757672804910595, | |
| "grad_norm": 4.580758571624756, | |
| "learning_rate": 8.837005146119872e-06, | |
| "loss": 42.3467, | |
| "step": 1278 | |
| }, | |
| { | |
| "epoch": 4.382599412863624, | |
| "grad_norm": 5.310960292816162, | |
| "learning_rate": 8.767279375293672e-06, | |
| "loss": 43.1447, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 4.389431545236189, | |
| "grad_norm": 4.382359027862549, | |
| "learning_rate": 8.697771233082744e-06, | |
| "loss": 42.4424, | |
| "step": 1282 | |
| }, | |
| { | |
| "epoch": 4.396263677608753, | |
| "grad_norm": 3.6488263607025146, | |
| "learning_rate": 8.628481651367876e-06, | |
| "loss": 43.8516, | |
| "step": 1284 | |
| }, | |
| { | |
| "epoch": 4.403095809981318, | |
| "grad_norm": 3.2983975410461426, | |
| "learning_rate": 8.55941155909968e-06, | |
| "loss": 43.3322, | |
| "step": 1286 | |
| }, | |
| { | |
| "epoch": 4.4099279423538835, | |
| "grad_norm": 3.5116684436798096, | |
| "learning_rate": 8.490561882286136e-06, | |
| "loss": 41.4651, | |
| "step": 1288 | |
| }, | |
| { | |
| "epoch": 4.416760074726448, | |
| "grad_norm": 3.5123932361602783, | |
| "learning_rate": 8.421933543980126e-06, | |
| "loss": 43.1034, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 4.423592207099013, | |
| "grad_norm": 4.123583793640137, | |
| "learning_rate": 8.353527464267104e-06, | |
| "loss": 43.566, | |
| "step": 1292 | |
| }, | |
| { | |
| "epoch": 4.430424339471577, | |
| "grad_norm": 3.6427931785583496, | |
| "learning_rate": 8.285344560252777e-06, | |
| "loss": 42.0333, | |
| "step": 1294 | |
| }, | |
| { | |
| "epoch": 4.437256471844142, | |
| "grad_norm": 3.8917388916015625, | |
| "learning_rate": 8.217385746050742e-06, | |
| "loss": 42.0382, | |
| "step": 1296 | |
| }, | |
| { | |
| "epoch": 4.444088604216707, | |
| "grad_norm": 4.964122772216797, | |
| "learning_rate": 8.149651932770308e-06, | |
| "loss": 43.6584, | |
| "step": 1298 | |
| }, | |
| { | |
| "epoch": 4.450920736589271, | |
| "grad_norm": 4.227240085601807, | |
| "learning_rate": 8.082144028504233e-06, | |
| "loss": 42.4086, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 4.450920736589271, | |
| "eval_loss": 0.6897044777870178, | |
| "eval_runtime": 131.8148, | |
| "eval_samples_per_second": 29.928, | |
| "eval_steps_per_second": 7.488, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 4.457752868961836, | |
| "grad_norm": 4.605757713317871, | |
| "learning_rate": 8.014862938316542e-06, | |
| "loss": 43.7962, | |
| "step": 1302 | |
| }, | |
| { | |
| "epoch": 4.464585001334401, | |
| "grad_norm": 4.2398176193237305, | |
| "learning_rate": 7.947809564230445e-06, | |
| "loss": 42.3544, | |
| "step": 1304 | |
| }, | |
| { | |
| "epoch": 4.471417133706965, | |
| "grad_norm": 5.234216213226318, | |
| "learning_rate": 7.880984805216185e-06, | |
| "loss": 41.9833, | |
| "step": 1306 | |
| }, | |
| { | |
| "epoch": 4.47824926607953, | |
| "grad_norm": 3.9220240116119385, | |
| "learning_rate": 7.814389557179017e-06, | |
| "loss": 42.0345, | |
| "step": 1308 | |
| }, | |
| { | |
| "epoch": 4.485081398452095, | |
| "grad_norm": 5.44996976852417, | |
| "learning_rate": 7.748024712947205e-06, | |
| "loss": 42.0309, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 4.49191353082466, | |
| "grad_norm": 5.07472038269043, | |
| "learning_rate": 7.681891162260015e-06, | |
| "loss": 42.6996, | |
| "step": 1312 | |
| }, | |
| { | |
| "epoch": 4.498745663197225, | |
| "grad_norm": 3.818120241165161, | |
| "learning_rate": 7.615989791755834e-06, | |
| "loss": 42.8775, | |
| "step": 1314 | |
| }, | |
| { | |
| "epoch": 4.505577795569789, | |
| "grad_norm": 4.252802848815918, | |
| "learning_rate": 7.5503214849602516e-06, | |
| "loss": 42.4118, | |
| "step": 1316 | |
| }, | |
| { | |
| "epoch": 4.512409927942354, | |
| "grad_norm": 4.17697286605835, | |
| "learning_rate": 7.484887122274215e-06, | |
| "loss": 41.2153, | |
| "step": 1318 | |
| }, | |
| { | |
| "epoch": 4.519242060314919, | |
| "grad_norm": 3.7324466705322266, | |
| "learning_rate": 7.419687580962223e-06, | |
| "loss": 42.3343, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 4.526074192687483, | |
| "grad_norm": 3.870089054107666, | |
| "learning_rate": 7.354723735140609e-06, | |
| "loss": 42.0028, | |
| "step": 1322 | |
| }, | |
| { | |
| "epoch": 4.532906325060048, | |
| "grad_norm": 3.6424801349639893, | |
| "learning_rate": 7.289996455765749e-06, | |
| "loss": 43.5842, | |
| "step": 1324 | |
| }, | |
| { | |
| "epoch": 4.5397384574326125, | |
| "grad_norm": 4.695961952209473, | |
| "learning_rate": 7.225506610622456e-06, | |
| "loss": 42.0951, | |
| "step": 1326 | |
| }, | |
| { | |
| "epoch": 4.546570589805177, | |
| "grad_norm": 4.842666149139404, | |
| "learning_rate": 7.161255064312283e-06, | |
| "loss": 43.8668, | |
| "step": 1328 | |
| }, | |
| { | |
| "epoch": 4.553402722177742, | |
| "grad_norm": 4.4085822105407715, | |
| "learning_rate": 7.0972426782419884e-06, | |
| "loss": 43.7836, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 4.560234854550307, | |
| "grad_norm": 3.606607437133789, | |
| "learning_rate": 7.033470310611945e-06, | |
| "loss": 41.4304, | |
| "step": 1332 | |
| }, | |
| { | |
| "epoch": 4.567066986922872, | |
| "grad_norm": 4.789222717285156, | |
| "learning_rate": 6.969938816404639e-06, | |
| "loss": 41.6355, | |
| "step": 1334 | |
| }, | |
| { | |
| "epoch": 4.573899119295437, | |
| "grad_norm": 4.463109493255615, | |
| "learning_rate": 6.906649047373246e-06, | |
| "loss": 43.4969, | |
| "step": 1336 | |
| }, | |
| { | |
| "epoch": 4.580731251668001, | |
| "grad_norm": 4.483322620391846, | |
| "learning_rate": 6.843601852030171e-06, | |
| "loss": 42.4094, | |
| "step": 1338 | |
| }, | |
| { | |
| "epoch": 4.587563384040566, | |
| "grad_norm": 4.021024703979492, | |
| "learning_rate": 6.780798075635675e-06, | |
| "loss": 42.2893, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 4.5943955164131305, | |
| "grad_norm": 3.9479868412017822, | |
| "learning_rate": 6.718238560186571e-06, | |
| "loss": 40.8073, | |
| "step": 1342 | |
| }, | |
| { | |
| "epoch": 4.601227648785695, | |
| "grad_norm": 4.778145790100098, | |
| "learning_rate": 6.655924144404907e-06, | |
| "loss": 42.0845, | |
| "step": 1344 | |
| }, | |
| { | |
| "epoch": 4.60805978115826, | |
| "grad_norm": 3.555271863937378, | |
| "learning_rate": 6.593855663726722e-06, | |
| "loss": 41.1015, | |
| "step": 1346 | |
| }, | |
| { | |
| "epoch": 4.614891913530824, | |
| "grad_norm": 4.007204532623291, | |
| "learning_rate": 6.532033950290886e-06, | |
| "loss": 42.9137, | |
| "step": 1348 | |
| }, | |
| { | |
| "epoch": 4.621724045903389, | |
| "grad_norm": 4.328546524047852, | |
| "learning_rate": 6.470459832927881e-06, | |
| "loss": 41.274, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 4.621724045903389, | |
| "eval_loss": 0.6830974221229553, | |
| "eval_runtime": 135.2812, | |
| "eval_samples_per_second": 29.161, | |
| "eval_steps_per_second": 7.296, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 4.628556178275954, | |
| "grad_norm": 4.948083877563477, | |
| "learning_rate": 6.409134137148737e-06, | |
| "loss": 43.0462, | |
| "step": 1352 | |
| }, | |
| { | |
| "epoch": 4.635388310648519, | |
| "grad_norm": 4.637773036956787, | |
| "learning_rate": 6.3480576851339625e-06, | |
| "loss": 42.6268, | |
| "step": 1354 | |
| }, | |
| { | |
| "epoch": 4.642220443021084, | |
| "grad_norm": 3.72841215133667, | |
| "learning_rate": 6.28723129572247e-06, | |
| "loss": 41.0574, | |
| "step": 1356 | |
| }, | |
| { | |
| "epoch": 4.6490525753936485, | |
| "grad_norm": 4.539714813232422, | |
| "learning_rate": 6.226655784400684e-06, | |
| "loss": 43.5752, | |
| "step": 1358 | |
| }, | |
| { | |
| "epoch": 4.655884707766213, | |
| "grad_norm": 5.519583225250244, | |
| "learning_rate": 6.166331963291519e-06, | |
| "loss": 43.3111, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 4.662716840138778, | |
| "grad_norm": 4.942199230194092, | |
| "learning_rate": 6.106260641143546e-06, | |
| "loss": 43.6514, | |
| "step": 1362 | |
| }, | |
| { | |
| "epoch": 4.669548972511342, | |
| "grad_norm": 5.164299011230469, | |
| "learning_rate": 6.046442623320145e-06, | |
| "loss": 40.8611, | |
| "step": 1364 | |
| }, | |
| { | |
| "epoch": 4.676381104883907, | |
| "grad_norm": 4.309698581695557, | |
| "learning_rate": 5.986878711788702e-06, | |
| "loss": 41.3937, | |
| "step": 1366 | |
| }, | |
| { | |
| "epoch": 4.683213237256472, | |
| "grad_norm": 4.105101585388184, | |
| "learning_rate": 5.927569705109828e-06, | |
| "loss": 40.3001, | |
| "step": 1368 | |
| }, | |
| { | |
| "epoch": 4.690045369629036, | |
| "grad_norm": 3.571514368057251, | |
| "learning_rate": 5.868516398426716e-06, | |
| "loss": 41.6858, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 4.696877502001601, | |
| "grad_norm": 5.120858192443848, | |
| "learning_rate": 5.809719583454415e-06, | |
| "loss": 41.4156, | |
| "step": 1372 | |
| }, | |
| { | |
| "epoch": 4.703709634374166, | |
| "grad_norm": 4.679799556732178, | |
| "learning_rate": 5.751180048469243e-06, | |
| "loss": 43.1858, | |
| "step": 1374 | |
| }, | |
| { | |
| "epoch": 4.710541766746731, | |
| "grad_norm": 3.0465521812438965, | |
| "learning_rate": 5.692898578298253e-06, | |
| "loss": 41.213, | |
| "step": 1376 | |
| }, | |
| { | |
| "epoch": 4.717373899119296, | |
| "grad_norm": 4.835347652435303, | |
| "learning_rate": 5.634875954308638e-06, | |
| "loss": 44.0938, | |
| "step": 1378 | |
| }, | |
| { | |
| "epoch": 4.72420603149186, | |
| "grad_norm": 6.645193099975586, | |
| "learning_rate": 5.577112954397321e-06, | |
| "loss": 41.7528, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 4.731038163864425, | |
| "grad_norm": 4.592052936553955, | |
| "learning_rate": 5.519610352980501e-06, | |
| "loss": 42.566, | |
| "step": 1382 | |
| }, | |
| { | |
| "epoch": 4.73787029623699, | |
| "grad_norm": 3.7620317935943604, | |
| "learning_rate": 5.462368920983249e-06, | |
| "loss": 41.7184, | |
| "step": 1384 | |
| }, | |
| { | |
| "epoch": 4.744702428609554, | |
| "grad_norm": 4.0445027351379395, | |
| "learning_rate": 5.405389425829219e-06, | |
| "loss": 41.6249, | |
| "step": 1386 | |
| }, | |
| { | |
| "epoch": 4.751534560982119, | |
| "grad_norm": 3.744433641433716, | |
| "learning_rate": 5.348672631430318e-06, | |
| "loss": 43.0626, | |
| "step": 1388 | |
| }, | |
| { | |
| "epoch": 4.7583666933546835, | |
| "grad_norm": 3.12141489982605, | |
| "learning_rate": 5.292219298176476e-06, | |
| "loss": 42.1533, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 4.765198825727248, | |
| "grad_norm": 6.73304557800293, | |
| "learning_rate": 5.236030182925475e-06, | |
| "loss": 41.6015, | |
| "step": 1392 | |
| }, | |
| { | |
| "epoch": 4.772030958099813, | |
| "grad_norm": 4.076465129852295, | |
| "learning_rate": 5.1801060389927606e-06, | |
| "loss": 43.2645, | |
| "step": 1394 | |
| }, | |
| { | |
| "epoch": 4.7788630904723775, | |
| "grad_norm": 4.178272247314453, | |
| "learning_rate": 5.124447616141381e-06, | |
| "loss": 43.0354, | |
| "step": 1396 | |
| }, | |
| { | |
| "epoch": 4.785695222844943, | |
| "grad_norm": 4.555927276611328, | |
| "learning_rate": 5.06905566057192e-06, | |
| "loss": 42.1086, | |
| "step": 1398 | |
| }, | |
| { | |
| "epoch": 4.792527355217508, | |
| "grad_norm": 4.799075126647949, | |
| "learning_rate": 5.013930914912476e-06, | |
| "loss": 40.7555, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 4.792527355217508, | |
| "eval_loss": 0.6814665198326111, | |
| "eval_runtime": 134.9461, | |
| "eval_samples_per_second": 29.234, | |
| "eval_steps_per_second": 7.314, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 4.799359487590072, | |
| "grad_norm": 3.7408673763275146, | |
| "learning_rate": 4.959074118208726e-06, | |
| "loss": 40.9295, | |
| "step": 1402 | |
| }, | |
| { | |
| "epoch": 4.806191619962637, | |
| "grad_norm": 3.9520747661590576, | |
| "learning_rate": 4.9044860059140275e-06, | |
| "loss": 43.4186, | |
| "step": 1404 | |
| }, | |
| { | |
| "epoch": 4.8130237523352015, | |
| "grad_norm": 4.115049839019775, | |
| "learning_rate": 4.850167309879519e-06, | |
| "loss": 42.2491, | |
| "step": 1406 | |
| }, | |
| { | |
| "epoch": 4.819855884707766, | |
| "grad_norm": 5.181631088256836, | |
| "learning_rate": 4.796118758344354e-06, | |
| "loss": 41.583, | |
| "step": 1408 | |
| }, | |
| { | |
| "epoch": 4.826688017080331, | |
| "grad_norm": 3.838186740875244, | |
| "learning_rate": 4.742341075925916e-06, | |
| "loss": 43.3278, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 4.833520149452895, | |
| "grad_norm": 3.6494245529174805, | |
| "learning_rate": 4.6888349836100825e-06, | |
| "loss": 41.3961, | |
| "step": 1412 | |
| }, | |
| { | |
| "epoch": 4.84035228182546, | |
| "grad_norm": 4.139842510223389, | |
| "learning_rate": 4.6356011987416075e-06, | |
| "loss": 43.4135, | |
| "step": 1414 | |
| }, | |
| { | |
| "epoch": 4.847184414198025, | |
| "grad_norm": 4.385437965393066, | |
| "learning_rate": 4.58264043501446e-06, | |
| "loss": 42.1478, | |
| "step": 1416 | |
| }, | |
| { | |
| "epoch": 4.854016546570589, | |
| "grad_norm": 3.691343307495117, | |
| "learning_rate": 4.52995340246227e-06, | |
| "loss": 42.4175, | |
| "step": 1418 | |
| }, | |
| { | |
| "epoch": 4.860848678943155, | |
| "grad_norm": 4.149899482727051, | |
| "learning_rate": 4.477540807448832e-06, | |
| "loss": 42.4116, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 4.8676808113157195, | |
| "grad_norm": 3.8960561752319336, | |
| "learning_rate": 4.425403352658591e-06, | |
| "loss": 41.2306, | |
| "step": 1422 | |
| }, | |
| { | |
| "epoch": 4.874512943688284, | |
| "grad_norm": 3.6276168823242188, | |
| "learning_rate": 4.373541737087264e-06, | |
| "loss": 42.7317, | |
| "step": 1424 | |
| }, | |
| { | |
| "epoch": 4.881345076060849, | |
| "grad_norm": 4.214303016662598, | |
| "learning_rate": 4.32195665603245e-06, | |
| "loss": 41.6166, | |
| "step": 1426 | |
| }, | |
| { | |
| "epoch": 4.888177208433413, | |
| "grad_norm": 4.3136210441589355, | |
| "learning_rate": 4.270648801084296e-06, | |
| "loss": 42.3309, | |
| "step": 1428 | |
| }, | |
| { | |
| "epoch": 4.895009340805978, | |
| "grad_norm": 5.340824604034424, | |
| "learning_rate": 4.219618860116242e-06, | |
| "loss": 40.6249, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 4.901841473178543, | |
| "grad_norm": 3.750943183898926, | |
| "learning_rate": 4.1688675172758064e-06, | |
| "loss": 42.0754, | |
| "step": 1432 | |
| }, | |
| { | |
| "epoch": 4.908673605551107, | |
| "grad_norm": 3.8021140098571777, | |
| "learning_rate": 4.118395452975382e-06, | |
| "loss": 42.8221, | |
| "step": 1434 | |
| }, | |
| { | |
| "epoch": 4.915505737923672, | |
| "grad_norm": 5.09911584854126, | |
| "learning_rate": 4.068203343883159e-06, | |
| "loss": 42.3164, | |
| "step": 1436 | |
| }, | |
| { | |
| "epoch": 4.9223378702962375, | |
| "grad_norm": 3.590981960296631, | |
| "learning_rate": 4.018291862914001e-06, | |
| "loss": 41.0773, | |
| "step": 1438 | |
| }, | |
| { | |
| "epoch": 4.929170002668801, | |
| "grad_norm": 4.474262714385986, | |
| "learning_rate": 3.968661679220468e-06, | |
| "loss": 41.1827, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 4.936002135041367, | |
| "grad_norm": 3.780853748321533, | |
| "learning_rate": 3.919313458183838e-06, | |
| "loss": 41.9009, | |
| "step": 1442 | |
| }, | |
| { | |
| "epoch": 4.942834267413931, | |
| "grad_norm": 4.165524482727051, | |
| "learning_rate": 3.8702478614051355e-06, | |
| "loss": 41.6988, | |
| "step": 1444 | |
| }, | |
| { | |
| "epoch": 4.949666399786496, | |
| "grad_norm": 4.537020683288574, | |
| "learning_rate": 3.821465546696337e-06, | |
| "loss": 42.6527, | |
| "step": 1446 | |
| }, | |
| { | |
| "epoch": 4.956498532159061, | |
| "grad_norm": 5.992898941040039, | |
| "learning_rate": 3.772967168071517e-06, | |
| "loss": 42.3257, | |
| "step": 1448 | |
| }, | |
| { | |
| "epoch": 4.963330664531625, | |
| "grad_norm": 5.681396007537842, | |
| "learning_rate": 3.7247533757380603e-06, | |
| "loss": 42.5366, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 4.963330664531625, | |
| "eval_loss": 0.6770752668380737, | |
| "eval_runtime": 133.8871, | |
| "eval_samples_per_second": 29.465, | |
| "eval_steps_per_second": 7.372, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 4.97016279690419, | |
| "grad_norm": 4.46541166305542, | |
| "learning_rate": 3.6768248160879787e-06, | |
| "loss": 41.0476, | |
| "step": 1452 | |
| }, | |
| { | |
| "epoch": 4.976994929276755, | |
| "grad_norm": 4.15000057220459, | |
| "learning_rate": 3.6291821316892184e-06, | |
| "loss": 40.7134, | |
| "step": 1454 | |
| }, | |
| { | |
| "epoch": 4.983827061649319, | |
| "grad_norm": 4.230960369110107, | |
| "learning_rate": 3.5818259612770744e-06, | |
| "loss": 43.5967, | |
| "step": 1456 | |
| }, | |
| { | |
| "epoch": 4.990659194021884, | |
| "grad_norm": 4.932849884033203, | |
| "learning_rate": 3.53475693974559e-06, | |
| "loss": 43.2516, | |
| "step": 1458 | |
| }, | |
| { | |
| "epoch": 4.997491326394449, | |
| "grad_norm": 4.316704273223877, | |
| "learning_rate": 3.487975698139084e-06, | |
| "loss": 42.3811, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 5.003416066186283, | |
| "grad_norm": 4.146729469299316, | |
| "learning_rate": 3.4414828636436525e-06, | |
| "loss": 36.1288, | |
| "step": 1462 | |
| }, | |
| { | |
| "epoch": 5.010248198558847, | |
| "grad_norm": 5.610274791717529, | |
| "learning_rate": 3.3952790595787987e-06, | |
| "loss": 40.6556, | |
| "step": 1464 | |
| }, | |
| { | |
| "epoch": 5.017080330931412, | |
| "grad_norm": 6.292807102203369, | |
| "learning_rate": 3.3493649053890326e-06, | |
| "loss": 42.2675, | |
| "step": 1466 | |
| }, | |
| { | |
| "epoch": 5.023912463303977, | |
| "grad_norm": 4.371929168701172, | |
| "learning_rate": 3.3037410166356143e-06, | |
| "loss": 41.1544, | |
| "step": 1468 | |
| }, | |
| { | |
| "epoch": 5.030744595676541, | |
| "grad_norm": 3.275562047958374, | |
| "learning_rate": 3.258408004988278e-06, | |
| "loss": 42.7401, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 5.037576728049106, | |
| "grad_norm": 5.2857666015625, | |
| "learning_rate": 3.2133664782169948e-06, | |
| "loss": 39.4961, | |
| "step": 1472 | |
| }, | |
| { | |
| "epoch": 5.044408860421671, | |
| "grad_norm": 3.9162814617156982, | |
| "learning_rate": 3.168617040183897e-06, | |
| "loss": 42.7691, | |
| "step": 1474 | |
| }, | |
| { | |
| "epoch": 5.051240992794235, | |
| "grad_norm": 4.741237640380859, | |
| "learning_rate": 3.1241602908351404e-06, | |
| "loss": 39.9539, | |
| "step": 1476 | |
| }, | |
| { | |
| "epoch": 5.0580731251668, | |
| "grad_norm": 4.904325008392334, | |
| "learning_rate": 3.079996826192849e-06, | |
| "loss": 40.999, | |
| "step": 1478 | |
| }, | |
| { | |
| "epoch": 5.0649052575393645, | |
| "grad_norm": 3.9396679401397705, | |
| "learning_rate": 3.036127238347164e-06, | |
| "loss": 41.8233, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 5.071737389911929, | |
| "grad_norm": 3.5699760913848877, | |
| "learning_rate": 2.992552115448258e-06, | |
| "loss": 41.4895, | |
| "step": 1482 | |
| }, | |
| { | |
| "epoch": 5.078569522284495, | |
| "grad_norm": 4.227250099182129, | |
| "learning_rate": 2.9492720416985e-06, | |
| "loss": 41.7825, | |
| "step": 1484 | |
| }, | |
| { | |
| "epoch": 5.085401654657059, | |
| "grad_norm": 3.8788514137268066, | |
| "learning_rate": 2.9062875973445813e-06, | |
| "loss": 41.4301, | |
| "step": 1486 | |
| }, | |
| { | |
| "epoch": 5.092233787029624, | |
| "grad_norm": 3.7242729663848877, | |
| "learning_rate": 2.8635993586697553e-06, | |
| "loss": 40.2917, | |
| "step": 1488 | |
| }, | |
| { | |
| "epoch": 5.099065919402189, | |
| "grad_norm": 5.645269870758057, | |
| "learning_rate": 2.821207897986114e-06, | |
| "loss": 41.1435, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 5.105898051774753, | |
| "grad_norm": 3.9231839179992676, | |
| "learning_rate": 2.779113783626916e-06, | |
| "loss": 41.5506, | |
| "step": 1492 | |
| }, | |
| { | |
| "epoch": 5.112730184147318, | |
| "grad_norm": 4.276205062866211, | |
| "learning_rate": 2.7373175799389415e-06, | |
| "loss": 40.4141, | |
| "step": 1494 | |
| }, | |
| { | |
| "epoch": 5.1195623165198825, | |
| "grad_norm": 6.223433971405029, | |
| "learning_rate": 2.6958198472749717e-06, | |
| "loss": 42.1149, | |
| "step": 1496 | |
| }, | |
| { | |
| "epoch": 5.126394448892447, | |
| "grad_norm": 4.167882442474365, | |
| "learning_rate": 2.65462114198623e-06, | |
| "loss": 40.7711, | |
| "step": 1498 | |
| }, | |
| { | |
| "epoch": 5.133226581265012, | |
| "grad_norm": 3.588376998901367, | |
| "learning_rate": 2.6137220164149435e-06, | |
| "loss": 42.5513, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 5.133226581265012, | |
| "eval_loss": 0.6761642694473267, | |
| "eval_runtime": 137.9512, | |
| "eval_samples_per_second": 28.597, | |
| "eval_steps_per_second": 7.155, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 5.140058713637576, | |
| "grad_norm": 4.149092674255371, | |
| "learning_rate": 2.573123018886961e-06, | |
| "loss": 40.5633, | |
| "step": 1502 | |
| }, | |
| { | |
| "epoch": 5.146890846010141, | |
| "grad_norm": 3.9322760105133057, | |
| "learning_rate": 2.5328246937043526e-06, | |
| "loss": 41.3711, | |
| "step": 1504 | |
| }, | |
| { | |
| "epoch": 5.1537229783827065, | |
| "grad_norm": 4.557422161102295, | |
| "learning_rate": 2.492827581138149e-06, | |
| "loss": 39.5696, | |
| "step": 1506 | |
| }, | |
| { | |
| "epoch": 5.160555110755271, | |
| "grad_norm": 3.772927761077881, | |
| "learning_rate": 2.4531322174210975e-06, | |
| "loss": 42.9544, | |
| "step": 1508 | |
| }, | |
| { | |
| "epoch": 5.167387243127836, | |
| "grad_norm": 4.051291465759277, | |
| "learning_rate": 2.4137391347404476e-06, | |
| "loss": 40.978, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 5.1742193755004005, | |
| "grad_norm": 3.6557424068450928, | |
| "learning_rate": 2.37464886123083e-06, | |
| "loss": 41.606, | |
| "step": 1512 | |
| }, | |
| { | |
| "epoch": 5.181051507872965, | |
| "grad_norm": 4.801413536071777, | |
| "learning_rate": 2.3358619209672e-06, | |
| "loss": 41.5917, | |
| "step": 1514 | |
| }, | |
| { | |
| "epoch": 5.18788364024553, | |
| "grad_norm": 4.2001423835754395, | |
| "learning_rate": 2.2973788339577613e-06, | |
| "loss": 43.0596, | |
| "step": 1516 | |
| }, | |
| { | |
| "epoch": 5.194715772618094, | |
| "grad_norm": 5.291867256164551, | |
| "learning_rate": 2.2592001161370392e-06, | |
| "loss": 40.3588, | |
| "step": 1518 | |
| }, | |
| { | |
| "epoch": 5.201547904990659, | |
| "grad_norm": 3.7930984497070312, | |
| "learning_rate": 2.2213262793589484e-06, | |
| "loss": 42.0758, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 5.208380037363224, | |
| "grad_norm": 4.888052940368652, | |
| "learning_rate": 2.1837578313899098e-06, | |
| "loss": 39.7415, | |
| "step": 1522 | |
| }, | |
| { | |
| "epoch": 5.215212169735788, | |
| "grad_norm": 4.963688850402832, | |
| "learning_rate": 2.1464952759020855e-06, | |
| "loss": 42.05, | |
| "step": 1524 | |
| }, | |
| { | |
| "epoch": 5.222044302108353, | |
| "grad_norm": 4.556923866271973, | |
| "learning_rate": 2.109539112466588e-06, | |
| "loss": 40.5828, | |
| "step": 1526 | |
| }, | |
| { | |
| "epoch": 5.228876434480918, | |
| "grad_norm": 3.550285577774048, | |
| "learning_rate": 2.0728898365467903e-06, | |
| "loss": 41.4201, | |
| "step": 1528 | |
| }, | |
| { | |
| "epoch": 5.235708566853483, | |
| "grad_norm": 4.290851593017578, | |
| "learning_rate": 2.0365479394917147e-06, | |
| "loss": 41.1988, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 5.242540699226048, | |
| "grad_norm": 4.436618804931641, | |
| "learning_rate": 2.0005139085293945e-06, | |
| "loss": 41.1016, | |
| "step": 1532 | |
| }, | |
| { | |
| "epoch": 5.249372831598612, | |
| "grad_norm": 6.221188068389893, | |
| "learning_rate": 1.9647882267603862e-06, | |
| "loss": 42.1538, | |
| "step": 1534 | |
| }, | |
| { | |
| "epoch": 5.256204963971177, | |
| "grad_norm": 4.712629795074463, | |
| "learning_rate": 1.9293713731512673e-06, | |
| "loss": 41.1176, | |
| "step": 1536 | |
| }, | |
| { | |
| "epoch": 5.263037096343742, | |
| "grad_norm": 4.693170070648193, | |
| "learning_rate": 1.894263822528225e-06, | |
| "loss": 41.3687, | |
| "step": 1538 | |
| }, | |
| { | |
| "epoch": 5.269869228716306, | |
| "grad_norm": 4.854535102844238, | |
| "learning_rate": 1.8594660455706763e-06, | |
| "loss": 41.6856, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 5.276701361088871, | |
| "grad_norm": 3.5167202949523926, | |
| "learning_rate": 1.8249785088049893e-06, | |
| "loss": 42.5848, | |
| "step": 1542 | |
| }, | |
| { | |
| "epoch": 5.2835334934614355, | |
| "grad_norm": 4.029543399810791, | |
| "learning_rate": 1.790801674598186e-06, | |
| "loss": 41.8932, | |
| "step": 1544 | |
| }, | |
| { | |
| "epoch": 5.290365625834, | |
| "grad_norm": 4.217826843261719, | |
| "learning_rate": 1.7569360011517848e-06, | |
| "loss": 41.478, | |
| "step": 1546 | |
| }, | |
| { | |
| "epoch": 5.297197758206565, | |
| "grad_norm": 3.8237998485565186, | |
| "learning_rate": 1.7233819424956248e-06, | |
| "loss": 42.5394, | |
| "step": 1548 | |
| }, | |
| { | |
| "epoch": 5.30402989057913, | |
| "grad_norm": 5.044140338897705, | |
| "learning_rate": 1.6901399484818004e-06, | |
| "loss": 41.0466, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 5.30402989057913, | |
| "eval_loss": 0.6723917722702026, | |
| "eval_runtime": 132.3674, | |
| "eval_samples_per_second": 29.803, | |
| "eval_steps_per_second": 7.457, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 5.310862022951695, | |
| "grad_norm": 4.023882865905762, | |
| "learning_rate": 1.6572104647786247e-06, | |
| "loss": 40.4515, | |
| "step": 1552 | |
| }, | |
| { | |
| "epoch": 5.31769415532426, | |
| "grad_norm": 5.667575836181641, | |
| "learning_rate": 1.624593932864632e-06, | |
| "loss": 42.2196, | |
| "step": 1554 | |
| }, | |
| { | |
| "epoch": 5.324526287696824, | |
| "grad_norm": 3.771815299987793, | |
| "learning_rate": 1.5922907900227018e-06, | |
| "loss": 41.1018, | |
| "step": 1556 | |
| }, | |
| { | |
| "epoch": 5.331358420069389, | |
| "grad_norm": 4.044847011566162, | |
| "learning_rate": 1.5603014693341662e-06, | |
| "loss": 40.8528, | |
| "step": 1558 | |
| }, | |
| { | |
| "epoch": 5.3381905524419535, | |
| "grad_norm": 4.64625358581543, | |
| "learning_rate": 1.5286263996730026e-06, | |
| "loss": 41.612, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 5.345022684814518, | |
| "grad_norm": 5.102336406707764, | |
| "learning_rate": 1.497266005700107e-06, | |
| "loss": 40.965, | |
| "step": 1562 | |
| }, | |
| { | |
| "epoch": 5.351854817187083, | |
| "grad_norm": 3.1535797119140625, | |
| "learning_rate": 1.4662207078575684e-06, | |
| "loss": 40.5264, | |
| "step": 1564 | |
| }, | |
| { | |
| "epoch": 5.358686949559647, | |
| "grad_norm": 3.740694522857666, | |
| "learning_rate": 1.4354909223630669e-06, | |
| "loss": 41.5863, | |
| "step": 1566 | |
| }, | |
| { | |
| "epoch": 5.365519081932212, | |
| "grad_norm": 4.79527473449707, | |
| "learning_rate": 1.40507706120426e-06, | |
| "loss": 41.3632, | |
| "step": 1568 | |
| }, | |
| { | |
| "epoch": 5.372351214304777, | |
| "grad_norm": 4.936699867248535, | |
| "learning_rate": 1.3749795321332887e-06, | |
| "loss": 41.898, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 5.379183346677342, | |
| "grad_norm": 6.228104114532471, | |
| "learning_rate": 1.3451987386612851e-06, | |
| "loss": 41.3327, | |
| "step": 1572 | |
| }, | |
| { | |
| "epoch": 5.386015479049907, | |
| "grad_norm": 3.9607808589935303, | |
| "learning_rate": 1.3157350800529878e-06, | |
| "loss": 39.3806, | |
| "step": 1574 | |
| }, | |
| { | |
| "epoch": 5.3928476114224715, | |
| "grad_norm": 3.2485790252685547, | |
| "learning_rate": 1.286588951321363e-06, | |
| "loss": 39.292, | |
| "step": 1576 | |
| }, | |
| { | |
| "epoch": 5.399679743795036, | |
| "grad_norm": 4.702234745025635, | |
| "learning_rate": 1.2577607432223276e-06, | |
| "loss": 40.3127, | |
| "step": 1578 | |
| }, | |
| { | |
| "epoch": 5.406511876167601, | |
| "grad_norm": 4.465649127960205, | |
| "learning_rate": 1.2292508422495158e-06, | |
| "loss": 41.7889, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 5.413344008540165, | |
| "grad_norm": 4.618641376495361, | |
| "learning_rate": 1.2010596306290589e-06, | |
| "loss": 41.2257, | |
| "step": 1582 | |
| }, | |
| { | |
| "epoch": 5.42017614091273, | |
| "grad_norm": 4.093713283538818, | |
| "learning_rate": 1.1731874863145143e-06, | |
| "loss": 41.7067, | |
| "step": 1584 | |
| }, | |
| { | |
| "epoch": 5.427008273285295, | |
| "grad_norm": 5.642305374145508, | |
| "learning_rate": 1.145634782981761e-06, | |
| "loss": 41.1947, | |
| "step": 1586 | |
| }, | |
| { | |
| "epoch": 5.433840405657859, | |
| "grad_norm": 3.9637906551361084, | |
| "learning_rate": 1.1184018900240011e-06, | |
| "loss": 41.5425, | |
| "step": 1588 | |
| }, | |
| { | |
| "epoch": 5.440672538030424, | |
| "grad_norm": 4.328593730926514, | |
| "learning_rate": 1.0914891725468141e-06, | |
| "loss": 41.7915, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 5.4475046704029895, | |
| "grad_norm": 4.559619903564453, | |
| "learning_rate": 1.06489699136324e-06, | |
| "loss": 39.5462, | |
| "step": 1592 | |
| }, | |
| { | |
| "epoch": 5.454336802775554, | |
| "grad_norm": 4.174973011016846, | |
| "learning_rate": 1.0386257029889768e-06, | |
| "loss": 40.6458, | |
| "step": 1594 | |
| }, | |
| { | |
| "epoch": 5.461168935148119, | |
| "grad_norm": 3.249431610107422, | |
| "learning_rate": 1.0126756596375686e-06, | |
| "loss": 41.4128, | |
| "step": 1596 | |
| }, | |
| { | |
| "epoch": 5.468001067520683, | |
| "grad_norm": 4.598479747772217, | |
| "learning_rate": 9.87047209215694e-07, | |
| "loss": 41.7854, | |
| "step": 1598 | |
| }, | |
| { | |
| "epoch": 5.474833199893248, | |
| "grad_norm": 3.558709144592285, | |
| "learning_rate": 9.617406953185138e-07, | |
| "loss": 41.9632, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 5.474833199893248, | |
| "eval_loss": 0.6698766350746155, | |
| "eval_runtime": 133.9539, | |
| "eval_samples_per_second": 29.45, | |
| "eval_steps_per_second": 7.368, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 5.481665332265813, | |
| "grad_norm": 5.397751331329346, | |
| "learning_rate": 9.36756457225052e-07, | |
| "loss": 40.2635, | |
| "step": 1602 | |
| }, | |
| { | |
| "epoch": 5.488497464638377, | |
| "grad_norm": 5.443418502807617, | |
| "learning_rate": 9.120948298936421e-07, | |
| "loss": 40.6923, | |
| "step": 1604 | |
| }, | |
| { | |
| "epoch": 5.495329597010942, | |
| "grad_norm": 3.991673707962036, | |
| "learning_rate": 8.87756143957455e-07, | |
| "loss": 40.0543, | |
| "step": 1606 | |
| }, | |
| { | |
| "epoch": 5.502161729383507, | |
| "grad_norm": 4.649523735046387, | |
| "learning_rate": 8.637407257200497e-07, | |
| "loss": 41.3534, | |
| "step": 1608 | |
| }, | |
| { | |
| "epoch": 5.508993861756071, | |
| "grad_norm": 4.675793170928955, | |
| "learning_rate": 8.400488971509968e-07, | |
| "loss": 39.8315, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 5.515825994128637, | |
| "grad_norm": 3.273359775543213, | |
| "learning_rate": 8.166809758815896e-07, | |
| "loss": 39.9979, | |
| "step": 1612 | |
| }, | |
| { | |
| "epoch": 5.5226581265012005, | |
| "grad_norm": 4.165469169616699, | |
| "learning_rate": 7.936372752005399e-07, | |
| "loss": 39.3362, | |
| "step": 1614 | |
| }, | |
| { | |
| "epoch": 5.529490258873766, | |
| "grad_norm": 4.015806674957275, | |
| "learning_rate": 7.709181040498254e-07, | |
| "loss": 40.7772, | |
| "step": 1616 | |
| }, | |
| { | |
| "epoch": 5.536322391246331, | |
| "grad_norm": 6.13747501373291, | |
| "learning_rate": 7.485237670205175e-07, | |
| "loss": 40.8463, | |
| "step": 1618 | |
| }, | |
| { | |
| "epoch": 5.543154523618895, | |
| "grad_norm": 3.6014761924743652, | |
| "learning_rate": 7.264545643486997e-07, | |
| "loss": 40.231, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 5.54998665599146, | |
| "grad_norm": 4.055222034454346, | |
| "learning_rate": 7.047107919114588e-07, | |
| "loss": 42.5435, | |
| "step": 1622 | |
| }, | |
| { | |
| "epoch": 5.5568187883640245, | |
| "grad_norm": 5.444411277770996, | |
| "learning_rate": 6.832927412229018e-07, | |
| "loss": 41.0914, | |
| "step": 1624 | |
| }, | |
| { | |
| "epoch": 5.563650920736589, | |
| "grad_norm": 3.4832520484924316, | |
| "learning_rate": 6.622006994302543e-07, | |
| "loss": 42.297, | |
| "step": 1626 | |
| }, | |
| { | |
| "epoch": 5.570483053109154, | |
| "grad_norm": 5.123753547668457, | |
| "learning_rate": 6.41434949310013e-07, | |
| "loss": 40.4283, | |
| "step": 1628 | |
| }, | |
| { | |
| "epoch": 5.5773151854817185, | |
| "grad_norm": 5.2065277099609375, | |
| "learning_rate": 6.209957692641544e-07, | |
| "loss": 40.5581, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 5.584147317854283, | |
| "grad_norm": 4.573667049407959, | |
| "learning_rate": 6.008834333163876e-07, | |
| "loss": 39.4126, | |
| "step": 1632 | |
| }, | |
| { | |
| "epoch": 5.590979450226849, | |
| "grad_norm": 5.208593368530273, | |
| "learning_rate": 5.810982111085106e-07, | |
| "loss": 40.7202, | |
| "step": 1634 | |
| }, | |
| { | |
| "epoch": 5.597811582599413, | |
| "grad_norm": 4.341737747192383, | |
| "learning_rate": 5.616403678967624e-07, | |
| "loss": 40.9683, | |
| "step": 1636 | |
| }, | |
| { | |
| "epoch": 5.604643714971978, | |
| "grad_norm": 4.836015701293945, | |
| "learning_rate": 5.42510164548285e-07, | |
| "loss": 40.4273, | |
| "step": 1638 | |
| }, | |
| { | |
| "epoch": 5.6114758473445425, | |
| "grad_norm": 4.308472633361816, | |
| "learning_rate": 5.237078575376336e-07, | |
| "loss": 41.0492, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 5.618307979717107, | |
| "grad_norm": 4.316090106964111, | |
| "learning_rate": 5.052336989433082e-07, | |
| "loss": 40.6806, | |
| "step": 1642 | |
| }, | |
| { | |
| "epoch": 5.625140112089672, | |
| "grad_norm": 3.6825830936431885, | |
| "learning_rate": 4.870879364444109e-07, | |
| "loss": 40.5467, | |
| "step": 1644 | |
| }, | |
| { | |
| "epoch": 5.631972244462236, | |
| "grad_norm": 5.199794769287109, | |
| "learning_rate": 4.692708133172991e-07, | |
| "loss": 39.4587, | |
| "step": 1646 | |
| }, | |
| { | |
| "epoch": 5.638804376834801, | |
| "grad_norm": 3.3388471603393555, | |
| "learning_rate": 4.517825684323324e-07, | |
| "loss": 39.1098, | |
| "step": 1648 | |
| }, | |
| { | |
| "epoch": 5.645636509207366, | |
| "grad_norm": 4.200729846954346, | |
| "learning_rate": 4.346234362506724e-07, | |
| "loss": 40.122, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 5.645636509207366, | |
| "eval_loss": 0.6662212014198303, | |
| "eval_runtime": 137.6293, | |
| "eval_samples_per_second": 28.664, | |
| "eval_steps_per_second": 7.171, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 5.65246864157993, | |
| "grad_norm": 3.9246127605438232, | |
| "learning_rate": 4.1779364682113796e-07, | |
| "loss": 40.0725, | |
| "step": 1652 | |
| }, | |
| { | |
| "epoch": 5.659300773952495, | |
| "grad_norm": 4.904084205627441, | |
| "learning_rate": 4.012934257771134e-07, | |
| "loss": 40.0188, | |
| "step": 1654 | |
| }, | |
| { | |
| "epoch": 5.6661329063250605, | |
| "grad_norm": 4.436688423156738, | |
| "learning_rate": 3.851229943335394e-07, | |
| "loss": 39.9216, | |
| "step": 1656 | |
| }, | |
| { | |
| "epoch": 5.672965038697625, | |
| "grad_norm": 4.027088642120361, | |
| "learning_rate": 3.6928256928393247e-07, | |
| "loss": 41.4124, | |
| "step": 1658 | |
| }, | |
| { | |
| "epoch": 5.67979717107019, | |
| "grad_norm": 3.796221971511841, | |
| "learning_rate": 3.537723629974815e-07, | |
| "loss": 39.8851, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 5.686629303442754, | |
| "grad_norm": 4.7540130615234375, | |
| "learning_rate": 3.3859258341621125e-07, | |
| "loss": 40.1716, | |
| "step": 1662 | |
| }, | |
| { | |
| "epoch": 5.693461435815319, | |
| "grad_norm": 4.521333694458008, | |
| "learning_rate": 3.237434340521789e-07, | |
| "loss": 41.4182, | |
| "step": 1664 | |
| }, | |
| { | |
| "epoch": 5.700293568187884, | |
| "grad_norm": 4.776477336883545, | |
| "learning_rate": 3.0922511398475683e-07, | |
| "loss": 41.2698, | |
| "step": 1666 | |
| }, | |
| { | |
| "epoch": 5.707125700560448, | |
| "grad_norm": 4.749114990234375, | |
| "learning_rate": 2.9503781785795713e-07, | |
| "loss": 42.4175, | |
| "step": 1668 | |
| }, | |
| { | |
| "epoch": 5.713957832933013, | |
| "grad_norm": 4.831925392150879, | |
| "learning_rate": 2.8118173587782516e-07, | |
| "loss": 40.593, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 5.720789965305578, | |
| "grad_norm": 4.17523193359375, | |
| "learning_rate": 2.6765705380989437e-07, | |
| "loss": 39.8755, | |
| "step": 1672 | |
| }, | |
| { | |
| "epoch": 5.727622097678142, | |
| "grad_norm": 4.183824062347412, | |
| "learning_rate": 2.544639529766829e-07, | |
| "loss": 40.7682, | |
| "step": 1674 | |
| }, | |
| { | |
| "epoch": 5.734454230050707, | |
| "grad_norm": 4.203549385070801, | |
| "learning_rate": 2.416026102552732e-07, | |
| "loss": 40.1932, | |
| "step": 1676 | |
| }, | |
| { | |
| "epoch": 5.741286362423272, | |
| "grad_norm": 4.252909183502197, | |
| "learning_rate": 2.290731980749361e-07, | |
| "loss": 41.4024, | |
| "step": 1678 | |
| }, | |
| { | |
| "epoch": 5.748118494795837, | |
| "grad_norm": 4.110680103302002, | |
| "learning_rate": 2.168758844148272e-07, | |
| "loss": 40.8089, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 5.754950627168402, | |
| "grad_norm": 4.860687732696533, | |
| "learning_rate": 2.050108328017164e-07, | |
| "loss": 41.278, | |
| "step": 1682 | |
| }, | |
| { | |
| "epoch": 5.761782759540966, | |
| "grad_norm": 7.037466526031494, | |
| "learning_rate": 1.93478202307823e-07, | |
| "loss": 42.0162, | |
| "step": 1684 | |
| }, | |
| { | |
| "epoch": 5.768614891913531, | |
| "grad_norm": 4.048498630523682, | |
| "learning_rate": 1.8227814754865068e-07, | |
| "loss": 41.2187, | |
| "step": 1686 | |
| }, | |
| { | |
| "epoch": 5.775447024286096, | |
| "grad_norm": 3.721379518508911, | |
| "learning_rate": 1.7141081868094212e-07, | |
| "loss": 41.8383, | |
| "step": 1688 | |
| }, | |
| { | |
| "epoch": 5.78227915665866, | |
| "grad_norm": 6.793107509613037, | |
| "learning_rate": 1.6087636140065532e-07, | |
| "loss": 40.5894, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 5.789111289031225, | |
| "grad_norm": 4.424513339996338, | |
| "learning_rate": 1.5067491694100154e-07, | |
| "loss": 41.2666, | |
| "step": 1692 | |
| }, | |
| { | |
| "epoch": 5.7959434214037895, | |
| "grad_norm": 4.707203388214111, | |
| "learning_rate": 1.4080662207056894e-07, | |
| "loss": 41.2405, | |
| "step": 1694 | |
| }, | |
| { | |
| "epoch": 5.802775553776354, | |
| "grad_norm": 2.994469165802002, | |
| "learning_rate": 1.3127160909147672e-07, | |
| "loss": 42.6466, | |
| "step": 1696 | |
| }, | |
| { | |
| "epoch": 5.809607686148919, | |
| "grad_norm": 3.029481887817383, | |
| "learning_rate": 1.220700058376073e-07, | |
| "loss": 40.642, | |
| "step": 1698 | |
| }, | |
| { | |
| "epoch": 5.816439818521484, | |
| "grad_norm": 3.4690332412719727, | |
| "learning_rate": 1.1320193567288529e-07, | |
| "loss": 41.02, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 5.816439818521484, | |
| "eval_loss": 0.6652334928512573, | |
| "eval_runtime": 134.4616, | |
| "eval_samples_per_second": 29.339, | |
| "eval_steps_per_second": 7.34, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 5.823271950894049, | |
| "grad_norm": 5.008721828460693, | |
| "learning_rate": 1.0466751748963444e-07, | |
| "loss": 40.1855, | |
| "step": 1702 | |
| }, | |
| { | |
| "epoch": 5.830104083266614, | |
| "grad_norm": 5.638387680053711, | |
| "learning_rate": 9.646686570697061e-08, | |
| "loss": 40.6194, | |
| "step": 1704 | |
| }, | |
| { | |
| "epoch": 5.836936215639178, | |
| "grad_norm": 5.234898567199707, | |
| "learning_rate": 8.860009026928629e-08, | |
| "loss": 40.6608, | |
| "step": 1706 | |
| }, | |
| { | |
| "epoch": 5.843768348011743, | |
| "grad_norm": 4.212846279144287, | |
| "learning_rate": 8.106729664475176e-08, | |
| "loss": 41.4097, | |
| "step": 1708 | |
| }, | |
| { | |
| "epoch": 5.8506004803843075, | |
| "grad_norm": 3.5884008407592773, | |
| "learning_rate": 7.386858582392187e-08, | |
| "loss": 39.4515, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 5.857432612756872, | |
| "grad_norm": 4.441662788391113, | |
| "learning_rate": 6.700405431837587e-08, | |
| "loss": 41.8026, | |
| "step": 1712 | |
| }, | |
| { | |
| "epoch": 5.864264745129437, | |
| "grad_norm": 5.290170192718506, | |
| "learning_rate": 6.047379415941856e-08, | |
| "loss": 40.8839, | |
| "step": 1714 | |
| }, | |
| { | |
| "epoch": 5.871096877502001, | |
| "grad_norm": 3.4507861137390137, | |
| "learning_rate": 5.4277892896853476e-08, | |
| "loss": 40.574, | |
| "step": 1716 | |
| }, | |
| { | |
| "epoch": 5.877929009874566, | |
| "grad_norm": 3.869871139526367, | |
| "learning_rate": 4.8416433597803234e-08, | |
| "loss": 41.8288, | |
| "step": 1718 | |
| }, | |
| { | |
| "epoch": 5.884761142247131, | |
| "grad_norm": 4.644185543060303, | |
| "learning_rate": 4.2889494845599344e-08, | |
| "loss": 41.318, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 5.891593274619696, | |
| "grad_norm": 3.191018581390381, | |
| "learning_rate": 3.769715073872748e-08, | |
| "loss": 41.1112, | |
| "step": 1722 | |
| }, | |
| { | |
| "epoch": 5.898425406992261, | |
| "grad_norm": 3.394134998321533, | |
| "learning_rate": 3.283947088983663e-08, | |
| "loss": 41.9932, | |
| "step": 1724 | |
| }, | |
| { | |
| "epoch": 5.9052575393648254, | |
| "grad_norm": 4.62444543838501, | |
| "learning_rate": 2.831652042480093e-08, | |
| "loss": 39.9583, | |
| "step": 1726 | |
| }, | |
| { | |
| "epoch": 5.91208967173739, | |
| "grad_norm": 4.27966833114624, | |
| "learning_rate": 2.4128359981850924e-08, | |
| "loss": 39.915, | |
| "step": 1728 | |
| }, | |
| { | |
| "epoch": 5.918921804109955, | |
| "grad_norm": 3.7036333084106445, | |
| "learning_rate": 2.0275045710760334e-08, | |
| "loss": 40.0384, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 5.925753936482519, | |
| "grad_norm": 5.249677658081055, | |
| "learning_rate": 1.6756629272085545e-08, | |
| "loss": 40.1564, | |
| "step": 1732 | |
| }, | |
| { | |
| "epoch": 5.932586068855084, | |
| "grad_norm": 4.477707862854004, | |
| "learning_rate": 1.3573157836485606e-08, | |
| "loss": 40.6008, | |
| "step": 1734 | |
| }, | |
| { | |
| "epoch": 5.939418201227649, | |
| "grad_norm": 4.939481258392334, | |
| "learning_rate": 1.0724674084083841e-08, | |
| "loss": 40.9639, | |
| "step": 1736 | |
| }, | |
| { | |
| "epoch": 5.946250333600213, | |
| "grad_norm": 2.9428999423980713, | |
| "learning_rate": 8.211216203890537e-09, | |
| "loss": 40.9722, | |
| "step": 1738 | |
| }, | |
| { | |
| "epoch": 5.953082465972778, | |
| "grad_norm": 4.589330673217773, | |
| "learning_rate": 6.032817893297793e-09, | |
| "loss": 41.4832, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 5.9599145983453425, | |
| "grad_norm": 5.4429450035095215, | |
| "learning_rate": 4.1895083576271035e-09, | |
| "loss": 41.8059, | |
| "step": 1742 | |
| }, | |
| { | |
| "epoch": 5.966746730717908, | |
| "grad_norm": 3.5152432918548584, | |
| "learning_rate": 2.681312309735229e-09, | |
| "loss": 41.2228, | |
| "step": 1744 | |
| }, | |
| { | |
| "epoch": 5.973578863090473, | |
| "grad_norm": 4.573424339294434, | |
| "learning_rate": 1.5082499696839059e-09, | |
| "loss": 41.9849, | |
| "step": 1746 | |
| }, | |
| { | |
| "epoch": 5.980410995463037, | |
| "grad_norm": 4.099581718444824, | |
| "learning_rate": 6.703370644706164e-10, | |
| "loss": 40.6948, | |
| "step": 1748 | |
| }, | |
| { | |
| "epoch": 5.987243127835602, | |
| "grad_norm": 4.090056896209717, | |
| "learning_rate": 1.6758482781209507e-10, | |
| "loss": 40.9226, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 5.987243127835602, | |
| "eval_loss": 0.6658891439437866, | |
| "eval_runtime": 134.1369, | |
| "eval_samples_per_second": 29.41, | |
| "eval_steps_per_second": 7.358, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 5.994075260208167, | |
| "grad_norm": 4.494061470031738, | |
| "learning_rate": 0.0, | |
| "loss": 41.0993, | |
| "step": 1752 | |
| } | |
| ], | |
| "logging_steps": 2, | |
| "max_steps": 1752, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 6, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 3, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 1 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 9.616163439072248e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
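
The file above is a standard Hugging Face Trainer `trainer_state.json`: `log_history` holds one entry per logging step (with `loss`, `grad_norm`, `learning_rate`) plus periodic evaluation entries (with `eval_loss` and throughput stats), and the top-level fields record the best checkpoint and stopping state. As a minimal sketch (not part of the original log), the snippet below parses such a file and summarizes the run; the `trainer_state.json` path is a placeholder assumption, and only the keys visible in this file are used.

```python
# Minimal sketch: summarize a Hugging Face Trainer state file like the one above.
# The path is a placeholder; point it at your own trainer_state.json.
import json

STATE_PATH = "trainer_state.json"  # hypothetical path, adjust as needed

with open(STATE_PATH) as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"trained {state['epoch']:.2f} epochs over {state['global_step']} steps")
print(f"final train loss: {train_logs[-1]['loss']:.4f}")

# The lowest eval_loss should coincide with the reported best_metric
# and the step encoded in best_model_checkpoint.
best = min(eval_logs, key=lambda e: e["eval_loss"])
print(f"best eval_loss {best['eval_loss']:.4f} at step {best['step']} "
      f"(reported best_metric: {state['best_metric']:.4f})")
```

Run against this log, the sketch would report a best `eval_loss` of 0.6652 at step 1700, consistent with `best_model_checkpoint` ending in `checkpoint-1700`; the slight uptick to 0.6659 at step 1750 explains the `early_stopping_patience_counter` of 1 before training ended at `max_steps` 1752.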