{
  "best_metric": 0.6168031096458435,
  "best_model_checkpoint": "/Lora_models/checkpoint-5000",
  "epoch": 0.9752438109527382,
  "eval_steps": 200,
  "global_step": 5200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.0018754688672168042, "grad_norm": 0.9346357583999634, "learning_rate": 2.7e-06, "loss": 2.5803, "step": 10 },
    { "epoch": 0.0037509377344336083, "grad_norm": 1.251865267753601, "learning_rate": 5.7000000000000005e-06, "loss": 2.4539, "step": 20 },
    { "epoch": 0.005626406601650412, "grad_norm": 1.055527687072754, "learning_rate": 8.7e-06, "loss": 2.5107, "step": 30 },
    { "epoch": 0.007501875468867217, "grad_norm": 1.1238517761230469, "learning_rate": 1.1700000000000001e-05, "loss": 2.4512, "step": 40 },
    { "epoch": 0.009377344336084021, "grad_norm": 1.20820152759552, "learning_rate": 1.47e-05, "loss": 2.4078, "step": 50 },
    { "epoch": 0.011252813203300824, "grad_norm": 2.085667133331299, "learning_rate": 1.77e-05, "loss": 2.2828, "step": 60 },
    { "epoch": 0.01312828207051763, "grad_norm": 1.357437252998352, "learning_rate": 2.07e-05, "loss": 2.0614, "step": 70 },
    { "epoch": 0.015003750937734433, "grad_norm": 1.6979730129241943, "learning_rate": 2.37e-05, "loss": 1.9482, "step": 80 },
    { "epoch": 0.01687921980495124, "grad_norm": 1.7537882328033447, "learning_rate": 2.6700000000000002e-05, "loss": 1.775, "step": 90 },
    { "epoch": 0.018754688672168042, "grad_norm": 1.8001635074615479, "learning_rate": 2.97e-05, "loss": 1.6462, "step": 100 },
    { "epoch": 0.020630157539384845, "grad_norm": 1.744318962097168, "learning_rate": 2.9948394495412847e-05, "loss": 1.3796, "step": 110 },
    { "epoch": 0.02250562640660165, "grad_norm": 1.6778439283370972, "learning_rate": 2.989105504587156e-05, "loss": 1.2642, "step": 120 },
    { "epoch": 0.024381095273818456, "grad_norm": 2.0165181159973145, "learning_rate": 2.9833715596330273e-05, "loss": 0.9405, "step": 130 },
    { "epoch": 0.02625656414103526, "grad_norm": 1.1453020572662354, "learning_rate": 2.9776376146788993e-05, "loss": 0.8365, "step": 140 },
    { "epoch": 0.028132033008252063, "grad_norm": 0.9237515330314636, "learning_rate": 2.9719036697247706e-05, "loss": 0.9324, "step": 150 },
    { "epoch": 0.030007501875468866, "grad_norm": 1.1040199995040894, "learning_rate": 2.9661697247706423e-05, "loss": 0.8905, "step": 160 },
    { "epoch": 0.03188297074268567, "grad_norm": 1.052095651626587, "learning_rate": 2.9604357798165136e-05, "loss": 0.7226, "step": 170 },
    { "epoch": 0.03375843960990248, "grad_norm": 1.4509942531585693, "learning_rate": 2.9547018348623853e-05, "loss": 0.7401, "step": 180 },
    { "epoch": 0.03563390847711928, "grad_norm": 1.3202496767044067, "learning_rate": 2.948967889908257e-05, "loss": 0.6498, "step": 190 },
    { "epoch": 0.037509377344336084, "grad_norm": 1.257535457611084, "learning_rate": 2.9432339449541286e-05, "loss": 0.8931, "step": 200 },
    { "epoch": 0.037509377344336084, "eval_loss": 0.8721055388450623, "eval_runtime": 5.4276, "eval_samples_per_second": 22.109, "eval_steps_per_second": 2.764, "step": 200 },
    { "epoch": 0.03938484621155289, "grad_norm": 1.1922121047973633, "learning_rate": 2.9375e-05, "loss": 0.9556, "step": 210 },
    { "epoch": 0.04126031507876969, "grad_norm": 1.1076873540878296, "learning_rate": 2.9317660550458716e-05, "loss": 0.8148, "step": 220 },
    { "epoch": 0.043135783945986494, "grad_norm": 1.3961315155029297, "learning_rate": 2.9260321100917432e-05, "loss": 0.7225, "step": 230 },
    { "epoch": 0.0450112528132033, "grad_norm": 1.3916189670562744, "learning_rate": 2.920298165137615e-05, "loss": 0.8899, "step": 240 },
    { "epoch": 0.04688672168042011, "grad_norm": 1.4073107242584229, "learning_rate": 2.9145642201834862e-05, "loss": 0.7895, "step": 250 },
    { "epoch": 0.04876219054763691, "grad_norm": 1.4054927825927734, "learning_rate": 2.908830275229358e-05, "loss": 0.8662, "step": 260 },
    { "epoch": 0.050637659414853715, "grad_norm": 1.0531301498413086, "learning_rate": 2.9030963302752292e-05, "loss": 0.7619, "step": 270 },
    { "epoch": 0.05251312828207052, "grad_norm": 1.3712563514709473, "learning_rate": 2.8973623853211012e-05, "loss": 0.8608, "step": 280 },
    { "epoch": 0.05438859714928732, "grad_norm": 1.4002102613449097, "learning_rate": 2.8916284403669725e-05, "loss": 0.7368, "step": 290 },
    { "epoch": 0.056264066016504126, "grad_norm": 1.381103754043579, "learning_rate": 2.8858944954128442e-05, "loss": 0.8059, "step": 300 },
    { "epoch": 0.05813953488372093, "grad_norm": 1.9642342329025269, "learning_rate": 2.8801605504587155e-05, "loss": 0.8149, "step": 310 },
    { "epoch": 0.06001500375093773, "grad_norm": 1.4538882970809937, "learning_rate": 2.8744266055045875e-05, "loss": 0.6891, "step": 320 },
    { "epoch": 0.061890472618154536, "grad_norm": 1.47075617313385, "learning_rate": 2.8686926605504588e-05, "loss": 0.8458, "step": 330 },
    { "epoch": 0.06376594148537135, "grad_norm": 1.1955201625823975, "learning_rate": 2.8629587155963305e-05, "loss": 0.7404, "step": 340 },
    { "epoch": 0.06564141035258815, "grad_norm": 1.151567816734314, "learning_rate": 2.8572247706422018e-05, "loss": 0.6032, "step": 350 },
    { "epoch": 0.06751687921980495, "grad_norm": 1.3864506483078003, "learning_rate": 2.8514908256880738e-05, "loss": 0.7951, "step": 360 },
    { "epoch": 0.06939234808702176, "grad_norm": 1.3162195682525635, "learning_rate": 2.845756880733945e-05, "loss": 0.698, "step": 370 },
    { "epoch": 0.07126781695423856, "grad_norm": 1.0740330219268799, "learning_rate": 2.8400229357798164e-05, "loss": 0.7008, "step": 380 },
    { "epoch": 0.07314328582145536, "grad_norm": 1.4195873737335205, "learning_rate": 2.834288990825688e-05, "loss": 0.7324, "step": 390 },
    { "epoch": 0.07501875468867217, "grad_norm": 1.6425796747207642, "learning_rate": 2.8285550458715594e-05, "loss": 0.6954, "step": 400 },
    { "epoch": 0.07501875468867217, "eval_loss": 0.790172815322876, "eval_runtime": 5.5372, "eval_samples_per_second": 21.671, "eval_steps_per_second": 2.709, "step": 400 },
    { "epoch": 0.07689422355588897, "grad_norm": 1.4094221591949463, "learning_rate": 2.8228211009174314e-05, "loss": 0.6533, "step": 410 },
    { "epoch": 0.07876969242310577, "grad_norm": 1.7052984237670898, "learning_rate": 2.8170871559633027e-05, "loss": 0.8291, "step": 420 },
    { "epoch": 0.08064516129032258, "grad_norm": 1.766396164894104, "learning_rate": 2.8113532110091744e-05, "loss": 0.5917, "step": 430 },
    { "epoch": 0.08252063015753938, "grad_norm": 1.3280906677246094, "learning_rate": 2.8056192660550457e-05, "loss": 0.5834, "step": 440 },
    { "epoch": 0.08439609902475619, "grad_norm": 1.472038984298706, "learning_rate": 2.7998853211009177e-05, "loss": 0.6189, "step": 450 },
    { "epoch": 0.08627156789197299, "grad_norm": 2.434629440307617, "learning_rate": 2.794151376146789e-05, "loss": 0.6107, "step": 460 },
    { "epoch": 0.08814703675918979, "grad_norm": 1.7748132944107056, "learning_rate": 2.7884174311926607e-05, "loss": 0.508, "step": 470 },
    { "epoch": 0.0900225056264066, "grad_norm": 1.7380709648132324, "learning_rate": 2.782683486238532e-05, "loss": 0.6482, "step": 480 },
    { "epoch": 0.0918979744936234, "grad_norm": 1.1493253707885742, "learning_rate": 2.7769495412844037e-05, "loss": 0.6531, "step": 490 },
    { "epoch": 0.09377344336084022, "grad_norm": 1.384508728981018, "learning_rate": 2.7712155963302753e-05, "loss": 0.7061, "step": 500 },
    { "epoch": 0.09564891222805702, "grad_norm": 1.792687177658081, "learning_rate": 2.765481651376147e-05, "loss": 0.6, "step": 510 },
    { "epoch": 0.09752438109527382, "grad_norm": 1.657291054725647, "learning_rate": 2.7597477064220183e-05, "loss": 0.612, "step": 520 },
    { "epoch": 0.09939984996249063, "grad_norm": 1.2928940057754517, "learning_rate": 2.75401376146789e-05, "loss": 0.7446, "step": 530 },
    { "epoch": 0.10127531882970743, "grad_norm": 1.3647221326828003, "learning_rate": 2.7482798165137616e-05, "loss": 0.6422, "step": 540 },
    { "epoch": 0.10315078769692423, "grad_norm": 1.7979224920272827, "learning_rate": 2.7425458715596333e-05, "loss": 0.5261, "step": 550 },
    { "epoch": 0.10502625656414104, "grad_norm": 1.4330195188522339, "learning_rate": 2.7368119266055046e-05, "loss": 0.6801, "step": 560 },
    { "epoch": 0.10690172543135784, "grad_norm": 1.4820642471313477, "learning_rate": 2.7310779816513763e-05, "loss": 0.6767, "step": 570 },
    { "epoch": 0.10877719429857464, "grad_norm": 1.6445374488830566, "learning_rate": 2.7253440366972476e-05, "loss": 0.6727, "step": 580 },
    { "epoch": 0.11065266316579145, "grad_norm": 1.5297715663909912, "learning_rate": 2.7196100917431196e-05, "loss": 0.5425, "step": 590 },
    { "epoch": 0.11252813203300825, "grad_norm": 2.9595024585723877, "learning_rate": 2.713876146788991e-05, "loss": 0.5758, "step": 600 },
    { "epoch": 0.11252813203300825, "eval_loss": 0.7191774249076843, "eval_runtime": 5.5393, "eval_samples_per_second": 21.663, "eval_steps_per_second": 2.708, "step": 600 },
    { "epoch": 0.11440360090022506, "grad_norm": 1.300517201423645, "learning_rate": 2.7081422018348626e-05, "loss": 0.6598, "step": 610 },
    { "epoch": 0.11627906976744186, "grad_norm": 1.4118940830230713, "learning_rate": 2.702408256880734e-05, "loss": 0.7422, "step": 620 },
    { "epoch": 0.11815453863465866, "grad_norm": 1.1897056102752686, "learning_rate": 2.6966743119266055e-05, "loss": 0.6301, "step": 630 },
    { "epoch": 0.12003000750187547, "grad_norm": 1.3802241086959839, "learning_rate": 2.6909403669724772e-05, "loss": 0.7331, "step": 640 },
    { "epoch": 0.12190547636909227, "grad_norm": 1.3182717561721802, "learning_rate": 2.6852064220183485e-05, "loss": 0.6309, "step": 650 },
    { "epoch": 0.12378094523630907, "grad_norm": 1.1293184757232666, "learning_rate": 2.6794724770642202e-05, "loss": 0.657, "step": 660 },
    { "epoch": 0.12565641410352588, "grad_norm": 1.525172472000122, "learning_rate": 2.6737385321100915e-05, "loss": 0.7025, "step": 670 },
    { "epoch": 0.1275318829707427, "grad_norm": 1.1699976921081543, "learning_rate": 2.6680045871559635e-05, "loss": 0.6213, "step": 680 },
    { "epoch": 0.12940735183795948, "grad_norm": 1.5956225395202637, "learning_rate": 2.6622706422018348e-05, "loss": 0.6628, "step": 690 },
    { "epoch": 0.1312828207051763, "grad_norm": 1.3606040477752686, "learning_rate": 2.6565366972477065e-05, "loss": 0.6042, "step": 700 },
    { "epoch": 0.1331582895723931, "grad_norm": 1.310490369796753, "learning_rate": 2.6508027522935778e-05, "loss": 0.6976, "step": 710 },
    { "epoch": 0.1350337584396099, "grad_norm": 1.0546784400939941, "learning_rate": 2.6450688073394498e-05, "loss": 0.6681, "step": 720 },
    { "epoch": 0.1369092273068267, "grad_norm": 1.977148175239563, "learning_rate": 2.639334862385321e-05, "loss": 0.723, "step": 730 },
    { "epoch": 0.13878469617404351, "grad_norm": 1.526698350906372, "learning_rate": 2.6336009174311928e-05, "loss": 0.6595, "step": 740 },
    { "epoch": 0.1406601650412603, "grad_norm": 1.302213191986084, "learning_rate": 2.627866972477064e-05, "loss": 0.6826, "step": 750 },
    { "epoch": 0.14253563390847712, "grad_norm": 1.4109597206115723, "learning_rate": 2.6221330275229358e-05, "loss": 0.679, "step": 760 },
    { "epoch": 0.1444111027756939, "grad_norm": 1.390410304069519, "learning_rate": 2.6163990825688074e-05, "loss": 0.5305, "step": 770 },
    { "epoch": 0.14628657164291073, "grad_norm": 1.778104543685913, "learning_rate": 2.610665137614679e-05, "loss": 0.5224, "step": 780 },
    { "epoch": 0.14816204051012752, "grad_norm": 1.4633052349090576, "learning_rate": 2.6049311926605504e-05, "loss": 0.6479, "step": 790 },
    { "epoch": 0.15003750937734434, "grad_norm": 1.4608770608901978, "learning_rate": 2.599197247706422e-05, "loss": 0.6688, "step": 800 },
    { "epoch": 0.15003750937734434, "eval_loss": 0.7000935673713684, "eval_runtime": 5.5538, "eval_samples_per_second": 21.607, "eval_steps_per_second": 2.701, "step": 800 },
    { "epoch": 0.15191297824456115, "grad_norm": 1.419257640838623, "learning_rate": 2.5934633027522937e-05, "loss": 0.4972, "step": 810 },
    { "epoch": 0.15378844711177794, "grad_norm": 1.383324146270752, "learning_rate": 2.5877293577981654e-05, "loss": 0.6802, "step": 820 },
    { "epoch": 0.15566391597899476, "grad_norm": 1.4699604511260986, "learning_rate": 2.5819954128440367e-05, "loss": 0.5997, "step": 830 },
    { "epoch": 0.15753938484621155, "grad_norm": 1.7882672548294067, "learning_rate": 2.5762614678899084e-05, "loss": 0.5495, "step": 840 },
    { "epoch": 0.15941485371342837, "grad_norm": 1.4856760501861572, "learning_rate": 2.57052752293578e-05, "loss": 0.6453, "step": 850 },
    { "epoch": 0.16129032258064516, "grad_norm": 1.4340794086456299, "learning_rate": 2.5647935779816517e-05, "loss": 0.6263, "step": 860 },
    { "epoch": 0.16316579144786197, "grad_norm": 1.7416778802871704, "learning_rate": 2.559059633027523e-05, "loss": 0.5994, "step": 870 },
    { "epoch": 0.16504126031507876, "grad_norm": 1.2337450981140137, "learning_rate": 2.5533256880733947e-05, "loss": 0.6288, "step": 880 },
    { "epoch": 0.16691672918229558, "grad_norm": 1.6339170932769775, "learning_rate": 2.547591743119266e-05, "loss": 0.6181, "step": 890 },
    { "epoch": 0.16879219804951237, "grad_norm": 1.7602070569992065, "learning_rate": 2.5418577981651376e-05, "loss": 0.5742, "step": 900 },
    { "epoch": 0.1706676669167292, "grad_norm": 1.3282010555267334, "learning_rate": 2.5361238532110093e-05, "loss": 0.6869, "step": 910 },
    { "epoch": 0.17254313578394598, "grad_norm": 1.471281886100769, "learning_rate": 2.5303899082568806e-05, "loss": 0.6788, "step": 920 },
    { "epoch": 0.1744186046511628, "grad_norm": 1.4900857210159302, "learning_rate": 2.5246559633027523e-05, "loss": 0.5706, "step": 930 },
    { "epoch": 0.17629407351837958, "grad_norm": 1.903571605682373, "learning_rate": 2.518922018348624e-05, "loss": 0.7056, "step": 940 },
    { "epoch": 0.1781695423855964, "grad_norm": 1.7407480478286743, "learning_rate": 2.5131880733944956e-05, "loss": 0.5102, "step": 950 },
    { "epoch": 0.1800450112528132, "grad_norm": 1.6534910202026367, "learning_rate": 2.507454128440367e-05, "loss": 0.6654, "step": 960 },
    { "epoch": 0.18192048012003, "grad_norm": 1.7608113288879395, "learning_rate": 2.5017201834862386e-05, "loss": 0.6793, "step": 970 },
    { "epoch": 0.1837959489872468, "grad_norm": 1.737579107284546, "learning_rate": 2.49598623853211e-05, "loss": 0.5787, "step": 980 },
    { "epoch": 0.18567141785446362, "grad_norm": 1.7096258401870728, "learning_rate": 2.490252293577982e-05, "loss": 0.7254, "step": 990 },
    { "epoch": 0.18754688672168043, "grad_norm": 1.7333779335021973, "learning_rate": 2.4845183486238532e-05, "loss": 0.7332, "step": 1000 },
    { "epoch": 0.18754688672168043, "eval_loss": 0.6944581270217896, "eval_runtime": 5.5505, "eval_samples_per_second": 21.62, "eval_steps_per_second": 2.702, "step": 1000 },
    { "epoch": 0.18942235558889722, "grad_norm": 1.235772967338562, "learning_rate": 2.478784403669725e-05, "loss": 0.684, "step": 1010 },
    { "epoch": 0.19129782445611404, "grad_norm": 1.3532825708389282, "learning_rate": 2.4730504587155962e-05, "loss": 0.5977, "step": 1020 },
    { "epoch": 0.19317329332333083, "grad_norm": 1.7143605947494507, "learning_rate": 2.4673165137614682e-05, "loss": 0.6109, "step": 1030 },
    { "epoch": 0.19504876219054765, "grad_norm": 1.4067035913467407, "learning_rate": 2.4615825688073395e-05, "loss": 0.6823, "step": 1040 },
    { "epoch": 0.19692423105776444, "grad_norm": 1.713149070739746, "learning_rate": 2.4558486238532112e-05, "loss": 0.5852, "step": 1050 },
    { "epoch": 0.19879969992498125, "grad_norm": 1.5876083374023438, "learning_rate": 2.4501146788990825e-05, "loss": 0.6451, "step": 1060 },
    { "epoch": 0.20067516879219804, "grad_norm": 1.3769148588180542, "learning_rate": 2.444380733944954e-05, "loss": 0.5712, "step": 1070 },
    { "epoch": 0.20255063765941486, "grad_norm": 1.408957839012146, "learning_rate": 2.4386467889908258e-05, "loss": 0.5704, "step": 1080 },
    { "epoch": 0.20442610652663165, "grad_norm": 1.3725367784500122, "learning_rate": 2.4329128440366975e-05, "loss": 0.4405, "step": 1090 },
    { "epoch": 0.20630157539384847, "grad_norm": 1.6715686321258545, "learning_rate": 2.4271788990825688e-05, "loss": 0.5433, "step": 1100 },
    { "epoch": 0.20817704426106526, "grad_norm": 1.4740937948226929, "learning_rate": 2.4214449541284405e-05, "loss": 0.6363, "step": 1110 },
    { "epoch": 0.21005251312828208, "grad_norm": 1.640724778175354, "learning_rate": 2.415711009174312e-05, "loss": 0.6302, "step": 1120 },
    { "epoch": 0.21192798199549887, "grad_norm": 1.8040657043457031, "learning_rate": 2.4099770642201838e-05, "loss": 0.6726, "step": 1130 },
    { "epoch": 0.21380345086271568, "grad_norm": 1.6836594343185425, "learning_rate": 2.404243119266055e-05, "loss": 0.6706, "step": 1140 },
    { "epoch": 0.21567891972993247, "grad_norm": 1.9649243354797363, "learning_rate": 2.3985091743119264e-05, "loss": 0.5189, "step": 1150 },
    { "epoch": 0.2175543885971493, "grad_norm": 1.5541070699691772, "learning_rate": 2.392775229357798e-05, "loss": 0.7497, "step": 1160 },
    { "epoch": 0.21942985746436608, "grad_norm": 1.9473050832748413, "learning_rate": 2.3870412844036697e-05, "loss": 0.5373, "step": 1170 },
    { "epoch": 0.2213053263315829, "grad_norm": 1.8983582258224487, "learning_rate": 2.3813073394495414e-05, "loss": 0.6523, "step": 1180 },
    { "epoch": 0.2231807951987997, "grad_norm": 1.6753871440887451, "learning_rate": 2.3755733944954127e-05, "loss": 0.6294, "step": 1190 },
    { "epoch": 0.2250562640660165, "grad_norm": 1.706829309463501, "learning_rate": 2.3698394495412844e-05, "loss": 0.6741, "step": 1200 },
    { "epoch": 0.2250562640660165, "eval_loss": 0.6810731887817383, "eval_runtime": 5.4298, "eval_samples_per_second": 22.1, "eval_steps_per_second": 2.763, "step": 1200 },
    { "epoch": 0.22693173293323332, "grad_norm": 1.6627461910247803, "learning_rate": 2.364105504587156e-05, "loss": 0.6354, "step": 1210 },
    { "epoch": 0.2288072018004501, "grad_norm": 2.973555564880371, "learning_rate": 2.3583715596330277e-05, "loss": 0.5482, "step": 1220 },
    { "epoch": 0.23068267066766693, "grad_norm": 1.9262856245040894, "learning_rate": 2.352637614678899e-05, "loss": 0.7558, "step": 1230 },
    { "epoch": 0.23255813953488372, "grad_norm": 1.816595435142517, "learning_rate": 2.3469036697247707e-05, "loss": 0.6039, "step": 1240 },
    { "epoch": 0.23443360840210054, "grad_norm": 1.9634557962417603, "learning_rate": 2.341169724770642e-05, "loss": 0.5126, "step": 1250 },
    { "epoch": 0.23630907726931732, "grad_norm": 1.7136008739471436, "learning_rate": 2.335435779816514e-05, "loss": 0.6365, "step": 1260 },
    { "epoch": 0.23818454613653414, "grad_norm": 1.4523965120315552, "learning_rate": 2.3297018348623853e-05, "loss": 0.5995, "step": 1270 },
    { "epoch": 0.24006001500375093, "grad_norm": 1.6242806911468506, "learning_rate": 2.323967889908257e-05, "loss": 0.6412, "step": 1280 },
    { "epoch": 0.24193548387096775, "grad_norm": 1.7888171672821045, "learning_rate": 2.3182339449541283e-05, "loss": 0.6565, "step": 1290 },
    { "epoch": 0.24381095273818454, "grad_norm": 1.6343475580215454, "learning_rate": 2.3125000000000003e-05, "loss": 0.6212, "step": 1300 },
    { "epoch": 0.24568642160540136, "grad_norm": 1.3897461891174316, "learning_rate": 2.3067660550458716e-05, "loss": 0.5839, "step": 1310 },
    { "epoch": 0.24756189047261815, "grad_norm": 1.502485752105713, "learning_rate": 2.3010321100917433e-05, "loss": 0.6725, "step": 1320 },
    { "epoch": 0.24943735933983496, "grad_norm": 1.3770966529846191, "learning_rate": 2.2952981651376146e-05, "loss": 0.5998, "step": 1330 },
    { "epoch": 0.25131282820705175, "grad_norm": 1.7012661695480347, "learning_rate": 2.2895642201834863e-05, "loss": 0.6668, "step": 1340 },
    { "epoch": 0.25318829707426854, "grad_norm": 1.747942566871643, "learning_rate": 2.283830275229358e-05, "loss": 0.6948, "step": 1350 },
    { "epoch": 0.2550637659414854, "grad_norm": 1.4288934469223022, "learning_rate": 2.2780963302752296e-05, "loss": 0.6063, "step": 1360 },
    { "epoch": 0.2569392348087022, "grad_norm": 1.6301014423370361, "learning_rate": 2.272362385321101e-05, "loss": 0.677, "step": 1370 },
    { "epoch": 0.25881470367591897, "grad_norm": 1.3200469017028809, "learning_rate": 2.2666284403669726e-05, "loss": 0.653, "step": 1380 },
    { "epoch": 0.26069017254313576, "grad_norm": 1.5794614553451538, "learning_rate": 2.2608944954128442e-05, "loss": 0.6477, "step": 1390 },
    { "epoch": 0.2625656414103526, "grad_norm": 1.5092536211013794, "learning_rate": 2.2551605504587155e-05, "loss": 0.6202, "step": 1400 },
    { "epoch": 0.2625656414103526, "eval_loss": 0.6741260290145874, "eval_runtime": 5.5003, "eval_samples_per_second": 21.817, "eval_steps_per_second": 2.727, "step": 1400 },
    { "epoch": 0.2644411102775694, "grad_norm": 1.5101447105407715, "learning_rate": 2.2494266055045872e-05, "loss": 0.583, "step": 1410 },
    { "epoch": 0.2663165791447862, "grad_norm": 1.5355420112609863, "learning_rate": 2.2436926605504585e-05, "loss": 0.5422, "step": 1420 },
    { "epoch": 0.268192048012003, "grad_norm": 1.5322073698043823, "learning_rate": 2.2379587155963305e-05, "loss": 0.6067, "step": 1430 },
    { "epoch": 0.2700675168792198, "grad_norm": 1.5003911256790161, "learning_rate": 2.232224770642202e-05, "loss": 0.5578, "step": 1440 },
    { "epoch": 0.2719429857464366, "grad_norm": 1.4054975509643555, "learning_rate": 2.2264908256880735e-05, "loss": 0.5819, "step": 1450 },
    { "epoch": 0.2738184546136534, "grad_norm": 1.7100839614868164, "learning_rate": 2.2207568807339448e-05, "loss": 0.7076, "step": 1460 },
    { "epoch": 0.27569392348087024, "grad_norm": 1.6358684301376343, "learning_rate": 2.2150229357798165e-05, "loss": 0.5748, "step": 1470 },
    { "epoch": 0.27756939234808703, "grad_norm": 1.8648029565811157, "learning_rate": 2.209288990825688e-05, "loss": 0.5488, "step": 1480 },
    { "epoch": 0.2794448612153038, "grad_norm": 2.0715155601501465, "learning_rate": 2.2035550458715598e-05, "loss": 0.6121, "step": 1490 },
    { "epoch": 0.2813203300825206, "grad_norm": 1.4680354595184326, "learning_rate": 2.197821100917431e-05, "loss": 0.5775, "step": 1500 },
    { "epoch": 0.28319579894973745, "grad_norm": 1.646637201309204, "learning_rate": 2.1920871559633028e-05, "loss": 0.6433, "step": 1510 },
    { "epoch": 0.28507126781695424, "grad_norm": 1.9596463441848755, "learning_rate": 2.1863532110091744e-05, "loss": 0.6534, "step": 1520 },
    { "epoch": 0.28694673668417103, "grad_norm": 2.375546455383301, "learning_rate": 2.180619266055046e-05, "loss": 0.5802, "step": 1530 },
    { "epoch": 0.2888222055513878, "grad_norm": 1.2877148389816284, "learning_rate": 2.1748853211009174e-05, "loss": 0.6626, "step": 1540 },
    { "epoch": 0.29069767441860467, "grad_norm": 1.3704779148101807, "learning_rate": 2.169151376146789e-05, "loss": 0.7177, "step": 1550 },
    { "epoch": 0.29257314328582146, "grad_norm": 1.9320201873779297, "learning_rate": 2.1634174311926604e-05, "loss": 0.6648, "step": 1560 },
    { "epoch": 0.29444861215303825, "grad_norm": 2.351738452911377, "learning_rate": 2.1576834862385324e-05, "loss": 0.5823, "step": 1570 },
    { "epoch": 0.29632408102025504, "grad_norm": 1.6075841188430786, "learning_rate": 2.1519495412844037e-05, "loss": 0.6363, "step": 1580 },
    { "epoch": 0.2981995498874719, "grad_norm": 1.7780178785324097, "learning_rate": 2.1462155963302754e-05, "loss": 0.7098, "step": 1590 },
    { "epoch": 0.30007501875468867, "grad_norm": 1.8664710521697998, "learning_rate": 2.1404816513761467e-05, "loss": 0.5582, "step": 1600 },
    { "epoch": 0.30007501875468867, "eval_loss": 0.6711069345474243, "eval_runtime": 5.5099, "eval_samples_per_second": 21.779, "eval_steps_per_second": 2.722, "step": 1600 },
    { "epoch": 0.30195048762190546, "grad_norm": 1.7083989381790161, "learning_rate": 2.1347477064220187e-05, "loss": 0.6628, "step": 1610 },
    { "epoch": 0.3038259564891223, "grad_norm": 1.7052229642868042, "learning_rate": 2.12901376146789e-05, "loss": 0.5975, "step": 1620 },
    { "epoch": 0.3057014253563391, "grad_norm": 1.5098538398742676, "learning_rate": 2.1232798165137617e-05, "loss": 0.5729, "step": 1630 },
    { "epoch": 0.3075768942235559, "grad_norm": 1.6489193439483643, "learning_rate": 2.117545871559633e-05, "loss": 0.5064, "step": 1640 },
    { "epoch": 0.3094523630907727, "grad_norm": 1.9127089977264404, "learning_rate": 2.1118119266055043e-05, "loss": 0.582, "step": 1650 },
    { "epoch": 0.3113278319579895, "grad_norm": 1.801680326461792, "learning_rate": 2.1060779816513763e-05, "loss": 0.6894, "step": 1660 },
    { "epoch": 0.3132033008252063, "grad_norm": 1.622673511505127, "learning_rate": 2.1003440366972476e-05, "loss": 0.6085, "step": 1670 },
    { "epoch": 0.3150787696924231, "grad_norm": 1.9467750787734985, "learning_rate": 2.0946100917431193e-05, "loss": 0.6822, "step": 1680 },
    { "epoch": 0.3169542385596399, "grad_norm": 1.5031330585479736, "learning_rate": 2.0888761467889906e-05, "loss": 0.6694, "step": 1690 },
    { "epoch": 0.31882970742685673, "grad_norm": 1.68521249294281, "learning_rate": 2.0831422018348626e-05, "loss": 0.6556, "step": 1700 },
    { "epoch": 0.3207051762940735, "grad_norm": 1.8257548809051514, "learning_rate": 2.077408256880734e-05, "loss": 0.5684, "step": 1710 },
    { "epoch": 0.3225806451612903, "grad_norm": 1.6865085363388062, "learning_rate": 2.0716743119266056e-05, "loss": 0.6543, "step": 1720 },
    { "epoch": 0.3244561140285071, "grad_norm": 1.7781134843826294, "learning_rate": 2.065940366972477e-05, "loss": 0.5249, "step": 1730 },
    { "epoch": 0.32633158289572395, "grad_norm": 1.9172645807266235, "learning_rate": 2.0602064220183486e-05, "loss": 0.564, "step": 1740 },
    { "epoch": 0.32820705176294074, "grad_norm": 1.9964970350265503, "learning_rate": 2.0544724770642202e-05, "loss": 0.5688, "step": 1750 },
    { "epoch": 0.3300825206301575, "grad_norm": 2.0303592681884766, "learning_rate": 2.048738532110092e-05, "loss": 0.6081, "step": 1760 },
    { "epoch": 0.3319579894973743, "grad_norm": 2.4410409927368164, "learning_rate": 2.0430045871559632e-05, "loss": 0.542, "step": 1770 },
    { "epoch": 0.33383345836459116, "grad_norm": 1.7117453813552856, "learning_rate": 2.037270642201835e-05, "loss": 0.4778, "step": 1780 },
    { "epoch": 0.33570892723180795, "grad_norm": 1.5781958103179932, "learning_rate": 2.0315366972477065e-05, "loss": 0.5451, "step": 1790 },
    { "epoch": 0.33758439609902474, "grad_norm": 1.601178526878357, "learning_rate": 2.0258027522935782e-05, "loss": 0.5371, "step": 1800 },
    { "epoch": 0.33758439609902474, "eval_loss": 0.667210042476654, "eval_runtime": 5.5955, "eval_samples_per_second": 21.446, "eval_steps_per_second": 2.681, "step": 1800 },
    { "epoch": 0.3394598649662416, "grad_norm": 1.520401954650879, "learning_rate": 2.0200688073394495e-05, "loss": 0.7463, "step": 1810 },
    { "epoch": 0.3413353338334584, "grad_norm": 1.5495413541793823, "learning_rate": 2.0143348623853212e-05, "loss": 0.5746, "step": 1820 },
    { "epoch": 0.34321080270067517, "grad_norm": 1.656015157699585, "learning_rate": 2.0086009174311925e-05, "loss": 0.5587, "step": 1830 },
    { "epoch": 0.34508627156789196, "grad_norm": 1.7179194688796997, "learning_rate": 2.0028669724770645e-05, "loss": 0.6316, "step": 1840 },
    { "epoch": 0.3469617404351088, "grad_norm": 2.026876926422119, "learning_rate": 1.9971330275229358e-05, "loss": 0.5859, "step": 1850 },
    { "epoch": 0.3488372093023256, "grad_norm": 1.675175428390503, "learning_rate": 1.9913990825688075e-05, "loss": 0.5499, "step": 1860 },
    { "epoch": 0.3507126781695424, "grad_norm": 1.3794666528701782, "learning_rate": 1.9856651376146788e-05, "loss": 0.651, "step": 1870 },
    { "epoch": 0.35258814703675917, "grad_norm": 1.6561700105667114, "learning_rate": 1.9799311926605508e-05, "loss": 0.52, "step": 1880 },
    { "epoch": 0.354463615903976, "grad_norm": 1.9196125268936157, "learning_rate": 1.974197247706422e-05, "loss": 0.5643, "step": 1890 },
    { "epoch": 0.3563390847711928, "grad_norm": 2.157627820968628, "learning_rate": 1.9684633027522934e-05, "loss": 0.5726, "step": 1900 },
    { "epoch": 0.3582145536384096, "grad_norm": 1.8069156408309937, "learning_rate": 1.962729357798165e-05, "loss": 0.6398, "step": 1910 },
    { "epoch": 0.3600900225056264, "grad_norm": 1.7318720817565918, "learning_rate": 1.9569954128440368e-05, "loss": 0.4835, "step": 1920 },
    { "epoch": 0.36196549137284323, "grad_norm": 2.1636054515838623, "learning_rate": 1.9512614678899084e-05, "loss": 0.587, "step": 1930 },
    { "epoch": 0.36384096024006, "grad_norm": 2.062150478363037, "learning_rate": 1.9455275229357797e-05, "loss": 0.763, "step": 1940 },
    { "epoch": 0.3657164291072768, "grad_norm": 1.6775376796722412, "learning_rate": 1.9397935779816514e-05, "loss": 0.6575, "step": 1950 },
    { "epoch": 0.3675918979744936, "grad_norm": 1.5422090291976929, "learning_rate": 1.9340596330275227e-05, "loss": 0.6128, "step": 1960 },
    { "epoch": 0.36946736684171044, "grad_norm": 1.7209275960922241, "learning_rate": 1.9283256880733947e-05, "loss": 0.5796, "step": 1970 },
    { "epoch": 0.37134283570892723, "grad_norm": 1.5626654624938965, "learning_rate": 1.922591743119266e-05, "loss": 0.5237, "step": 1980 },
    { "epoch": 0.373218304576144, "grad_norm": 1.6950414180755615, "learning_rate": 1.9168577981651377e-05, "loss": 0.5983, "step": 1990 },
    { "epoch": 0.37509377344336087, "grad_norm": 1.5081120729446411, "learning_rate": 1.911123853211009e-05, "loss": 0.5603, "step": 2000 },
    { "epoch": 0.37509377344336087, "eval_loss": 0.6599423885345459, "eval_runtime": 5.5471, "eval_samples_per_second": 21.633, "eval_steps_per_second": 2.704, "step": 2000 },
    { "epoch": 0.37696924231057766, "grad_norm": 1.7430557012557983, "learning_rate": 1.905389908256881e-05, "loss": 0.5582, "step": 2010 },
    { "epoch": 0.37884471117779445, "grad_norm": 1.8989301919937134, "learning_rate": 1.8996559633027523e-05, "loss": 0.6411, "step": 2020 },
    { "epoch": 0.38072018004501124, "grad_norm": 1.9164332151412964, "learning_rate": 1.893922018348624e-05, "loss": 0.5733, "step": 2030 },
    { "epoch": 0.3825956489122281, "grad_norm": 1.9230120182037354, "learning_rate": 1.8881880733944953e-05, "loss": 0.6592, "step": 2040 },
    { "epoch": 0.38447111777944487, "grad_norm": 1.9948559999465942, "learning_rate": 1.882454128440367e-05, "loss": 0.5918, "step": 2050 },
    { "epoch": 0.38634658664666166, "grad_norm": 1.8086504936218262, "learning_rate": 1.8767201834862386e-05, "loss": 0.5939, "step": 2060 },
    { "epoch": 0.38822205551387845, "grad_norm": 1.715736985206604, "learning_rate": 1.8709862385321103e-05, "loss": 0.6624, "step": 2070 },
    { "epoch": 0.3900975243810953, "grad_norm": 2.9393413066864014, "learning_rate": 1.8652522935779816e-05, "loss": 0.5954, "step": 2080 },
    { "epoch": 0.3919729932483121, "grad_norm": 2.3764209747314453, "learning_rate": 1.8595183486238533e-05, "loss": 0.6181, "step": 2090 },
    { "epoch": 0.3938484621155289, "grad_norm": 1.7462408542633057, "learning_rate": 1.853784403669725e-05, "loss": 0.4992, "step": 2100 },
    { "epoch": 0.39572393098274566, "grad_norm": 2.006526470184326, "learning_rate": 1.8480504587155966e-05, "loss": 0.5331, "step": 2110 },
    { "epoch": 0.3975993998499625, "grad_norm": 2.453961133956909, "learning_rate": 1.842316513761468e-05, "loss": 0.5828, "step": 2120 },
    { "epoch": 0.3994748687171793, "grad_norm": 1.9606050252914429, "learning_rate": 1.8365825688073396e-05, "loss": 0.596, "step": 2130 },
    { "epoch": 0.4013503375843961, "grad_norm": 1.776755690574646, "learning_rate": 1.830848623853211e-05, "loss": 0.6688, "step": 2140 },
    { "epoch": 0.4032258064516129, "grad_norm": 1.6970465183258057, "learning_rate": 1.8251146788990826e-05, "loss": 0.6403, "step": 2150 },
    { "epoch": 0.4051012753188297, "grad_norm": 2.1834471225738525, "learning_rate": 1.8193807339449542e-05, "loss": 0.6169, "step": 2160 },
    { "epoch": 0.4069767441860465, "grad_norm": 1.4596108198165894, "learning_rate": 1.8136467889908255e-05, "loss": 0.5783, "step": 2170 },
    { "epoch": 0.4088522130532633, "grad_norm": 1.808875560760498, "learning_rate": 1.8079128440366972e-05, "loss": 0.6834, "step": 2180 },
    { "epoch": 0.41072768192048015, "grad_norm": 2.0414464473724365, "learning_rate": 1.802178899082569e-05, "loss": 0.5546, "step": 2190 },
    { "epoch": 0.41260315078769694, "grad_norm": 1.7231241464614868, "learning_rate": 1.7964449541284405e-05, "loss": 0.5875, "step": 2200 },
    { "epoch": 0.41260315078769694, "eval_loss": 0.6564481258392334, "eval_runtime": 5.5564, "eval_samples_per_second": 21.597, "eval_steps_per_second": 2.7, "step": 2200 },
    { "epoch": 0.4144786196549137, "grad_norm": 1.5646345615386963, "learning_rate": 1.790711009174312e-05, "loss": 0.4631, "step": 2210 },
    { "epoch": 0.4163540885221305, "grad_norm": 2.1417741775512695, "learning_rate": 1.7849770642201835e-05, "loss": 0.5978, "step": 2220 },
    { "epoch": 0.41822955738934736, "grad_norm": 1.5909672975540161, "learning_rate": 1.7792431192660548e-05, "loss": 0.6276, "step": 2230 },
    { "epoch": 0.42010502625656415, "grad_norm": 1.5815021991729736, "learning_rate": 1.7735091743119268e-05, "loss": 0.5655, "step": 2240 },
    { "epoch": 0.42198049512378094, "grad_norm": 2.173349618911743, "learning_rate": 1.767775229357798e-05, "loss": 0.5172, "step": 2250 },
    { "epoch": 0.42385596399099773, "grad_norm": 1.611697793006897, "learning_rate": 1.7620412844036698e-05, "loss": 0.5828, "step": 2260 },
    { "epoch": 0.4257314328582146, "grad_norm": 2.148935556411743, "learning_rate": 1.756307339449541e-05, "loss": 0.5796, "step": 2270 },
    { "epoch": 0.42760690172543137, "grad_norm": 2.8221611976623535, "learning_rate": 1.750573394495413e-05, "loss": 0.6098, "step": 2280 },
    { "epoch": 0.42948237059264815, "grad_norm": 1.8515477180480957, "learning_rate": 1.7448394495412844e-05, "loss": 0.6519, "step": 2290 },
    { "epoch": 0.43135783945986494, "grad_norm": 1.9033889770507812, "learning_rate": 1.739105504587156e-05, "loss": 0.5771, "step": 2300 },
    { "epoch": 0.4332333083270818, "grad_norm": 2.1629979610443115, "learning_rate": 1.7333715596330274e-05, "loss": 0.5308, "step": 2310 },
    { "epoch": 0.4351087771942986, "grad_norm": 1.713036060333252, "learning_rate": 1.727637614678899e-05, "loss": 0.6036, "step": 2320 },
    { "epoch": 0.43698424606151537, "grad_norm": 1.626887559890747, "learning_rate": 1.7219036697247707e-05, "loss": 0.5932, "step": 2330 },
    { "epoch": 0.43885971492873216, "grad_norm": 2.026658535003662, "learning_rate": 1.7161697247706424e-05, "loss": 0.509, "step": 2340 },
    { "epoch": 0.440735183795949, "grad_norm": 1.617053508758545, "learning_rate": 1.7104357798165137e-05, "loss": 0.5841, "step": 2350 },
    { "epoch": 0.4426106526631658, "grad_norm": 1.8023245334625244, "learning_rate": 1.7047018348623854e-05, "loss": 0.5244, "step": 2360 },
    { "epoch": 0.4444861215303826, "grad_norm": 2.0502309799194336, "learning_rate": 1.698967889908257e-05, "loss": 0.5936, "step": 2370 },
    { "epoch": 0.4463615903975994, "grad_norm": 2.410144567489624, "learning_rate": 1.6932339449541287e-05, "loss": 0.6206, "step": 2380 },
    { "epoch": 0.4482370592648162, "grad_norm": 2.0925815105438232, "learning_rate": 1.6875e-05, "loss": 0.5086, "step": 2390 },
    { "epoch": 0.450112528132033, "grad_norm": 1.8199101686477661, "learning_rate": 1.6817660550458713e-05, "loss": 0.583, "step": 2400 },
    { "epoch": 0.450112528132033, "eval_loss": 0.6492409110069275, "eval_runtime": 5.5179, "eval_samples_per_second": 21.747, "eval_steps_per_second": 2.718, "step": 2400 },
    { "epoch": 0.4519879969992498, "grad_norm": 1.7940239906311035, "learning_rate": 1.6760321100917433e-05, "loss": 0.6154, "step": 2410 },
    { "epoch": 0.45386346586646664, "grad_norm": 2.281325340270996, "learning_rate": 1.6702981651376147e-05, "loss": 0.5542, "step": 2420 },
    { "epoch": 0.45573893473368343, "grad_norm": 1.8717613220214844, "learning_rate": 1.6645642201834863e-05, "loss": 0.5242, "step": 2430 },
    { "epoch": 0.4576144036009002, "grad_norm": 2.2120072841644287, "learning_rate": 1.6588302752293576e-05, "loss": 0.63, "step": 2440 },
    { "epoch": 0.459489872468117, "grad_norm": 2.10752272605896, "learning_rate": 1.6530963302752293e-05, "loss": 0.5712, "step": 2450 },
    { "epoch": 0.46136534133533386, "grad_norm": 2.3129327297210693, "learning_rate": 1.647362385321101e-05, "loss": 0.7044, "step": 2460 },
    { "epoch": 0.46324081020255065, "grad_norm": 1.424224853515625, "learning_rate": 1.6416284403669726e-05, "loss": 0.5588, "step": 2470 },
    { "epoch": 0.46511627906976744, "grad_norm": 1.6627572774887085, "learning_rate": 1.635894495412844e-05, "loss": 0.4543, "step": 2480 },
    { "epoch": 0.4669917479369842, "grad_norm": 1.6522067785263062, "learning_rate": 1.6301605504587156e-05, "loss": 0.6003, "step": 2490 },
    { "epoch": 0.46886721680420107, "grad_norm": 2.2070651054382324, "learning_rate": 1.6244266055045873e-05, "loss": 0.6294, "step": 2500 },
    { "epoch": 0.47074268567141786, "grad_norm": 2.1523821353912354, "learning_rate": 1.618692660550459e-05, "loss": 0.6067, "step": 2510 },
    { "epoch": 0.47261815453863465, "grad_norm": 2.468892812728882, "learning_rate": 1.6129587155963302e-05, "loss": 0.6267, "step": 2520 },
    { "epoch": 0.47449362340585144, "grad_norm": 1.9735854864120483, "learning_rate": 1.607224770642202e-05, "loss": 0.6124, "step": 2530 },
    { "epoch": 0.4763690922730683, "grad_norm": 1.7900265455245972, "learning_rate": 1.6014908256880732e-05, "loss": 0.5845, "step": 2540 },
    { "epoch": 0.4782445611402851, "grad_norm": 2.2069602012634277, "learning_rate": 1.5957568807339452e-05, "loss": 0.6071, "step": 2550 },
    { "epoch": 0.48012003000750186, "grad_norm": 2.3752589225769043, "learning_rate": 1.5900229357798165e-05, "loss": 0.6074, "step": 2560 },
    { "epoch": 0.48199549887471865, "grad_norm": 1.8114852905273438, "learning_rate": 1.5842889908256882e-05, "loss": 0.5358, "step": 2570 },
    { "epoch": 0.4838709677419355, "grad_norm": 1.6503331661224365, "learning_rate": 1.5785550458715595e-05, "loss": 0.6664, "step": 2580 },
    { "epoch": 0.4857464366091523, "grad_norm": 1.7421520948410034, "learning_rate": 1.5728211009174315e-05, "loss": 0.6137, "step": 2590 },
    { "epoch": 0.4876219054763691, "grad_norm": 1.865038275718689, "learning_rate": 1.567087155963303e-05, "loss": 0.6328, "step": 2600 },
    { "epoch": 0.4876219054763691, "eval_loss": 0.6427852511405945, "eval_runtime": 5.6723, "eval_samples_per_second": 21.155, "eval_steps_per_second": 2.644, "step": 2600 },
    { "epoch": 0.4894973743435859, "grad_norm": 2.0670528411865234, "learning_rate": 1.5613532110091745e-05, "loss": 0.5686, "step": 2610 },
    { "epoch": 0.4913728432108027, "grad_norm": 2.00549054145813, "learning_rate": 1.5556192660550458e-05, "loss": 0.5701, "step": 2620 },
    { "epoch": 0.4932483120780195, "grad_norm": 2.3382251262664795, "learning_rate": 1.5498853211009175e-05, "loss": 0.6212, "step": 2630 },
    { "epoch": 0.4951237809452363, "grad_norm": 1.849523901939392, "learning_rate": 1.544151376146789e-05, "loss": 0.5844, "step": 2640 },
    { "epoch": 0.49699924981245314, "grad_norm": 2.0589709281921387, "learning_rate": 1.5384174311926605e-05, "loss": 0.5736, "step": 2650 },
    { "epoch": 0.4988747186796699, "grad_norm": 2.3713736534118652, "learning_rate": 1.532683486238532e-05, "loss": 0.5543, "step": 2660 },
    { "epoch": 0.5007501875468867, "grad_norm": 1.4133175611495972, "learning_rate": 1.5269495412844034e-05, "loss": 0.5916, "step": 2670 },
    { "epoch": 0.5026256564141035, "grad_norm": 1.828869104385376, "learning_rate": 1.5212155963302753e-05, "loss": 0.6424, "step": 2680 },
    { "epoch": 0.5045011252813203, "grad_norm": 1.8340333700180054, "learning_rate": 1.5154816513761468e-05, "loss": 0.5335, "step": 2690 },
    { "epoch": 0.5063765941485371, "grad_norm": 2.287064790725708, "learning_rate": 1.5097477064220184e-05, "loss": 0.5604, "step": 2700 },
    { "epoch": 0.508252063015754, "grad_norm": 2.0678904056549072, "learning_rate": 1.5040137614678897e-05, "loss": 0.6822, "step": 2710 },
    { "epoch": 0.5101275318829708, "grad_norm": 1.848810076713562, "learning_rate": 1.4982798165137616e-05, "loss": 0.5741, "step": 2720 },
    { "epoch": 0.5120030007501876, "grad_norm": 1.8436052799224854, "learning_rate": 1.492545871559633e-05, "loss": 0.592, "step": 2730 },
    { "epoch": 0.5138784696174044, "grad_norm": 1.8554112911224365, "learning_rate": 1.4868119266055047e-05, "loss": 0.5372, "step": 2740 },
    { "epoch": 0.5157539384846211, "grad_norm": 1.7678755521774292, "learning_rate": 1.4810779816513762e-05, "loss": 0.5748, "step": 2750 },
    { "epoch": 0.5176294073518379, "grad_norm": 1.71146821975708, "learning_rate": 1.4753440366972479e-05, "loss": 0.6795, "step": 2760 },
    { "epoch": 0.5195048762190547, "grad_norm": 1.6599249839782715, "learning_rate": 1.4696100917431192e-05, "loss": 0.5559, "step": 2770 },
    { "epoch": 0.5213803450862715, "grad_norm": 2.273698568344116, "learning_rate": 1.4638761467889908e-05, "loss": 0.4864, "step": 2780 },
    { "epoch": 0.5232558139534884, "grad_norm": 2.400425434112549, "learning_rate": 1.4581422018348623e-05, "loss": 0.6152, "step": 2790 },
    { "epoch": 0.5251312828207052, "grad_norm": 2.1009607315063477, "learning_rate": 1.452408256880734e-05, "loss": 0.5518, "step": 2800 },
    { "epoch": 0.5251312828207052, "eval_loss": 0.6440523266792297, "eval_runtime": 5.5215, "eval_samples_per_second": 21.733, "eval_steps_per_second": 2.717, "step": 2800 },
    { "epoch": 0.527006751687922, "grad_norm": 1.724177360534668, "learning_rate": 1.4466743119266055e-05, "loss": 0.5635, "step": 2810 },
    { "epoch": 0.5288822205551388, "grad_norm": 1.6806992292404175, "learning_rate": 1.440940366972477e-05, "loss": 0.6072, "step": 2820 },
    { "epoch": 0.5307576894223556, "grad_norm": 1.914863109588623, "learning_rate": 1.4352064220183486e-05, "loss": 0.5919, "step": 2830 },
    { "epoch": 0.5326331582895724, "grad_norm": 1.9246379137039185, "learning_rate": 1.4294724770642201e-05, "loss": 0.6282, "step": 2840 },
    { "epoch": 0.5345086271567892, "grad_norm": 2.0513482093811035, "learning_rate": 1.4237385321100918e-05, "loss": 0.5117, "step": 2850 },
    { "epoch": 0.536384096024006, "grad_norm": 2.160053253173828, "learning_rate": 1.4180045871559633e-05, "loss": 0.5916, "step": 2860 },
    { "epoch": 0.5382595648912228, "grad_norm": 1.3989676237106323, "learning_rate": 1.412270642201835e-05, "loss": 0.6169, "step": 2870 },
    { "epoch": 0.5401350337584396, "grad_norm": 1.9387221336364746, "learning_rate": 1.4065366972477064e-05, "loss": 0.6374, "step": 2880 },
    { "epoch": 0.5420105026256564, "grad_norm": 2.054593563079834, "learning_rate": 1.4008027522935781e-05, "loss": 0.5947, "step": 2890 },
    { "epoch": 0.5438859714928732, "grad_norm": 1.8106393814086914, "learning_rate": 1.3950688073394496e-05, "loss": 0.6232, "step": 2900 },
    { "epoch": 0.54576144036009, "grad_norm": 2.042513132095337, "learning_rate": 1.389334862385321e-05, "loss": 0.481, "step": 2910 },
    { "epoch": 0.5476369092273068, "grad_norm": 1.6872574090957642, "learning_rate": 1.3836009174311927e-05, "loss": 0.5128, "step": 2920 },
    { "epoch": 0.5495123780945236, "grad_norm": 1.8918819427490234, "learning_rate": 1.3778669724770642e-05, "loss": 0.6205, "step": 2930 },
    { "epoch": 0.5513878469617405, "grad_norm": 2.6372804641723633, "learning_rate": 1.3721330275229359e-05, "loss": 0.4981, "step": 2940 },
    { "epoch": 0.5532633158289573, "grad_norm": 1.8915632963180542, "learning_rate": 1.3663990825688074e-05, "loss": 0.5481, "step": 2950 },
    { "epoch": 0.5551387846961741, "grad_norm": 2.0230934619903564, "learning_rate": 1.360665137614679e-05, "loss": 0.5806, "step": 2960 },
    { "epoch": 0.5570142535633908, "grad_norm": 2.1508560180664062, "learning_rate": 1.3549311926605505e-05, "loss": 0.6057, "step": 2970 },
    { "epoch": 0.5588897224306076, "grad_norm": 1.7368062734603882, "learning_rate": 1.3491972477064222e-05, "loss": 0.5743, "step": 2980 },
    { "epoch": 0.5607651912978244, "grad_norm": 1.9738160371780396, "learning_rate": 1.3434633027522937e-05, "loss": 0.6063, "step": 2990 },
    { "epoch": 0.5626406601650412, "grad_norm": 1.9070963859558105, "learning_rate": 1.3377293577981652e-05, "loss": 0.5965, "step": 3000 },
    { "epoch": 0.5626406601650412, "eval_loss": 0.637257993221283, "eval_runtime": 5.3838, "eval_samples_per_second": 22.289, "eval_steps_per_second": 2.786, "step": 3000 },
    { "epoch": 0.5645161290322581, "grad_norm": 1.9798016548156738, "learning_rate": 1.3319954128440368e-05, "loss": 0.5758, "step": 3010 },
    { "epoch": 0.5663915978994749, "grad_norm": 1.68988037109375, "learning_rate": 1.3262614678899081e-05, "loss": 0.5435, "step": 3020 },
    { "epoch": 0.5682670667666917, "grad_norm": 1.9612882137298584, "learning_rate": 1.3205275229357798e-05, "loss": 0.7064, "step": 3030 },
    { "epoch": 0.5701425356339085, "grad_norm": 1.9069509506225586, "learning_rate": 1.3147935779816513e-05, "loss": 0.6531, "step": 3040 },
    { "epoch": 0.5720180045011253, "grad_norm": 2.185046434402466, "learning_rate": 1.309059633027523e-05, "loss": 0.548, "step": 3050 },
    { "epoch": 0.5738934733683421, "grad_norm": 1.6375807523727417, "learning_rate": 1.3033256880733944e-05, "loss": 0.5555, "step": 3060 },
    { "epoch": 0.5757689422355589, "grad_norm": 2.4809699058532715, "learning_rate": 1.2975917431192661e-05, "loss": 0.5071, "step": 3070 },
    { "epoch": 0.5776444111027756, "grad_norm": 2.071410894393921, "learning_rate": 1.2918577981651376e-05, "loss": 0.6192, "step": 3080 },
    { "epoch": 0.5795198799699925, "grad_norm": 1.9961457252502441, "learning_rate": 1.2861238532110092e-05, "loss": 0.6463, "step": 3090 },
    { "epoch": 0.5813953488372093, "grad_norm": 1.7288352251052856, "learning_rate": 1.2803899082568807e-05, "loss": 0.5121, "step": 3100 },
    { "epoch": 0.5832708177044261, "grad_norm": 2.855468988418579, "learning_rate": 1.2746559633027522e-05, "loss": 0.5797, "step": 3110 },
    { "epoch": 0.5851462865716429, "grad_norm": 2.2987215518951416, "learning_rate": 1.2689220183486239e-05, "loss": 0.5607, "step": 3120 },
    { "epoch": 0.5870217554388597, "grad_norm": 1.4077903032302856, "learning_rate": 1.2631880733944954e-05, "loss": 0.6127, "step": 3130 },
    { "epoch": 0.5888972243060765, "grad_norm": 2.1426985263824463, "learning_rate": 1.257454128440367e-05, "loss": 0.5774, "step": 3140 },
    { "epoch": 0.5907726931732933, "grad_norm": 1.681693196296692, "learning_rate": 1.2517201834862385e-05, "loss": 0.5311, "step": 3150 },
    { "epoch": 0.5926481620405101, "grad_norm": 2.1285390853881836, "learning_rate": 1.2459862385321102e-05, "loss": 0.7334, "step": 3160 },
    { "epoch": 0.594523630907727, "grad_norm": 1.7066893577575684, "learning_rate": 1.2402522935779817e-05, "loss": 0.4741, "step": 3170 },
    { "epoch": 0.5963990997749438, "grad_norm": 2.3069071769714355, "learning_rate": 1.2345183486238533e-05, "loss": 0.6068, "step": 3180 },
    { "epoch": 0.5982745686421606, "grad_norm": 1.898915410041809, "learning_rate": 1.2287844036697248e-05, "loss": 0.4881, "step": 3190 },
    { "epoch": 0.6001500375093773, "grad_norm": 1.9187260866165161, "learning_rate": 1.2230504587155963e-05, "loss": 0.5603, "step": 3200 },
    { "epoch": 0.6001500375093773, "eval_loss": 0.6345093250274658, "eval_runtime": 5.5367, "eval_samples_per_second": 21.674, "eval_steps_per_second": 2.709, "step": 3200 },
    { "epoch": 0.6020255063765941, "grad_norm": 1.7056176662445068, "learning_rate": 1.217316513761468e-05, "loss": 0.4889, "step": 3210 },
    { "epoch": 0.6039009752438109, "grad_norm": 1.7351319789886475, "learning_rate": 1.2115825688073395e-05, "loss": 0.5233, "step": 3220 },
    { "epoch": 0.6057764441110277, "grad_norm": 3.0656421184539795, "learning_rate": 1.2058486238532111e-05, "loss": 0.5427, "step": 3230 },
    { "epoch": 0.6076519129782446, "grad_norm": 2.4634621143341064, "learning_rate": 1.2001146788990826e-05, "loss": 0.5901, "step": 3240 },
    { "epoch": 0.6095273818454614, "grad_norm": 1.7477375268936157, "learning_rate": 1.1943807339449543e-05, "loss": 0.6393, "step": 3250 },
    { "epoch": 0.6114028507126782, "grad_norm": 2.034407377243042, "learning_rate": 1.1886467889908258e-05, "loss": 0.4688, "step": 3260 },
    { "epoch": 0.613278319579895, "grad_norm": 1.604066014289856, "learning_rate": 1.1829128440366974e-05, "loss": 0.5928, "step": 3270 },
    { "epoch": 0.6151537884471118, "grad_norm": 1.9047834873199463, "learning_rate": 1.1771788990825687e-05, "loss": 0.512, "step": 3280 },
    { "epoch": 0.6170292573143286, "grad_norm": 2.166414737701416, "learning_rate": 1.1714449541284404e-05, "loss": 0.6807, "step": 3290 },
    { "epoch": 0.6189047261815454, "grad_norm": 2.463648796081543, "learning_rate": 1.1657110091743119e-05, "loss": 0.7143, "step": 3300 },
    { "epoch": 0.6207801950487621, "grad_norm": 1.8840951919555664, "learning_rate": 1.1599770642201834e-05, "loss": 0.6137, "step": 3310 },
    { "epoch": 0.622655663915979, "grad_norm": 2.49739408493042, "learning_rate": 1.154243119266055e-05, "loss": 0.6415, "step": 3320 },
    { "epoch": 0.6245311327831958, "grad_norm": 2.0638840198516846, "learning_rate": 1.1485091743119265e-05, "loss": 0.5118, "step": 3330 },
    { "epoch": 0.6264066016504126, "grad_norm": 2.0733895301818848, "learning_rate": 1.1427752293577982e-05, "loss": 0.6573, "step": 3340 },
    { "epoch": 0.6282820705176294, "grad_norm": 2.006185293197632, "learning_rate": 1.1370412844036697e-05, "loss": 0.4634, "step": 3350 },
    { "epoch": 0.6301575393848462, "grad_norm": 2.2666101455688477, "learning_rate": 1.1313073394495413e-05, "loss": 0.6536, "step": 3360 },
    { "epoch": 0.632033008252063, "grad_norm": 2.7148234844207764, "learning_rate": 1.1255733944954128e-05, "loss": 0.7032, "step": 3370 },
    { "epoch": 0.6339084771192798, "grad_norm": 1.6289362907409668, "learning_rate": 1.1198394495412845e-05, "loss": 0.5175, "step": 3380 },
    { "epoch": 0.6357839459864967, "grad_norm": 2.742385149002075, "learning_rate": 1.114105504587156e-05, "loss": 0.582, "step": 3390 },
    { "epoch": 0.6376594148537135, "grad_norm": 2.092541217803955, "learning_rate": 1.1083715596330275e-05, "loss": 0.6501, "step": 3400 },
    { "epoch": 0.6376594148537135, "eval_loss": 0.631032407283783, "eval_runtime": 5.5482, "eval_samples_per_second": 21.629, "eval_steps_per_second": 2.704, "step": 3400 },
|
{ |
|
"epoch": 0.6395348837209303, |
|
"grad_norm": 1.8964581489562988, |
|
"learning_rate": 1.1026376146788991e-05, |
|
"loss": 0.5905, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.641410352588147, |
|
"grad_norm": 1.6054551601409912, |
|
"learning_rate": 1.0969036697247706e-05, |
|
"loss": 0.4636, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.6432858214553638, |
|
"grad_norm": 2.0726969242095947, |
|
"learning_rate": 1.0911697247706423e-05, |
|
"loss": 0.6542, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.6451612903225806, |
|
"grad_norm": 3.5420382022857666, |
|
"learning_rate": 1.0854357798165138e-05, |
|
"loss": 0.5573, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.6470367591897974, |
|
"grad_norm": 2.462528705596924, |
|
"learning_rate": 1.0797018348623854e-05, |
|
"loss": 0.5605, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.6489122280570142, |
|
"grad_norm": 2.0307133197784424, |
|
"learning_rate": 1.073967889908257e-05, |
|
"loss": 0.5594, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.6507876969242311, |
|
"grad_norm": 2.2088277339935303, |
|
"learning_rate": 1.0682339449541286e-05, |
|
"loss": 0.6143, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.6526631657914479, |
|
"grad_norm": 1.4962677955627441, |
|
"learning_rate": 1.0625e-05, |
|
"loss": 0.5801, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.6545386346586647, |
|
"grad_norm": 1.796766996383667, |
|
"learning_rate": 1.0567660550458716e-05, |
|
"loss": 0.6032, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.6564141035258815, |
|
"grad_norm": 2.6135787963867188, |
|
"learning_rate": 1.0510321100917432e-05, |
|
"loss": 0.5422, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.6582895723930983, |
|
"grad_norm": 2.0830154418945312, |
|
"learning_rate": 1.0452981651376147e-05, |
|
"loss": 0.5509, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.660165041260315, |
|
"grad_norm": 2.061523675918579, |
|
"learning_rate": 1.0395642201834864e-05, |
|
"loss": 0.5258, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.6620405101275318, |
|
"grad_norm": 1.8006651401519775, |
|
"learning_rate": 1.0338302752293577e-05, |
|
"loss": 0.5546, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.6639159789947486, |
|
"grad_norm": 2.187450647354126, |
|
"learning_rate": 1.0280963302752294e-05, |
|
"loss": 0.598, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.6657914478619655, |
|
"grad_norm": 1.984383463859558, |
|
"learning_rate": 1.0223623853211008e-05, |
|
"loss": 0.5181, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.6676669167291823, |
|
"grad_norm": 2.5804004669189453, |
|
"learning_rate": 1.0166284403669725e-05, |
|
"loss": 0.4769, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.6695423855963991, |
|
"grad_norm": 2.4561312198638916, |
|
"learning_rate": 1.010894495412844e-05, |
|
"loss": 0.5985, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.6714178544636159, |
|
"grad_norm": 2.456256866455078, |
|
"learning_rate": 1.0051605504587157e-05, |
|
"loss": 0.6127, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.6732933233308327, |
|
"grad_norm": 2.1540181636810303, |
|
"learning_rate": 9.994266055045871e-06, |
|
"loss": 0.6621, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.6751687921980495, |
|
"grad_norm": 2.0861988067626953, |
|
"learning_rate": 9.936926605504586e-06, |
|
"loss": 0.5981, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.6751687921980495, |
|
"eval_loss": 0.6278895735740662, |
|
"eval_runtime": 5.5329, |
|
"eval_samples_per_second": 21.688, |
|
"eval_steps_per_second": 2.711, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.6770442610652663, |
|
"grad_norm": 2.0881967544555664, |
|
"learning_rate": 9.879587155963303e-06, |
|
"loss": 0.5829, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.6789197299324832, |
|
"grad_norm": 1.6255912780761719, |
|
"learning_rate": 9.822247706422018e-06, |
|
"loss": 0.5327, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.6807951987997, |
|
"grad_norm": 1.970249891281128, |
|
"learning_rate": 9.764908256880734e-06, |
|
"loss": 0.5841, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.6826706676669168, |
|
"grad_norm": 2.4903528690338135, |
|
"learning_rate": 9.70756880733945e-06, |
|
"loss": 0.6067, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.6845461365341335, |
|
"grad_norm": 1.9478775262832642, |
|
"learning_rate": 9.650229357798166e-06, |
|
"loss": 0.5565, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.6864216054013503, |
|
"grad_norm": 1.8559181690216064, |
|
"learning_rate": 9.592889908256881e-06, |
|
"loss": 0.5934, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.6882970742685671, |
|
"grad_norm": 1.7717185020446777, |
|
"learning_rate": 9.535550458715597e-06, |
|
"loss": 0.5408, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.6901725431357839, |
|
"grad_norm": 2.0449588298797607, |
|
"learning_rate": 9.478211009174312e-06, |
|
"loss": 0.5795, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.6920480120030007, |
|
"grad_norm": 2.3706321716308594, |
|
"learning_rate": 9.420871559633027e-06, |
|
"loss": 0.5183, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.6939234808702176, |
|
"grad_norm": 1.7281607389450073, |
|
"learning_rate": 9.363532110091744e-06, |
|
"loss": 0.494, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.6957989497374344, |
|
"grad_norm": 3.5256292819976807, |
|
"learning_rate": 9.306192660550459e-06, |
|
"loss": 0.5757, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.6976744186046512, |
|
"grad_norm": 1.58797025680542, |
|
"learning_rate": 9.248853211009175e-06, |
|
"loss": 0.5399, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.699549887471868, |
|
"grad_norm": 1.9900200366973877, |
|
"learning_rate": 9.19151376146789e-06, |
|
"loss": 0.6236, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.7014253563390848, |
|
"grad_norm": 1.7843225002288818, |
|
"learning_rate": 9.134174311926607e-06, |
|
"loss": 0.549, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.7033008252063015, |
|
"grad_norm": 1.9925148487091064, |
|
"learning_rate": 9.076834862385322e-06, |
|
"loss": 0.49, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.7051762940735183, |
|
"grad_norm": 2.0657670497894287, |
|
"learning_rate": 9.019495412844038e-06, |
|
"loss": 0.5305, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.7070517629407351, |
|
"grad_norm": 2.2417612075805664, |
|
"learning_rate": 8.962155963302753e-06, |
|
"loss": 0.6312, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.708927231807952, |
|
"grad_norm": 2.196537733078003, |
|
"learning_rate": 8.904816513761468e-06, |
|
"loss": 0.6512, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.7108027006751688, |
|
"grad_norm": 1.830484390258789, |
|
"learning_rate": 8.847477064220183e-06, |
|
"loss": 0.5919, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.7126781695423856, |
|
"grad_norm": 2.0573606491088867, |
|
"learning_rate": 8.790137614678898e-06, |
|
"loss": 0.5749, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.7126781695423856, |
|
"eval_loss": 0.6254046559333801, |
|
"eval_runtime": 5.5087, |
|
"eval_samples_per_second": 21.784, |
|
"eval_steps_per_second": 2.723, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.7145536384096024, |
|
"grad_norm": 1.9237205982208252, |
|
"learning_rate": 8.732798165137615e-06, |
|
"loss": 0.5669, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.7164291072768192, |
|
"grad_norm": 1.9309799671173096, |
|
"learning_rate": 8.67545871559633e-06, |
|
"loss": 0.4759, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.718304576144036, |
|
"grad_norm": 1.7976388931274414, |
|
"learning_rate": 8.618119266055046e-06, |
|
"loss": 0.5962, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.7201800450112528, |
|
"grad_norm": 2.3641951084136963, |
|
"learning_rate": 8.560779816513761e-06, |
|
"loss": 0.627, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.7220555138784697, |
|
"grad_norm": 1.5216801166534424, |
|
"learning_rate": 8.503440366972478e-06, |
|
"loss": 0.6098, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.7239309827456865, |
|
"grad_norm": 1.8570992946624756, |
|
"learning_rate": 8.446100917431192e-06, |
|
"loss": 0.5813, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.7258064516129032, |
|
"grad_norm": 3.1294426918029785, |
|
"learning_rate": 8.388761467889909e-06, |
|
"loss": 0.5876, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.72768192048012, |
|
"grad_norm": 2.678264617919922, |
|
"learning_rate": 8.331422018348624e-06, |
|
"loss": 0.6336, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.7295573893473368, |
|
"grad_norm": 1.5208237171173096, |
|
"learning_rate": 8.274082568807339e-06, |
|
"loss": 0.5342, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.7314328582145536, |
|
"grad_norm": 2.246694326400757, |
|
"learning_rate": 8.216743119266055e-06, |
|
"loss": 0.58, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.7333083270817704, |
|
"grad_norm": 1.5300601720809937, |
|
"learning_rate": 8.15940366972477e-06, |
|
"loss": 0.5734, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.7351837959489872, |
|
"grad_norm": 2.032264471054077, |
|
"learning_rate": 8.102064220183487e-06, |
|
"loss": 0.5499, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.7370592648162041, |
|
"grad_norm": 2.2106308937072754, |
|
"learning_rate": 8.044724770642202e-06, |
|
"loss": 0.5158, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.7389347336834209, |
|
"grad_norm": 1.91170334815979, |
|
"learning_rate": 7.987385321100918e-06, |
|
"loss": 0.6973, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.7408102025506377, |
|
"grad_norm": 1.750429391860962, |
|
"learning_rate": 7.930045871559633e-06, |
|
"loss": 0.5268, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.7426856714178545, |
|
"grad_norm": 3.0469017028808594, |
|
"learning_rate": 7.87270642201835e-06, |
|
"loss": 0.6026, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.7445611402850713, |
|
"grad_norm": 1.8385506868362427, |
|
"learning_rate": 7.815366972477065e-06, |
|
"loss": 0.6316, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.746436609152288, |
|
"grad_norm": 2.0888671875, |
|
"learning_rate": 7.75802752293578e-06, |
|
"loss": 0.6271, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.7483120780195048, |
|
"grad_norm": 2.3192808628082275, |
|
"learning_rate": 7.700688073394496e-06, |
|
"loss": 0.5864, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.7501875468867217, |
|
"grad_norm": 1.7646706104278564, |
|
"learning_rate": 7.643348623853211e-06, |
|
"loss": 0.5462, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.7501875468867217, |
|
"eval_loss": 0.6223539710044861, |
|
"eval_runtime": 5.5777, |
|
"eval_samples_per_second": 21.514, |
|
"eval_steps_per_second": 2.689, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.7520630157539385, |
|
"grad_norm": 2.0803816318511963, |
|
"learning_rate": 7.586009174311928e-06, |
|
"loss": 0.7064, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.7539384846211553, |
|
"grad_norm": 2.42698073387146, |
|
"learning_rate": 7.528669724770644e-06, |
|
"loss": 0.5476, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.7558139534883721, |
|
"grad_norm": 2.320164442062378, |
|
"learning_rate": 7.471330275229358e-06, |
|
"loss": 0.6527, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.7576894223555889, |
|
"grad_norm": 2.235037088394165, |
|
"learning_rate": 7.413990825688073e-06, |
|
"loss": 0.6471, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.7595648912228057, |
|
"grad_norm": 2.3369674682617188, |
|
"learning_rate": 7.356651376146789e-06, |
|
"loss": 0.6455, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.7614403600900225, |
|
"grad_norm": 2.7365052700042725, |
|
"learning_rate": 7.299311926605505e-06, |
|
"loss": 0.6133, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.7633158289572393, |
|
"grad_norm": 1.987430453300476, |
|
"learning_rate": 7.241972477064221e-06, |
|
"loss": 0.5892, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.7651912978244562, |
|
"grad_norm": 2.2822089195251465, |
|
"learning_rate": 7.184633027522936e-06, |
|
"loss": 0.5558, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.767066766691673, |
|
"grad_norm": 2.1317837238311768, |
|
"learning_rate": 7.127293577981651e-06, |
|
"loss": 0.5939, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.7689422355588897, |
|
"grad_norm": 2.5483336448669434, |
|
"learning_rate": 7.069954128440367e-06, |
|
"loss": 0.6171, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.7708177044261065, |
|
"grad_norm": 1.9714287519454956, |
|
"learning_rate": 7.012614678899083e-06, |
|
"loss": 0.6518, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.7726931732933233, |
|
"grad_norm": 1.9111765623092651, |
|
"learning_rate": 6.9552752293577985e-06, |
|
"loss": 0.5786, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.7745686421605401, |
|
"grad_norm": 1.9817109107971191, |
|
"learning_rate": 6.8979357798165134e-06, |
|
"loss": 0.5463, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.7764441110277569, |
|
"grad_norm": 1.834665060043335, |
|
"learning_rate": 6.840596330275229e-06, |
|
"loss": 0.5541, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.7783195798949737, |
|
"grad_norm": 2.018120765686035, |
|
"learning_rate": 6.783256880733945e-06, |
|
"loss": 0.5399, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.7801950487621906, |
|
"grad_norm": 2.5197436809539795, |
|
"learning_rate": 6.725917431192661e-06, |
|
"loss": 0.5581, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.7820705176294074, |
|
"grad_norm": 2.2083163261413574, |
|
"learning_rate": 6.6685779816513764e-06, |
|
"loss": 0.5535, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.7839459864966242, |
|
"grad_norm": 2.2999789714813232, |
|
"learning_rate": 6.611238532110092e-06, |
|
"loss": 0.5212, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.785821455363841, |
|
"grad_norm": 2.2333500385284424, |
|
"learning_rate": 6.553899082568808e-06, |
|
"loss": 0.6867, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.7876969242310577, |
|
"grad_norm": 2.5943992137908936, |
|
"learning_rate": 6.496559633027524e-06, |
|
"loss": 0.4554, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.7876969242310577, |
|
"eval_loss": 0.6219611763954163, |
|
"eval_runtime": 5.5648, |
|
"eval_samples_per_second": 21.564, |
|
"eval_steps_per_second": 2.696, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.7895723930982745, |
|
"grad_norm": 2.9401698112487793, |
|
"learning_rate": 6.4392201834862394e-06, |
|
"loss": 0.6103, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.7914478619654913, |
|
"grad_norm": 2.275641679763794, |
|
"learning_rate": 6.381880733944954e-06, |
|
"loss": 0.5384, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.7933233308327082, |
|
"grad_norm": 1.5987143516540527, |
|
"learning_rate": 6.324541284403669e-06, |
|
"loss": 0.6324, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.795198799699925, |
|
"grad_norm": 1.6601738929748535, |
|
"learning_rate": 6.267201834862385e-06, |
|
"loss": 0.5049, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.7970742685671418, |
|
"grad_norm": 2.5912208557128906, |
|
"learning_rate": 6.209862385321101e-06, |
|
"loss": 0.6013, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.7989497374343586, |
|
"grad_norm": 2.051008701324463, |
|
"learning_rate": 6.1525229357798165e-06, |
|
"loss": 0.6521, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.8008252063015754, |
|
"grad_norm": 2.331805467605591, |
|
"learning_rate": 6.095183486238532e-06, |
|
"loss": 0.5396, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.8027006751687922, |
|
"grad_norm": 2.048785924911499, |
|
"learning_rate": 6.037844036697248e-06, |
|
"loss": 0.5918, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.804576144036009, |
|
"grad_norm": 2.387164354324341, |
|
"learning_rate": 5.980504587155964e-06, |
|
"loss": 0.624, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.8064516129032258, |
|
"grad_norm": 1.7921018600463867, |
|
"learning_rate": 5.9231651376146795e-06, |
|
"loss": 0.5066, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.8083270817704427, |
|
"grad_norm": 1.3692150115966797, |
|
"learning_rate": 5.865825688073395e-06, |
|
"loss": 0.509, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.8102025506376594, |
|
"grad_norm": 1.9718056917190552, |
|
"learning_rate": 5.80848623853211e-06, |
|
"loss": 0.6208, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.8120780195048762, |
|
"grad_norm": 1.9130088090896606, |
|
"learning_rate": 5.751146788990826e-06, |
|
"loss": 0.5508, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.813953488372093, |
|
"grad_norm": 2.5534584522247314, |
|
"learning_rate": 5.693807339449541e-06, |
|
"loss": 0.6473, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.8158289572393098, |
|
"grad_norm": 2.3137259483337402, |
|
"learning_rate": 5.6364678899082565e-06, |
|
"loss": 0.5723, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.8177044261065266, |
|
"grad_norm": 2.2267236709594727, |
|
"learning_rate": 5.579128440366972e-06, |
|
"loss": 0.516, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.8195798949737434, |
|
"grad_norm": 2.8468329906463623, |
|
"learning_rate": 5.521788990825688e-06, |
|
"loss": 0.6196, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.8214553638409603, |
|
"grad_norm": 1.7340741157531738, |
|
"learning_rate": 5.464449541284404e-06, |
|
"loss": 0.5489, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.8233308327081771, |
|
"grad_norm": 1.9742332696914673, |
|
"learning_rate": 5.4071100917431195e-06, |
|
"loss": 0.585, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.8252063015753939, |
|
"grad_norm": 2.408601999282837, |
|
"learning_rate": 5.349770642201835e-06, |
|
"loss": 0.5685, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.8252063015753939, |
|
"eval_loss": 0.6204274296760559, |
|
"eval_runtime": 5.5455, |
|
"eval_samples_per_second": 21.639, |
|
"eval_steps_per_second": 2.705, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.8270817704426107, |
|
"grad_norm": 2.1270008087158203, |
|
"learning_rate": 5.292431192660551e-06, |
|
"loss": 0.5759, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.8289572393098275, |
|
"grad_norm": 2.048781156539917, |
|
"learning_rate": 5.235091743119266e-06, |
|
"loss": 0.5268, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.8308327081770442, |
|
"grad_norm": 1.643114686012268, |
|
"learning_rate": 5.177752293577982e-06, |
|
"loss": 0.5481, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.832708177044261, |
|
"grad_norm": 1.9851353168487549, |
|
"learning_rate": 5.120412844036697e-06, |
|
"loss": 0.6492, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.8345836459114778, |
|
"grad_norm": 2.3454835414886475, |
|
"learning_rate": 5.063073394495413e-06, |
|
"loss": 0.5475, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.8364591147786947, |
|
"grad_norm": 2.1236870288848877, |
|
"learning_rate": 5.005733944954129e-06, |
|
"loss": 0.4889, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.8383345836459115, |
|
"grad_norm": 2.490607738494873, |
|
"learning_rate": 4.948394495412844e-06, |
|
"loss": 0.6648, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.8402100525131283, |
|
"grad_norm": 2.781184434890747, |
|
"learning_rate": 4.8910550458715596e-06, |
|
"loss": 0.6362, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.8420855213803451, |
|
"grad_norm": 1.488677740097046, |
|
"learning_rate": 4.833715596330275e-06, |
|
"loss": 0.6213, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.8439609902475619, |
|
"grad_norm": 1.9841208457946777, |
|
"learning_rate": 4.776376146788991e-06, |
|
"loss": 0.6166, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.8458364591147787, |
|
"grad_norm": 1.4909323453903198, |
|
"learning_rate": 4.719036697247707e-06, |
|
"loss": 0.4612, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.8477119279819955, |
|
"grad_norm": 1.927198886871338, |
|
"learning_rate": 4.661697247706422e-06, |
|
"loss": 0.5697, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.8495873968492123, |
|
"grad_norm": 2.1951193809509277, |
|
"learning_rate": 4.6043577981651375e-06, |
|
"loss": 0.6029, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.8514628657164292, |
|
"grad_norm": 1.6474297046661377, |
|
"learning_rate": 4.547018348623853e-06, |
|
"loss": 0.5997, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.8533383345836459, |
|
"grad_norm": 2.8692142963409424, |
|
"learning_rate": 4.489678899082569e-06, |
|
"loss": 0.5052, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.8552138034508627, |
|
"grad_norm": 2.2251393795013428, |
|
"learning_rate": 4.432339449541285e-06, |
|
"loss": 0.5406, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.8570892723180795, |
|
"grad_norm": 1.9672750234603882, |
|
"learning_rate": 4.3750000000000005e-06, |
|
"loss": 0.6556, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.8589647411852963, |
|
"grad_norm": 1.9112441539764404, |
|
"learning_rate": 4.317660550458716e-06, |
|
"loss": 0.6307, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.8608402100525131, |
|
"grad_norm": 2.0552773475646973, |
|
"learning_rate": 4.260321100917432e-06, |
|
"loss": 0.5682, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.8627156789197299, |
|
"grad_norm": 1.927811622619629, |
|
"learning_rate": 4.202981651376147e-06, |
|
"loss": 0.5006, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.8627156789197299, |
|
"eval_loss": 0.6183449625968933, |
|
"eval_runtime": 5.5214, |
|
"eval_samples_per_second": 21.734, |
|
"eval_steps_per_second": 2.717, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.8645911477869468, |
|
"grad_norm": 2.3974733352661133, |
|
"learning_rate": 4.145642201834863e-06, |
|
"loss": 0.6954, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.8664666166541636, |
|
"grad_norm": 2.214097738265991, |
|
"learning_rate": 4.0883027522935775e-06, |
|
"loss": 0.619, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.8683420855213804, |
|
"grad_norm": 2.094970464706421, |
|
"learning_rate": 4.030963302752293e-06, |
|
"loss": 0.5883, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.8702175543885972, |
|
"grad_norm": 1.908461570739746, |
|
"learning_rate": 3.973623853211009e-06, |
|
"loss": 0.4951, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.872093023255814, |
|
"grad_norm": 2.1103639602661133, |
|
"learning_rate": 3.916284403669725e-06, |
|
"loss": 0.5969, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.8739684921230307, |
|
"grad_norm": 1.8500175476074219, |
|
"learning_rate": 3.8589449541284405e-06, |
|
"loss": 0.5824, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.8758439609902475, |
|
"grad_norm": 2.222599506378174, |
|
"learning_rate": 3.8016055045871563e-06, |
|
"loss": 0.654, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.8777194298574643, |
|
"grad_norm": 2.0447375774383545, |
|
"learning_rate": 3.744266055045872e-06, |
|
"loss": 0.5774, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.8795948987246812, |
|
"grad_norm": 2.4672482013702393, |
|
"learning_rate": 3.686926605504587e-06, |
|
"loss": 0.6135, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.881470367591898, |
|
"grad_norm": 2.1856000423431396, |
|
"learning_rate": 3.6295871559633027e-06, |
|
"loss": 0.5876, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.8833458364591148, |
|
"grad_norm": 2.358637809753418, |
|
"learning_rate": 3.5722477064220184e-06, |
|
"loss": 0.5665, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.8852213053263316, |
|
"grad_norm": 1.8287360668182373, |
|
"learning_rate": 3.514908256880734e-06, |
|
"loss": 0.5067, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.8870967741935484, |
|
"grad_norm": 2.045971155166626, |
|
"learning_rate": 3.45756880733945e-06, |
|
"loss": 0.54, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.8889722430607652, |
|
"grad_norm": 2.5090229511260986, |
|
"learning_rate": 3.4002293577981652e-06, |
|
"loss": 0.6294, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.890847711927982, |
|
"grad_norm": 2.9200639724731445, |
|
"learning_rate": 3.3428899082568806e-06, |
|
"loss": 0.5443, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.8927231807951987, |
|
"grad_norm": 2.0221188068389893, |
|
"learning_rate": 3.2855504587155963e-06, |
|
"loss": 0.628, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.8945986496624156, |
|
"grad_norm": 2.6036345958709717, |
|
"learning_rate": 3.228211009174312e-06, |
|
"loss": 0.6387, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.8964741185296324, |
|
"grad_norm": 3.309267044067383, |
|
"learning_rate": 3.170871559633028e-06, |
|
"loss": 0.5863, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.8983495873968492, |
|
"grad_norm": 3.4704477787017822, |
|
"learning_rate": 3.113532110091743e-06, |
|
"loss": 0.5955, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.900225056264066, |
|
"grad_norm": 2.056976556777954, |
|
"learning_rate": 3.056192660550459e-06, |
|
"loss": 0.5984, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.900225056264066, |
|
"eval_loss": 0.6168529987335205, |
|
"eval_runtime": 5.5639, |
|
"eval_samples_per_second": 21.568, |
|
"eval_steps_per_second": 2.696, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.9021005251312828, |
|
"grad_norm": 2.358440399169922, |
|
"learning_rate": 2.9988532110091746e-06, |
|
"loss": 0.5657, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.9039759939984996, |
|
"grad_norm": 2.124436140060425, |
|
"learning_rate": 2.94151376146789e-06, |
|
"loss": 0.642, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.9058514628657164, |
|
"grad_norm": 1.5845674276351929, |
|
"learning_rate": 2.8841743119266057e-06, |
|
"loss": 0.5054, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.9077269317329333, |
|
"grad_norm": 2.296250820159912, |
|
"learning_rate": 2.8325688073394495e-06, |
|
"loss": 0.6108, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.9096024006001501, |
|
"grad_norm": 1.7618379592895508, |
|
"learning_rate": 2.7752293577981653e-06, |
|
"loss": 0.6623, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.9114778694673669, |
|
"grad_norm": 2.3502273559570312, |
|
"learning_rate": 2.7178899082568806e-06, |
|
"loss": 0.5688, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.9133533383345837, |
|
"grad_norm": 2.141451597213745, |
|
"learning_rate": 2.6605504587155964e-06, |
|
"loss": 0.6076, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.9152288072018004, |
|
"grad_norm": 2.2488343715667725, |
|
"learning_rate": 2.603211009174312e-06, |
|
"loss": 0.6399, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.9171042760690172, |
|
"grad_norm": 2.0450565814971924, |
|
"learning_rate": 2.545871559633028e-06, |
|
"loss": 0.5864, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.918979744936234, |
|
"grad_norm": 2.490226984024048, |
|
"learning_rate": 2.488532110091743e-06, |
|
"loss": 0.5644, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.9208552138034508, |
|
"grad_norm": 2.630089282989502, |
|
"learning_rate": 2.4311926605504585e-06, |
|
"loss": 0.6637, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.9227306826706677, |
|
"grad_norm": 2.2584402561187744, |
|
"learning_rate": 2.3738532110091743e-06, |
|
"loss": 0.5784, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.9246061515378845, |
|
"grad_norm": 2.9330437183380127, |
|
"learning_rate": 2.31651376146789e-06, |
|
"loss": 0.5286, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.9264816204051013, |
|
"grad_norm": 2.6167702674865723, |
|
"learning_rate": 2.2591743119266058e-06, |
|
"loss": 0.5273, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.9283570892723181, |
|
"grad_norm": 2.414607286453247, |
|
"learning_rate": 2.201834862385321e-06, |
|
"loss": 0.5177, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.9302325581395349, |
|
"grad_norm": 2.5905508995056152, |
|
"learning_rate": 2.144495412844037e-06, |
|
"loss": 0.5818, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.9321080270067517, |
|
"grad_norm": 2.9565694332122803, |
|
"learning_rate": 2.087155963302752e-06, |
|
"loss": 0.5307, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 0.9339834958739685, |
|
"grad_norm": 2.3778281211853027, |
|
"learning_rate": 2.029816513761468e-06, |
|
"loss": 0.5477, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.9358589647411854, |
|
"grad_norm": 2.004302978515625, |
|
"learning_rate": 1.9724770642201837e-06, |
|
"loss": 0.55, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 0.9377344336084021, |
|
"grad_norm": 2.098611354827881, |
|
"learning_rate": 1.915137614678899e-06, |
|
"loss": 0.6077, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.9377344336084021, |
|
"eval_loss": 0.6168031096458435, |
|
"eval_runtime": 5.423, |
|
"eval_samples_per_second": 22.128, |
|
"eval_steps_per_second": 2.766, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.9396099024756189, |
|
"grad_norm": 2.273273229598999, |
|
"learning_rate": 1.8577981651376147e-06, |
|
"loss": 0.5192, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 0.9414853713428357, |
|
"grad_norm": 2.40267276763916, |
|
"learning_rate": 1.8004587155963303e-06, |
|
"loss": 0.6307, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 0.9433608402100525, |
|
"grad_norm": 2.205829620361328, |
|
"learning_rate": 1.743119266055046e-06, |
|
"loss": 0.5307, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 0.9452363090772693, |
|
"grad_norm": 2.208779811859131, |
|
"learning_rate": 1.6857798165137616e-06, |
|
"loss": 0.5816, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.9471117779444861, |
|
"grad_norm": 2.550372838973999, |
|
"learning_rate": 1.628440366972477e-06, |
|
"loss": 0.588, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.9489872468117029, |
|
"grad_norm": 2.062358856201172, |
|
"learning_rate": 1.5711009174311926e-06, |
|
"loss": 0.6606, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 0.9508627156789198, |
|
"grad_norm": 2.3175814151763916, |
|
"learning_rate": 1.5137614678899084e-06, |
|
"loss": 0.4878, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 0.9527381845461366, |
|
"grad_norm": 3.7666046619415283, |
|
"learning_rate": 1.456422018348624e-06, |
|
"loss": 0.566, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 0.9546136534133534, |
|
"grad_norm": 2.467745304107666, |
|
"learning_rate": 1.3990825688073395e-06, |
|
"loss": 0.6029, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 0.9564891222805701, |
|
"grad_norm": 1.9065784215927124, |
|
"learning_rate": 1.3417431192660552e-06, |
|
"loss": 0.5577, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.9583645911477869, |
|
"grad_norm": 2.447404623031616, |
|
"learning_rate": 1.2844036697247705e-06, |
|
"loss": 0.513, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 0.9602400600150037, |
|
"grad_norm": 2.8181941509246826, |
|
"learning_rate": 1.2270642201834863e-06, |
|
"loss": 0.4997, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 0.9621155288822205, |
|
"grad_norm": 2.414186954498291, |
|
"learning_rate": 1.169724770642202e-06, |
|
"loss": 0.5174, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 0.9639909977494373, |
|
"grad_norm": 2.9557716846466064, |
|
"learning_rate": 1.1123853211009173e-06, |
|
"loss": 0.5591, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 0.9658664666166542, |
|
"grad_norm": 1.9689189195632935, |
|
"learning_rate": 1.055045871559633e-06, |
|
"loss": 0.5722, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.967741935483871, |
|
"grad_norm": 2.1190686225891113, |
|
"learning_rate": 9.977064220183486e-07, |
|
"loss": 0.558, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.9696174043510878, |
|
"grad_norm": 2.7399091720581055, |
|
"learning_rate": 9.403669724770642e-07, |
|
"loss": 0.5672, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 0.9714928732183046, |
|
"grad_norm": 2.2235541343688965, |
|
"learning_rate": 8.830275229357798e-07, |
|
"loss": 0.5904, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 0.9733683420855214, |
|
"grad_norm": 2.248394727706909, |
|
"learning_rate": 8.256880733944955e-07, |
|
"loss": 0.5505, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 0.9752438109527382, |
|
"grad_norm": 2.1956896781921387, |
|
"learning_rate": 7.68348623853211e-07, |
|
"loss": 0.596, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.9752438109527382, |
|
"eval_loss": 0.6171349287033081, |
|
"eval_runtime": 5.3961, |
|
"eval_samples_per_second": 22.238, |
|
"eval_steps_per_second": 2.78, |
|
"step": 5200 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5332, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.618912552812544e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
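Note on reading this log: among the evaluation records above, eval_loss bottoms out at 0.6168031096458435 at step 5000 and ticks back up to 0.6171349287033081 by step 5200, and since save_steps is 200 a checkpoint was saved at step 5000, making it the best checkpoint by validation loss within the span shown. Below is a minimal sketch of how one might confirm this programmatically, assuming this file is a standard Hugging Face Trainer trainer_state.json; the filename/path in the open() call is a hypothetical placeholder.

# Sketch: locate the best checkpoint in a Trainer state log.
# Assumes a standard trainer_state.json; the path is hypothetical.
import json

with open("trainer_state.json") as f:  # hypothetical path
    state = json.load(f)

# log_history interleaves training records (with "loss") and
# evaluation records (with "eval_loss"); keep only the eval records.
evals = [e for e in state["log_history"] if "eval_loss" in e]
best = min(evals, key=lambda e: e["eval_loss"])

print(f"best eval_loss {best['eval_loss']:.6f} at step {best['step']}")
# For this run the minimum should fall at step 5000 (~0.616803),
# with eval_loss rising slightly again by step 5200.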