{ "best_metric": 0.6168031096458435, "best_model_checkpoint": "/Lora_models/checkpoint-5000", "epoch": 0.9752438109527382, "eval_steps": 200, "global_step": 5200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0018754688672168042, "grad_norm": 0.9346357583999634, "learning_rate": 2.7e-06, "loss": 2.5803, "step": 10 }, { "epoch": 0.0037509377344336083, "grad_norm": 1.251865267753601, "learning_rate": 5.7000000000000005e-06, "loss": 2.4539, "step": 20 }, { "epoch": 0.005626406601650412, "grad_norm": 1.055527687072754, "learning_rate": 8.7e-06, "loss": 2.5107, "step": 30 }, { "epoch": 0.007501875468867217, "grad_norm": 1.1238517761230469, "learning_rate": 1.1700000000000001e-05, "loss": 2.4512, "step": 40 }, { "epoch": 0.009377344336084021, "grad_norm": 1.20820152759552, "learning_rate": 1.47e-05, "loss": 2.4078, "step": 50 }, { "epoch": 0.011252813203300824, "grad_norm": 2.085667133331299, "learning_rate": 1.77e-05, "loss": 2.2828, "step": 60 }, { "epoch": 0.01312828207051763, "grad_norm": 1.357437252998352, "learning_rate": 2.07e-05, "loss": 2.0614, "step": 70 }, { "epoch": 0.015003750937734433, "grad_norm": 1.6979730129241943, "learning_rate": 2.37e-05, "loss": 1.9482, "step": 80 }, { "epoch": 0.01687921980495124, "grad_norm": 1.7537882328033447, "learning_rate": 2.6700000000000002e-05, "loss": 1.775, "step": 90 }, { "epoch": 0.018754688672168042, "grad_norm": 1.8001635074615479, "learning_rate": 2.97e-05, "loss": 1.6462, "step": 100 }, { "epoch": 0.020630157539384845, "grad_norm": 1.744318962097168, "learning_rate": 2.9948394495412847e-05, "loss": 1.3796, "step": 110 }, { "epoch": 0.02250562640660165, "grad_norm": 1.6778439283370972, "learning_rate": 2.989105504587156e-05, "loss": 1.2642, "step": 120 }, { "epoch": 0.024381095273818456, "grad_norm": 2.0165181159973145, "learning_rate": 2.9833715596330273e-05, "loss": 0.9405, "step": 130 }, { "epoch": 0.02625656414103526, "grad_norm": 1.1453020572662354, "learning_rate": 2.9776376146788993e-05, "loss": 0.8365, "step": 140 }, { "epoch": 0.028132033008252063, "grad_norm": 0.9237515330314636, "learning_rate": 2.9719036697247706e-05, "loss": 0.9324, "step": 150 }, { "epoch": 0.030007501875468866, "grad_norm": 1.1040199995040894, "learning_rate": 2.9661697247706423e-05, "loss": 0.8905, "step": 160 }, { "epoch": 0.03188297074268567, "grad_norm": 1.052095651626587, "learning_rate": 2.9604357798165136e-05, "loss": 0.7226, "step": 170 }, { "epoch": 0.03375843960990248, "grad_norm": 1.4509942531585693, "learning_rate": 2.9547018348623853e-05, "loss": 0.7401, "step": 180 }, { "epoch": 0.03563390847711928, "grad_norm": 1.3202496767044067, "learning_rate": 2.948967889908257e-05, "loss": 0.6498, "step": 190 }, { "epoch": 0.037509377344336084, "grad_norm": 1.257535457611084, "learning_rate": 2.9432339449541286e-05, "loss": 0.8931, "step": 200 }, { "epoch": 0.037509377344336084, "eval_loss": 0.8721055388450623, "eval_runtime": 5.4276, "eval_samples_per_second": 22.109, "eval_steps_per_second": 2.764, "step": 200 }, { "epoch": 0.03938484621155289, "grad_norm": 1.1922121047973633, "learning_rate": 2.9375e-05, "loss": 0.9556, "step": 210 }, { "epoch": 0.04126031507876969, "grad_norm": 1.1076873540878296, "learning_rate": 2.9317660550458716e-05, "loss": 0.8148, "step": 220 }, { "epoch": 0.043135783945986494, "grad_norm": 1.3961315155029297, "learning_rate": 2.9260321100917432e-05, "loss": 0.7225, "step": 230 }, { "epoch": 0.0450112528132033, "grad_norm": 1.3916189670562744, "learning_rate": 2.920298165137615e-05, "loss": 0.8899, "step": 240 }, { "epoch": 0.04688672168042011, "grad_norm": 1.4073107242584229, "learning_rate": 2.9145642201834862e-05, "loss": 0.7895, "step": 250 }, { "epoch": 0.04876219054763691, "grad_norm": 1.4054927825927734, "learning_rate": 2.908830275229358e-05, "loss": 0.8662, "step": 260 }, { "epoch": 0.050637659414853715, "grad_norm": 1.0531301498413086, "learning_rate": 2.9030963302752292e-05, "loss": 0.7619, "step": 270 }, { "epoch": 0.05251312828207052, "grad_norm": 1.3712563514709473, "learning_rate": 2.8973623853211012e-05, "loss": 0.8608, "step": 280 }, { "epoch": 0.05438859714928732, "grad_norm": 1.4002102613449097, "learning_rate": 2.8916284403669725e-05, "loss": 0.7368, "step": 290 }, { "epoch": 0.056264066016504126, "grad_norm": 1.381103754043579, "learning_rate": 2.8858944954128442e-05, "loss": 0.8059, "step": 300 }, { "epoch": 0.05813953488372093, "grad_norm": 1.9642342329025269, "learning_rate": 2.8801605504587155e-05, "loss": 0.8149, "step": 310 }, { "epoch": 0.06001500375093773, "grad_norm": 1.4538882970809937, "learning_rate": 2.8744266055045875e-05, "loss": 0.6891, "step": 320 }, { "epoch": 0.061890472618154536, "grad_norm": 1.47075617313385, "learning_rate": 2.8686926605504588e-05, "loss": 0.8458, "step": 330 }, { "epoch": 0.06376594148537135, "grad_norm": 1.1955201625823975, "learning_rate": 2.8629587155963305e-05, "loss": 0.7404, "step": 340 }, { "epoch": 0.06564141035258815, "grad_norm": 1.151567816734314, "learning_rate": 2.8572247706422018e-05, "loss": 0.6032, "step": 350 }, { "epoch": 0.06751687921980495, "grad_norm": 1.3864506483078003, "learning_rate": 2.8514908256880738e-05, "loss": 0.7951, "step": 360 }, { "epoch": 0.06939234808702176, "grad_norm": 1.3162195682525635, "learning_rate": 2.845756880733945e-05, "loss": 0.698, "step": 370 }, { "epoch": 0.07126781695423856, "grad_norm": 1.0740330219268799, "learning_rate": 2.8400229357798164e-05, "loss": 0.7008, "step": 380 }, { "epoch": 0.07314328582145536, "grad_norm": 1.4195873737335205, "learning_rate": 2.834288990825688e-05, "loss": 0.7324, "step": 390 }, { "epoch": 0.07501875468867217, "grad_norm": 1.6425796747207642, "learning_rate": 2.8285550458715594e-05, "loss": 0.6954, "step": 400 }, { "epoch": 0.07501875468867217, "eval_loss": 0.790172815322876, "eval_runtime": 5.5372, "eval_samples_per_second": 21.671, "eval_steps_per_second": 2.709, "step": 400 }, { "epoch": 0.07689422355588897, "grad_norm": 1.4094221591949463, "learning_rate": 2.8228211009174314e-05, "loss": 0.6533, "step": 410 }, { "epoch": 0.07876969242310577, "grad_norm": 1.7052984237670898, "learning_rate": 2.8170871559633027e-05, "loss": 0.8291, "step": 420 }, { "epoch": 0.08064516129032258, "grad_norm": 1.766396164894104, "learning_rate": 2.8113532110091744e-05, "loss": 0.5917, "step": 430 }, { "epoch": 0.08252063015753938, "grad_norm": 1.3280906677246094, "learning_rate": 2.8056192660550457e-05, "loss": 0.5834, "step": 440 }, { "epoch": 0.08439609902475619, "grad_norm": 1.472038984298706, "learning_rate": 2.7998853211009177e-05, "loss": 0.6189, "step": 450 }, { "epoch": 0.08627156789197299, "grad_norm": 2.434629440307617, "learning_rate": 2.794151376146789e-05, "loss": 0.6107, "step": 460 }, { "epoch": 0.08814703675918979, "grad_norm": 1.7748132944107056, "learning_rate": 2.7884174311926607e-05, "loss": 0.508, "step": 470 }, { "epoch": 0.0900225056264066, "grad_norm": 1.7380709648132324, "learning_rate": 2.782683486238532e-05, "loss": 0.6482, "step": 480 }, { "epoch": 0.0918979744936234, "grad_norm": 1.1493253707885742, "learning_rate": 2.7769495412844037e-05, "loss": 0.6531, "step": 490 }, { "epoch": 0.09377344336084022, "grad_norm": 1.384508728981018, "learning_rate": 2.7712155963302753e-05, "loss": 0.7061, "step": 500 }, { "epoch": 0.09564891222805702, "grad_norm": 1.792687177658081, "learning_rate": 2.765481651376147e-05, "loss": 0.6, "step": 510 }, { "epoch": 0.09752438109527382, "grad_norm": 1.657291054725647, "learning_rate": 2.7597477064220183e-05, "loss": 0.612, "step": 520 }, { "epoch": 0.09939984996249063, "grad_norm": 1.2928940057754517, "learning_rate": 2.75401376146789e-05, "loss": 0.7446, "step": 530 }, { "epoch": 0.10127531882970743, "grad_norm": 1.3647221326828003, "learning_rate": 2.7482798165137616e-05, "loss": 0.6422, "step": 540 }, { "epoch": 0.10315078769692423, "grad_norm": 1.7979224920272827, "learning_rate": 2.7425458715596333e-05, "loss": 0.5261, "step": 550 }, { "epoch": 0.10502625656414104, "grad_norm": 1.4330195188522339, "learning_rate": 2.7368119266055046e-05, "loss": 0.6801, "step": 560 }, { "epoch": 0.10690172543135784, "grad_norm": 1.4820642471313477, "learning_rate": 2.7310779816513763e-05, "loss": 0.6767, "step": 570 }, { "epoch": 0.10877719429857464, "grad_norm": 1.6445374488830566, "learning_rate": 2.7253440366972476e-05, "loss": 0.6727, "step": 580 }, { "epoch": 0.11065266316579145, "grad_norm": 1.5297715663909912, "learning_rate": 2.7196100917431196e-05, "loss": 0.5425, "step": 590 }, { "epoch": 0.11252813203300825, "grad_norm": 2.9595024585723877, "learning_rate": 2.713876146788991e-05, "loss": 0.5758, "step": 600 }, { "epoch": 0.11252813203300825, "eval_loss": 0.7191774249076843, "eval_runtime": 5.5393, "eval_samples_per_second": 21.663, "eval_steps_per_second": 2.708, "step": 600 }, { "epoch": 0.11440360090022506, "grad_norm": 1.300517201423645, "learning_rate": 2.7081422018348626e-05, "loss": 0.6598, "step": 610 }, { "epoch": 0.11627906976744186, "grad_norm": 1.4118940830230713, "learning_rate": 2.702408256880734e-05, "loss": 0.7422, "step": 620 }, { "epoch": 0.11815453863465866, "grad_norm": 1.1897056102752686, "learning_rate": 2.6966743119266055e-05, "loss": 0.6301, "step": 630 }, { "epoch": 0.12003000750187547, "grad_norm": 1.3802241086959839, "learning_rate": 2.6909403669724772e-05, "loss": 0.7331, "step": 640 }, { "epoch": 0.12190547636909227, "grad_norm": 1.3182717561721802, "learning_rate": 2.6852064220183485e-05, "loss": 0.6309, "step": 650 }, { "epoch": 0.12378094523630907, "grad_norm": 1.1293184757232666, "learning_rate": 2.6794724770642202e-05, "loss": 0.657, "step": 660 }, { "epoch": 0.12565641410352588, "grad_norm": 1.525172472000122, "learning_rate": 2.6737385321100915e-05, "loss": 0.7025, "step": 670 }, { "epoch": 0.1275318829707427, "grad_norm": 1.1699976921081543, "learning_rate": 2.6680045871559635e-05, "loss": 0.6213, "step": 680 }, { "epoch": 0.12940735183795948, "grad_norm": 1.5956225395202637, "learning_rate": 2.6622706422018348e-05, "loss": 0.6628, "step": 690 }, { "epoch": 0.1312828207051763, "grad_norm": 1.3606040477752686, "learning_rate": 2.6565366972477065e-05, "loss": 0.6042, "step": 700 }, { "epoch": 0.1331582895723931, "grad_norm": 1.310490369796753, "learning_rate": 2.6508027522935778e-05, "loss": 0.6976, "step": 710 }, { "epoch": 0.1350337584396099, "grad_norm": 1.0546784400939941, "learning_rate": 2.6450688073394498e-05, "loss": 0.6681, "step": 720 }, { "epoch": 0.1369092273068267, "grad_norm": 1.977148175239563, "learning_rate": 2.639334862385321e-05, "loss": 0.723, "step": 730 }, { "epoch": 0.13878469617404351, "grad_norm": 1.526698350906372, "learning_rate": 2.6336009174311928e-05, "loss": 0.6595, "step": 740 }, { "epoch": 0.1406601650412603, "grad_norm": 1.302213191986084, "learning_rate": 2.627866972477064e-05, "loss": 0.6826, "step": 750 }, { "epoch": 0.14253563390847712, "grad_norm": 1.4109597206115723, "learning_rate": 2.6221330275229358e-05, "loss": 0.679, "step": 760 }, { "epoch": 0.1444111027756939, "grad_norm": 1.390410304069519, "learning_rate": 2.6163990825688074e-05, "loss": 0.5305, "step": 770 }, { "epoch": 0.14628657164291073, "grad_norm": 1.778104543685913, "learning_rate": 2.610665137614679e-05, "loss": 0.5224, "step": 780 }, { "epoch": 0.14816204051012752, "grad_norm": 1.4633052349090576, "learning_rate": 2.6049311926605504e-05, "loss": 0.6479, "step": 790 }, { "epoch": 0.15003750937734434, "grad_norm": 1.4608770608901978, "learning_rate": 2.599197247706422e-05, "loss": 0.6688, "step": 800 }, { "epoch": 0.15003750937734434, "eval_loss": 0.7000935673713684, "eval_runtime": 5.5538, "eval_samples_per_second": 21.607, "eval_steps_per_second": 2.701, "step": 800 }, { "epoch": 0.15191297824456115, "grad_norm": 1.419257640838623, "learning_rate": 2.5934633027522937e-05, "loss": 0.4972, "step": 810 }, { "epoch": 0.15378844711177794, "grad_norm": 1.383324146270752, "learning_rate": 2.5877293577981654e-05, "loss": 0.6802, "step": 820 }, { "epoch": 0.15566391597899476, "grad_norm": 1.4699604511260986, "learning_rate": 2.5819954128440367e-05, "loss": 0.5997, "step": 830 }, { "epoch": 0.15753938484621155, "grad_norm": 1.7882672548294067, "learning_rate": 2.5762614678899084e-05, "loss": 0.5495, "step": 840 }, { "epoch": 0.15941485371342837, "grad_norm": 1.4856760501861572, "learning_rate": 2.57052752293578e-05, "loss": 0.6453, "step": 850 }, { "epoch": 0.16129032258064516, "grad_norm": 1.4340794086456299, "learning_rate": 2.5647935779816517e-05, "loss": 0.6263, "step": 860 }, { "epoch": 0.16316579144786197, "grad_norm": 1.7416778802871704, "learning_rate": 2.559059633027523e-05, "loss": 0.5994, "step": 870 }, { "epoch": 0.16504126031507876, "grad_norm": 1.2337450981140137, "learning_rate": 2.5533256880733947e-05, "loss": 0.6288, "step": 880 }, { "epoch": 0.16691672918229558, "grad_norm": 1.6339170932769775, "learning_rate": 2.547591743119266e-05, "loss": 0.6181, "step": 890 }, { "epoch": 0.16879219804951237, "grad_norm": 1.7602070569992065, "learning_rate": 2.5418577981651376e-05, "loss": 0.5742, "step": 900 }, { "epoch": 0.1706676669167292, "grad_norm": 1.3282010555267334, "learning_rate": 2.5361238532110093e-05, "loss": 0.6869, "step": 910 }, { "epoch": 0.17254313578394598, "grad_norm": 1.471281886100769, "learning_rate": 2.5303899082568806e-05, "loss": 0.6788, "step": 920 }, { "epoch": 0.1744186046511628, "grad_norm": 1.4900857210159302, "learning_rate": 2.5246559633027523e-05, "loss": 0.5706, "step": 930 }, { "epoch": 0.17629407351837958, "grad_norm": 1.903571605682373, "learning_rate": 2.518922018348624e-05, "loss": 0.7056, "step": 940 }, { "epoch": 0.1781695423855964, "grad_norm": 1.7407480478286743, "learning_rate": 2.5131880733944956e-05, "loss": 0.5102, "step": 950 }, { "epoch": 0.1800450112528132, "grad_norm": 1.6534910202026367, "learning_rate": 2.507454128440367e-05, "loss": 0.6654, "step": 960 }, { "epoch": 0.18192048012003, "grad_norm": 1.7608113288879395, "learning_rate": 2.5017201834862386e-05, "loss": 0.6793, "step": 970 }, { "epoch": 0.1837959489872468, "grad_norm": 1.737579107284546, "learning_rate": 2.49598623853211e-05, "loss": 0.5787, "step": 980 }, { "epoch": 0.18567141785446362, "grad_norm": 1.7096258401870728, "learning_rate": 2.490252293577982e-05, "loss": 0.7254, "step": 990 }, { "epoch": 0.18754688672168043, "grad_norm": 1.7333779335021973, "learning_rate": 2.4845183486238532e-05, "loss": 0.7332, "step": 1000 }, { "epoch": 0.18754688672168043, "eval_loss": 0.6944581270217896, "eval_runtime": 5.5505, "eval_samples_per_second": 21.62, "eval_steps_per_second": 2.702, "step": 1000 }, { "epoch": 0.18942235558889722, "grad_norm": 1.235772967338562, "learning_rate": 2.478784403669725e-05, "loss": 0.684, "step": 1010 }, { "epoch": 0.19129782445611404, "grad_norm": 1.3532825708389282, "learning_rate": 2.4730504587155962e-05, "loss": 0.5977, "step": 1020 }, { "epoch": 0.19317329332333083, "grad_norm": 1.7143605947494507, "learning_rate": 2.4673165137614682e-05, "loss": 0.6109, "step": 1030 }, { "epoch": 0.19504876219054765, "grad_norm": 1.4067035913467407, "learning_rate": 2.4615825688073395e-05, "loss": 0.6823, "step": 1040 }, { "epoch": 0.19692423105776444, "grad_norm": 1.713149070739746, "learning_rate": 2.4558486238532112e-05, "loss": 0.5852, "step": 1050 }, { "epoch": 0.19879969992498125, "grad_norm": 1.5876083374023438, "learning_rate": 2.4501146788990825e-05, "loss": 0.6451, "step": 1060 }, { "epoch": 0.20067516879219804, "grad_norm": 1.3769148588180542, "learning_rate": 2.444380733944954e-05, "loss": 0.5712, "step": 1070 }, { "epoch": 0.20255063765941486, "grad_norm": 1.408957839012146, "learning_rate": 2.4386467889908258e-05, "loss": 0.5704, "step": 1080 }, { "epoch": 0.20442610652663165, "grad_norm": 1.3725367784500122, "learning_rate": 2.4329128440366975e-05, "loss": 0.4405, "step": 1090 }, { "epoch": 0.20630157539384847, "grad_norm": 1.6715686321258545, "learning_rate": 2.4271788990825688e-05, "loss": 0.5433, "step": 1100 }, { "epoch": 0.20817704426106526, "grad_norm": 1.4740937948226929, "learning_rate": 2.4214449541284405e-05, "loss": 0.6363, "step": 1110 }, { "epoch": 0.21005251312828208, "grad_norm": 1.640724778175354, "learning_rate": 2.415711009174312e-05, "loss": 0.6302, "step": 1120 }, { "epoch": 0.21192798199549887, "grad_norm": 1.8040657043457031, "learning_rate": 2.4099770642201838e-05, "loss": 0.6726, "step": 1130 }, { "epoch": 0.21380345086271568, "grad_norm": 1.6836594343185425, "learning_rate": 2.404243119266055e-05, "loss": 0.6706, "step": 1140 }, { "epoch": 0.21567891972993247, "grad_norm": 1.9649243354797363, "learning_rate": 2.3985091743119264e-05, "loss": 0.5189, "step": 1150 }, { "epoch": 0.2175543885971493, "grad_norm": 1.5541070699691772, "learning_rate": 2.392775229357798e-05, "loss": 0.7497, "step": 1160 }, { "epoch": 0.21942985746436608, "grad_norm": 1.9473050832748413, "learning_rate": 2.3870412844036697e-05, "loss": 0.5373, "step": 1170 }, { "epoch": 0.2213053263315829, "grad_norm": 1.8983582258224487, "learning_rate": 2.3813073394495414e-05, "loss": 0.6523, "step": 1180 }, { "epoch": 0.2231807951987997, "grad_norm": 1.6753871440887451, "learning_rate": 2.3755733944954127e-05, "loss": 0.6294, "step": 1190 }, { "epoch": 0.2250562640660165, "grad_norm": 1.706829309463501, "learning_rate": 2.3698394495412844e-05, "loss": 0.6741, "step": 1200 }, { "epoch": 0.2250562640660165, "eval_loss": 0.6810731887817383, "eval_runtime": 5.4298, "eval_samples_per_second": 22.1, "eval_steps_per_second": 2.763, "step": 1200 }, { "epoch": 0.22693173293323332, "grad_norm": 1.6627461910247803, "learning_rate": 2.364105504587156e-05, "loss": 0.6354, "step": 1210 }, { "epoch": 0.2288072018004501, "grad_norm": 2.973555564880371, "learning_rate": 2.3583715596330277e-05, "loss": 0.5482, "step": 1220 }, { "epoch": 0.23068267066766693, "grad_norm": 1.9262856245040894, "learning_rate": 2.352637614678899e-05, "loss": 0.7558, "step": 1230 }, { "epoch": 0.23255813953488372, "grad_norm": 1.816595435142517, "learning_rate": 2.3469036697247707e-05, "loss": 0.6039, "step": 1240 }, { "epoch": 0.23443360840210054, "grad_norm": 1.9634557962417603, "learning_rate": 2.341169724770642e-05, "loss": 0.5126, "step": 1250 }, { "epoch": 0.23630907726931732, "grad_norm": 1.7136008739471436, "learning_rate": 2.335435779816514e-05, "loss": 0.6365, "step": 1260 }, { "epoch": 0.23818454613653414, "grad_norm": 1.4523965120315552, "learning_rate": 2.3297018348623853e-05, "loss": 0.5995, "step": 1270 }, { "epoch": 0.24006001500375093, "grad_norm": 1.6242806911468506, "learning_rate": 2.323967889908257e-05, "loss": 0.6412, "step": 1280 }, { "epoch": 0.24193548387096775, "grad_norm": 1.7888171672821045, "learning_rate": 2.3182339449541283e-05, "loss": 0.6565, "step": 1290 }, { "epoch": 0.24381095273818454, "grad_norm": 1.6343475580215454, "learning_rate": 2.3125000000000003e-05, "loss": 0.6212, "step": 1300 }, { "epoch": 0.24568642160540136, "grad_norm": 1.3897461891174316, "learning_rate": 2.3067660550458716e-05, "loss": 0.5839, "step": 1310 }, { "epoch": 0.24756189047261815, "grad_norm": 1.502485752105713, "learning_rate": 2.3010321100917433e-05, "loss": 0.6725, "step": 1320 }, { "epoch": 0.24943735933983496, "grad_norm": 1.3770966529846191, "learning_rate": 2.2952981651376146e-05, "loss": 0.5998, "step": 1330 }, { "epoch": 0.25131282820705175, "grad_norm": 1.7012661695480347, "learning_rate": 2.2895642201834863e-05, "loss": 0.6668, "step": 1340 }, { "epoch": 0.25318829707426854, "grad_norm": 1.747942566871643, "learning_rate": 2.283830275229358e-05, "loss": 0.6948, "step": 1350 }, { "epoch": 0.2550637659414854, "grad_norm": 1.4288934469223022, "learning_rate": 2.2780963302752296e-05, "loss": 0.6063, "step": 1360 }, { "epoch": 0.2569392348087022, "grad_norm": 1.6301014423370361, "learning_rate": 2.272362385321101e-05, "loss": 0.677, "step": 1370 }, { "epoch": 0.25881470367591897, "grad_norm": 1.3200469017028809, "learning_rate": 2.2666284403669726e-05, "loss": 0.653, "step": 1380 }, { "epoch": 0.26069017254313576, "grad_norm": 1.5794614553451538, "learning_rate": 2.2608944954128442e-05, "loss": 0.6477, "step": 1390 }, { "epoch": 0.2625656414103526, "grad_norm": 1.5092536211013794, "learning_rate": 2.2551605504587155e-05, "loss": 0.6202, "step": 1400 }, { "epoch": 0.2625656414103526, "eval_loss": 0.6741260290145874, "eval_runtime": 5.5003, "eval_samples_per_second": 21.817, "eval_steps_per_second": 2.727, "step": 1400 }, { "epoch": 0.2644411102775694, "grad_norm": 1.5101447105407715, "learning_rate": 2.2494266055045872e-05, "loss": 0.583, "step": 1410 }, { "epoch": 0.2663165791447862, "grad_norm": 1.5355420112609863, "learning_rate": 2.2436926605504585e-05, "loss": 0.5422, "step": 1420 }, { "epoch": 0.268192048012003, "grad_norm": 1.5322073698043823, "learning_rate": 2.2379587155963305e-05, "loss": 0.6067, "step": 1430 }, { "epoch": 0.2700675168792198, "grad_norm": 1.5003911256790161, "learning_rate": 2.232224770642202e-05, "loss": 0.5578, "step": 1440 }, { "epoch": 0.2719429857464366, "grad_norm": 1.4054975509643555, "learning_rate": 2.2264908256880735e-05, "loss": 0.5819, "step": 1450 }, { "epoch": 0.2738184546136534, "grad_norm": 1.7100839614868164, "learning_rate": 2.2207568807339448e-05, "loss": 0.7076, "step": 1460 }, { "epoch": 0.27569392348087024, "grad_norm": 1.6358684301376343, "learning_rate": 2.2150229357798165e-05, "loss": 0.5748, "step": 1470 }, { "epoch": 0.27756939234808703, "grad_norm": 1.8648029565811157, "learning_rate": 2.209288990825688e-05, "loss": 0.5488, "step": 1480 }, { "epoch": 0.2794448612153038, "grad_norm": 2.0715155601501465, "learning_rate": 2.2035550458715598e-05, "loss": 0.6121, "step": 1490 }, { "epoch": 0.2813203300825206, "grad_norm": 1.4680354595184326, "learning_rate": 2.197821100917431e-05, "loss": 0.5775, "step": 1500 }, { "epoch": 0.28319579894973745, "grad_norm": 1.646637201309204, "learning_rate": 2.1920871559633028e-05, "loss": 0.6433, "step": 1510 }, { "epoch": 0.28507126781695424, "grad_norm": 1.9596463441848755, "learning_rate": 2.1863532110091744e-05, "loss": 0.6534, "step": 1520 }, { "epoch": 0.28694673668417103, "grad_norm": 2.375546455383301, "learning_rate": 2.180619266055046e-05, "loss": 0.5802, "step": 1530 }, { "epoch": 0.2888222055513878, "grad_norm": 1.2877148389816284, "learning_rate": 2.1748853211009174e-05, "loss": 0.6626, "step": 1540 }, { "epoch": 0.29069767441860467, "grad_norm": 1.3704779148101807, "learning_rate": 2.169151376146789e-05, "loss": 0.7177, "step": 1550 }, { "epoch": 0.29257314328582146, "grad_norm": 1.9320201873779297, "learning_rate": 2.1634174311926604e-05, "loss": 0.6648, "step": 1560 }, { "epoch": 0.29444861215303825, "grad_norm": 2.351738452911377, "learning_rate": 2.1576834862385324e-05, "loss": 0.5823, "step": 1570 }, { "epoch": 0.29632408102025504, "grad_norm": 1.6075841188430786, "learning_rate": 2.1519495412844037e-05, "loss": 0.6363, "step": 1580 }, { "epoch": 0.2981995498874719, "grad_norm": 1.7780178785324097, "learning_rate": 2.1462155963302754e-05, "loss": 0.7098, "step": 1590 }, { "epoch": 0.30007501875468867, "grad_norm": 1.8664710521697998, "learning_rate": 2.1404816513761467e-05, "loss": 0.5582, "step": 1600 }, { "epoch": 0.30007501875468867, "eval_loss": 0.6711069345474243, "eval_runtime": 5.5099, "eval_samples_per_second": 21.779, "eval_steps_per_second": 2.722, "step": 1600 }, { "epoch": 0.30195048762190546, "grad_norm": 1.7083989381790161, "learning_rate": 2.1347477064220187e-05, "loss": 0.6628, "step": 1610 }, { "epoch": 0.3038259564891223, "grad_norm": 1.7052229642868042, "learning_rate": 2.12901376146789e-05, "loss": 0.5975, "step": 1620 }, { "epoch": 0.3057014253563391, "grad_norm": 1.5098538398742676, "learning_rate": 2.1232798165137617e-05, "loss": 0.5729, "step": 1630 }, { "epoch": 0.3075768942235559, "grad_norm": 1.6489193439483643, "learning_rate": 2.117545871559633e-05, "loss": 0.5064, "step": 1640 }, { "epoch": 0.3094523630907727, "grad_norm": 1.9127089977264404, "learning_rate": 2.1118119266055043e-05, "loss": 0.582, "step": 1650 }, { "epoch": 0.3113278319579895, "grad_norm": 1.801680326461792, "learning_rate": 2.1060779816513763e-05, "loss": 0.6894, "step": 1660 }, { "epoch": 0.3132033008252063, "grad_norm": 1.622673511505127, "learning_rate": 2.1003440366972476e-05, "loss": 0.6085, "step": 1670 }, { "epoch": 0.3150787696924231, "grad_norm": 1.9467750787734985, "learning_rate": 2.0946100917431193e-05, "loss": 0.6822, "step": 1680 }, { "epoch": 0.3169542385596399, "grad_norm": 1.5031330585479736, "learning_rate": 2.0888761467889906e-05, "loss": 0.6694, "step": 1690 }, { "epoch": 0.31882970742685673, "grad_norm": 1.68521249294281, "learning_rate": 2.0831422018348626e-05, "loss": 0.6556, "step": 1700 }, { "epoch": 0.3207051762940735, "grad_norm": 1.8257548809051514, "learning_rate": 2.077408256880734e-05, "loss": 0.5684, "step": 1710 }, { "epoch": 0.3225806451612903, "grad_norm": 1.6865085363388062, "learning_rate": 2.0716743119266056e-05, "loss": 0.6543, "step": 1720 }, { "epoch": 0.3244561140285071, "grad_norm": 1.7781134843826294, "learning_rate": 2.065940366972477e-05, "loss": 0.5249, "step": 1730 }, { "epoch": 0.32633158289572395, "grad_norm": 1.9172645807266235, "learning_rate": 2.0602064220183486e-05, "loss": 0.564, "step": 1740 }, { "epoch": 0.32820705176294074, "grad_norm": 1.9964970350265503, "learning_rate": 2.0544724770642202e-05, "loss": 0.5688, "step": 1750 }, { "epoch": 0.3300825206301575, "grad_norm": 2.0303592681884766, "learning_rate": 2.048738532110092e-05, "loss": 0.6081, "step": 1760 }, { "epoch": 0.3319579894973743, "grad_norm": 2.4410409927368164, "learning_rate": 2.0430045871559632e-05, "loss": 0.542, "step": 1770 }, { "epoch": 0.33383345836459116, "grad_norm": 1.7117453813552856, "learning_rate": 2.037270642201835e-05, "loss": 0.4778, "step": 1780 }, { "epoch": 0.33570892723180795, "grad_norm": 1.5781958103179932, "learning_rate": 2.0315366972477065e-05, "loss": 0.5451, "step": 1790 }, { "epoch": 0.33758439609902474, "grad_norm": 1.601178526878357, "learning_rate": 2.0258027522935782e-05, "loss": 0.5371, "step": 1800 }, { "epoch": 0.33758439609902474, "eval_loss": 0.667210042476654, "eval_runtime": 5.5955, "eval_samples_per_second": 21.446, "eval_steps_per_second": 2.681, "step": 1800 }, { "epoch": 0.3394598649662416, "grad_norm": 1.520401954650879, "learning_rate": 2.0200688073394495e-05, "loss": 0.7463, "step": 1810 }, { "epoch": 0.3413353338334584, "grad_norm": 1.5495413541793823, "learning_rate": 2.0143348623853212e-05, "loss": 0.5746, "step": 1820 }, { "epoch": 0.34321080270067517, "grad_norm": 1.656015157699585, "learning_rate": 2.0086009174311925e-05, "loss": 0.5587, "step": 1830 }, { "epoch": 0.34508627156789196, "grad_norm": 1.7179194688796997, "learning_rate": 2.0028669724770645e-05, "loss": 0.6316, "step": 1840 }, { "epoch": 0.3469617404351088, "grad_norm": 2.026876926422119, "learning_rate": 1.9971330275229358e-05, "loss": 0.5859, "step": 1850 }, { "epoch": 0.3488372093023256, "grad_norm": 1.675175428390503, "learning_rate": 1.9913990825688075e-05, "loss": 0.5499, "step": 1860 }, { "epoch": 0.3507126781695424, "grad_norm": 1.3794666528701782, "learning_rate": 1.9856651376146788e-05, "loss": 0.651, "step": 1870 }, { "epoch": 0.35258814703675917, "grad_norm": 1.6561700105667114, "learning_rate": 1.9799311926605508e-05, "loss": 0.52, "step": 1880 }, { "epoch": 0.354463615903976, "grad_norm": 1.9196125268936157, "learning_rate": 1.974197247706422e-05, "loss": 0.5643, "step": 1890 }, { "epoch": 0.3563390847711928, "grad_norm": 2.157627820968628, "learning_rate": 1.9684633027522934e-05, "loss": 0.5726, "step": 1900 }, { "epoch": 0.3582145536384096, "grad_norm": 1.8069156408309937, "learning_rate": 1.962729357798165e-05, "loss": 0.6398, "step": 1910 }, { "epoch": 0.3600900225056264, "grad_norm": 1.7318720817565918, "learning_rate": 1.9569954128440368e-05, "loss": 0.4835, "step": 1920 }, { "epoch": 0.36196549137284323, "grad_norm": 2.1636054515838623, "learning_rate": 1.9512614678899084e-05, "loss": 0.587, "step": 1930 }, { "epoch": 0.36384096024006, "grad_norm": 2.062150478363037, "learning_rate": 1.9455275229357797e-05, "loss": 0.763, "step": 1940 }, { "epoch": 0.3657164291072768, "grad_norm": 1.6775376796722412, "learning_rate": 1.9397935779816514e-05, "loss": 0.6575, "step": 1950 }, { "epoch": 0.3675918979744936, "grad_norm": 1.5422090291976929, "learning_rate": 1.9340596330275227e-05, "loss": 0.6128, "step": 1960 }, { "epoch": 0.36946736684171044, "grad_norm": 1.7209275960922241, "learning_rate": 1.9283256880733947e-05, "loss": 0.5796, "step": 1970 }, { "epoch": 0.37134283570892723, "grad_norm": 1.5626654624938965, "learning_rate": 1.922591743119266e-05, "loss": 0.5237, "step": 1980 }, { "epoch": 0.373218304576144, "grad_norm": 1.6950414180755615, "learning_rate": 1.9168577981651377e-05, "loss": 0.5983, "step": 1990 }, { "epoch": 0.37509377344336087, "grad_norm": 1.5081120729446411, "learning_rate": 1.911123853211009e-05, "loss": 0.5603, "step": 2000 }, { "epoch": 0.37509377344336087, "eval_loss": 0.6599423885345459, "eval_runtime": 5.5471, "eval_samples_per_second": 21.633, "eval_steps_per_second": 2.704, "step": 2000 }, { "epoch": 0.37696924231057766, "grad_norm": 1.7430557012557983, "learning_rate": 1.905389908256881e-05, "loss": 0.5582, "step": 2010 }, { "epoch": 0.37884471117779445, "grad_norm": 1.8989301919937134, "learning_rate": 1.8996559633027523e-05, "loss": 0.6411, "step": 2020 }, { "epoch": 0.38072018004501124, "grad_norm": 1.9164332151412964, "learning_rate": 1.893922018348624e-05, "loss": 0.5733, "step": 2030 }, { "epoch": 0.3825956489122281, "grad_norm": 1.9230120182037354, "learning_rate": 1.8881880733944953e-05, "loss": 0.6592, "step": 2040 }, { "epoch": 0.38447111777944487, "grad_norm": 1.9948559999465942, "learning_rate": 1.882454128440367e-05, "loss": 0.5918, "step": 2050 }, { "epoch": 0.38634658664666166, "grad_norm": 1.8086504936218262, "learning_rate": 1.8767201834862386e-05, "loss": 0.5939, "step": 2060 }, { "epoch": 0.38822205551387845, "grad_norm": 1.715736985206604, "learning_rate": 1.8709862385321103e-05, "loss": 0.6624, "step": 2070 }, { "epoch": 0.3900975243810953, "grad_norm": 2.9393413066864014, "learning_rate": 1.8652522935779816e-05, "loss": 0.5954, "step": 2080 }, { "epoch": 0.3919729932483121, "grad_norm": 2.3764209747314453, "learning_rate": 1.8595183486238533e-05, "loss": 0.6181, "step": 2090 }, { "epoch": 0.3938484621155289, "grad_norm": 1.7462408542633057, "learning_rate": 1.853784403669725e-05, "loss": 0.4992, "step": 2100 }, { "epoch": 0.39572393098274566, "grad_norm": 2.006526470184326, "learning_rate": 1.8480504587155966e-05, "loss": 0.5331, "step": 2110 }, { "epoch": 0.3975993998499625, "grad_norm": 2.453961133956909, "learning_rate": 1.842316513761468e-05, "loss": 0.5828, "step": 2120 }, { "epoch": 0.3994748687171793, "grad_norm": 1.9606050252914429, "learning_rate": 1.8365825688073396e-05, "loss": 0.596, "step": 2130 }, { "epoch": 0.4013503375843961, "grad_norm": 1.776755690574646, "learning_rate": 1.830848623853211e-05, "loss": 0.6688, "step": 2140 }, { "epoch": 0.4032258064516129, "grad_norm": 1.6970465183258057, "learning_rate": 1.8251146788990826e-05, "loss": 0.6403, "step": 2150 }, { "epoch": 0.4051012753188297, "grad_norm": 2.1834471225738525, "learning_rate": 1.8193807339449542e-05, "loss": 0.6169, "step": 2160 }, { "epoch": 0.4069767441860465, "grad_norm": 1.4596108198165894, "learning_rate": 1.8136467889908255e-05, "loss": 0.5783, "step": 2170 }, { "epoch": 0.4088522130532633, "grad_norm": 1.808875560760498, "learning_rate": 1.8079128440366972e-05, "loss": 0.6834, "step": 2180 }, { "epoch": 0.41072768192048015, "grad_norm": 2.0414464473724365, "learning_rate": 1.802178899082569e-05, "loss": 0.5546, "step": 2190 }, { "epoch": 0.41260315078769694, "grad_norm": 1.7231241464614868, "learning_rate": 1.7964449541284405e-05, "loss": 0.5875, "step": 2200 }, { "epoch": 0.41260315078769694, "eval_loss": 0.6564481258392334, "eval_runtime": 5.5564, "eval_samples_per_second": 21.597, "eval_steps_per_second": 2.7, "step": 2200 }, { "epoch": 0.4144786196549137, "grad_norm": 1.5646345615386963, "learning_rate": 1.790711009174312e-05, "loss": 0.4631, "step": 2210 }, { "epoch": 0.4163540885221305, "grad_norm": 2.1417741775512695, "learning_rate": 1.7849770642201835e-05, "loss": 0.5978, "step": 2220 }, { "epoch": 0.41822955738934736, "grad_norm": 1.5909672975540161, "learning_rate": 1.7792431192660548e-05, "loss": 0.6276, "step": 2230 }, { "epoch": 0.42010502625656415, "grad_norm": 1.5815021991729736, "learning_rate": 1.7735091743119268e-05, "loss": 0.5655, "step": 2240 }, { "epoch": 0.42198049512378094, "grad_norm": 2.173349618911743, "learning_rate": 1.767775229357798e-05, "loss": 0.5172, "step": 2250 }, { "epoch": 0.42385596399099773, "grad_norm": 1.611697793006897, "learning_rate": 1.7620412844036698e-05, "loss": 0.5828, "step": 2260 }, { "epoch": 0.4257314328582146, "grad_norm": 2.148935556411743, "learning_rate": 1.756307339449541e-05, "loss": 0.5796, "step": 2270 }, { "epoch": 0.42760690172543137, "grad_norm": 2.8221611976623535, "learning_rate": 1.750573394495413e-05, "loss": 0.6098, "step": 2280 }, { "epoch": 0.42948237059264815, "grad_norm": 1.8515477180480957, "learning_rate": 1.7448394495412844e-05, "loss": 0.6519, "step": 2290 }, { "epoch": 0.43135783945986494, "grad_norm": 1.9033889770507812, "learning_rate": 1.739105504587156e-05, "loss": 0.5771, "step": 2300 }, { "epoch": 0.4332333083270818, "grad_norm": 2.1629979610443115, "learning_rate": 1.7333715596330274e-05, "loss": 0.5308, "step": 2310 }, { "epoch": 0.4351087771942986, "grad_norm": 1.713036060333252, "learning_rate": 1.727637614678899e-05, "loss": 0.6036, "step": 2320 }, { "epoch": 0.43698424606151537, "grad_norm": 1.626887559890747, "learning_rate": 1.7219036697247707e-05, "loss": 0.5932, "step": 2330 }, { "epoch": 0.43885971492873216, "grad_norm": 2.026658535003662, "learning_rate": 1.7161697247706424e-05, "loss": 0.509, "step": 2340 }, { "epoch": 0.440735183795949, "grad_norm": 1.617053508758545, "learning_rate": 1.7104357798165137e-05, "loss": 0.5841, "step": 2350 }, { "epoch": 0.4426106526631658, "grad_norm": 1.8023245334625244, "learning_rate": 1.7047018348623854e-05, "loss": 0.5244, "step": 2360 }, { "epoch": 0.4444861215303826, "grad_norm": 2.0502309799194336, "learning_rate": 1.698967889908257e-05, "loss": 0.5936, "step": 2370 }, { "epoch": 0.4463615903975994, "grad_norm": 2.410144567489624, "learning_rate": 1.6932339449541287e-05, "loss": 0.6206, "step": 2380 }, { "epoch": 0.4482370592648162, "grad_norm": 2.0925815105438232, "learning_rate": 1.6875e-05, "loss": 0.5086, "step": 2390 }, { "epoch": 0.450112528132033, "grad_norm": 1.8199101686477661, "learning_rate": 1.6817660550458713e-05, "loss": 0.583, "step": 2400 }, { "epoch": 0.450112528132033, "eval_loss": 0.6492409110069275, "eval_runtime": 5.5179, "eval_samples_per_second": 21.747, "eval_steps_per_second": 2.718, "step": 2400 }, { "epoch": 0.4519879969992498, "grad_norm": 1.7940239906311035, "learning_rate": 1.6760321100917433e-05, "loss": 0.6154, "step": 2410 }, { "epoch": 0.45386346586646664, "grad_norm": 2.281325340270996, "learning_rate": 1.6702981651376147e-05, "loss": 0.5542, "step": 2420 }, { "epoch": 0.45573893473368343, "grad_norm": 1.8717613220214844, "learning_rate": 1.6645642201834863e-05, "loss": 0.5242, "step": 2430 }, { "epoch": 0.4576144036009002, "grad_norm": 2.2120072841644287, "learning_rate": 1.6588302752293576e-05, "loss": 0.63, "step": 2440 }, { "epoch": 0.459489872468117, "grad_norm": 2.10752272605896, "learning_rate": 1.6530963302752293e-05, "loss": 0.5712, "step": 2450 }, { "epoch": 0.46136534133533386, "grad_norm": 2.3129327297210693, "learning_rate": 1.647362385321101e-05, "loss": 0.7044, "step": 2460 }, { "epoch": 0.46324081020255065, "grad_norm": 1.424224853515625, "learning_rate": 1.6416284403669726e-05, "loss": 0.5588, "step": 2470 }, { "epoch": 0.46511627906976744, "grad_norm": 1.6627572774887085, "learning_rate": 1.635894495412844e-05, "loss": 0.4543, "step": 2480 }, { "epoch": 0.4669917479369842, "grad_norm": 1.6522067785263062, "learning_rate": 1.6301605504587156e-05, "loss": 0.6003, "step": 2490 }, { "epoch": 0.46886721680420107, "grad_norm": 2.2070651054382324, "learning_rate": 1.6244266055045873e-05, "loss": 0.6294, "step": 2500 }, { "epoch": 0.47074268567141786, "grad_norm": 2.1523821353912354, "learning_rate": 1.618692660550459e-05, "loss": 0.6067, "step": 2510 }, { "epoch": 0.47261815453863465, "grad_norm": 2.468892812728882, "learning_rate": 1.6129587155963302e-05, "loss": 0.6267, "step": 2520 }, { "epoch": 0.47449362340585144, "grad_norm": 1.9735854864120483, "learning_rate": 1.607224770642202e-05, "loss": 0.6124, "step": 2530 }, { "epoch": 0.4763690922730683, "grad_norm": 1.7900265455245972, "learning_rate": 1.6014908256880732e-05, "loss": 0.5845, "step": 2540 }, { "epoch": 0.4782445611402851, "grad_norm": 2.2069602012634277, "learning_rate": 1.5957568807339452e-05, "loss": 0.6071, "step": 2550 }, { "epoch": 0.48012003000750186, "grad_norm": 2.3752589225769043, "learning_rate": 1.5900229357798165e-05, "loss": 0.6074, "step": 2560 }, { "epoch": 0.48199549887471865, "grad_norm": 1.8114852905273438, "learning_rate": 1.5842889908256882e-05, "loss": 0.5358, "step": 2570 }, { "epoch": 0.4838709677419355, "grad_norm": 1.6503331661224365, "learning_rate": 1.5785550458715595e-05, "loss": 0.6664, "step": 2580 }, { "epoch": 0.4857464366091523, "grad_norm": 1.7421520948410034, "learning_rate": 1.5728211009174315e-05, "loss": 0.6137, "step": 2590 }, { "epoch": 0.4876219054763691, "grad_norm": 1.865038275718689, "learning_rate": 1.567087155963303e-05, "loss": 0.6328, "step": 2600 }, { "epoch": 0.4876219054763691, "eval_loss": 0.6427852511405945, "eval_runtime": 5.6723, "eval_samples_per_second": 21.155, "eval_steps_per_second": 2.644, "step": 2600 }, { "epoch": 0.4894973743435859, "grad_norm": 2.0670528411865234, "learning_rate": 1.5613532110091745e-05, "loss": 0.5686, "step": 2610 }, { "epoch": 0.4913728432108027, "grad_norm": 2.00549054145813, "learning_rate": 1.5556192660550458e-05, "loss": 0.5701, "step": 2620 }, { "epoch": 0.4932483120780195, "grad_norm": 2.3382251262664795, "learning_rate": 1.5498853211009175e-05, "loss": 0.6212, "step": 2630 }, { "epoch": 0.4951237809452363, "grad_norm": 1.849523901939392, "learning_rate": 1.544151376146789e-05, "loss": 0.5844, "step": 2640 }, { "epoch": 0.49699924981245314, "grad_norm": 2.0589709281921387, "learning_rate": 1.5384174311926605e-05, "loss": 0.5736, "step": 2650 }, { "epoch": 0.4988747186796699, "grad_norm": 2.3713736534118652, "learning_rate": 1.532683486238532e-05, "loss": 0.5543, "step": 2660 }, { "epoch": 0.5007501875468867, "grad_norm": 1.4133175611495972, "learning_rate": 1.5269495412844034e-05, "loss": 0.5916, "step": 2670 }, { "epoch": 0.5026256564141035, "grad_norm": 1.828869104385376, "learning_rate": 1.5212155963302753e-05, "loss": 0.6424, "step": 2680 }, { "epoch": 0.5045011252813203, "grad_norm": 1.8340333700180054, "learning_rate": 1.5154816513761468e-05, "loss": 0.5335, "step": 2690 }, { "epoch": 0.5063765941485371, "grad_norm": 2.287064790725708, "learning_rate": 1.5097477064220184e-05, "loss": 0.5604, "step": 2700 }, { "epoch": 0.508252063015754, "grad_norm": 2.0678904056549072, "learning_rate": 1.5040137614678897e-05, "loss": 0.6822, "step": 2710 }, { "epoch": 0.5101275318829708, "grad_norm": 1.848810076713562, "learning_rate": 1.4982798165137616e-05, "loss": 0.5741, "step": 2720 }, { "epoch": 0.5120030007501876, "grad_norm": 1.8436052799224854, "learning_rate": 1.492545871559633e-05, "loss": 0.592, "step": 2730 }, { "epoch": 0.5138784696174044, "grad_norm": 1.8554112911224365, "learning_rate": 1.4868119266055047e-05, "loss": 0.5372, "step": 2740 }, { "epoch": 0.5157539384846211, "grad_norm": 1.7678755521774292, "learning_rate": 1.4810779816513762e-05, "loss": 0.5748, "step": 2750 }, { "epoch": 0.5176294073518379, "grad_norm": 1.71146821975708, "learning_rate": 1.4753440366972479e-05, "loss": 0.6795, "step": 2760 }, { "epoch": 0.5195048762190547, "grad_norm": 1.6599249839782715, "learning_rate": 1.4696100917431192e-05, "loss": 0.5559, "step": 2770 }, { "epoch": 0.5213803450862715, "grad_norm": 2.273698568344116, "learning_rate": 1.4638761467889908e-05, "loss": 0.4864, "step": 2780 }, { "epoch": 0.5232558139534884, "grad_norm": 2.400425434112549, "learning_rate": 1.4581422018348623e-05, "loss": 0.6152, "step": 2790 }, { "epoch": 0.5251312828207052, "grad_norm": 2.1009607315063477, "learning_rate": 1.452408256880734e-05, "loss": 0.5518, "step": 2800 }, { "epoch": 0.5251312828207052, "eval_loss": 0.6440523266792297, "eval_runtime": 5.5215, "eval_samples_per_second": 21.733, "eval_steps_per_second": 2.717, "step": 2800 }, { "epoch": 0.527006751687922, "grad_norm": 1.724177360534668, "learning_rate": 1.4466743119266055e-05, "loss": 0.5635, "step": 2810 }, { "epoch": 0.5288822205551388, "grad_norm": 1.6806992292404175, "learning_rate": 1.440940366972477e-05, "loss": 0.6072, "step": 2820 }, { "epoch": 0.5307576894223556, "grad_norm": 1.914863109588623, "learning_rate": 1.4352064220183486e-05, "loss": 0.5919, "step": 2830 }, { "epoch": 0.5326331582895724, "grad_norm": 1.9246379137039185, "learning_rate": 1.4294724770642201e-05, "loss": 0.6282, "step": 2840 }, { "epoch": 0.5345086271567892, "grad_norm": 2.0513482093811035, "learning_rate": 1.4237385321100918e-05, "loss": 0.5117, "step": 2850 }, { "epoch": 0.536384096024006, "grad_norm": 2.160053253173828, "learning_rate": 1.4180045871559633e-05, "loss": 0.5916, "step": 2860 }, { "epoch": 0.5382595648912228, "grad_norm": 1.3989676237106323, "learning_rate": 1.412270642201835e-05, "loss": 0.6169, "step": 2870 }, { "epoch": 0.5401350337584396, "grad_norm": 1.9387221336364746, "learning_rate": 1.4065366972477064e-05, "loss": 0.6374, "step": 2880 }, { "epoch": 0.5420105026256564, "grad_norm": 2.054593563079834, "learning_rate": 1.4008027522935781e-05, "loss": 0.5947, "step": 2890 }, { "epoch": 0.5438859714928732, "grad_norm": 1.8106393814086914, "learning_rate": 1.3950688073394496e-05, "loss": 0.6232, "step": 2900 }, { "epoch": 0.54576144036009, "grad_norm": 2.042513132095337, "learning_rate": 1.389334862385321e-05, "loss": 0.481, "step": 2910 }, { "epoch": 0.5476369092273068, "grad_norm": 1.6872574090957642, "learning_rate": 1.3836009174311927e-05, "loss": 0.5128, "step": 2920 }, { "epoch": 0.5495123780945236, "grad_norm": 1.8918819427490234, "learning_rate": 1.3778669724770642e-05, "loss": 0.6205, "step": 2930 }, { "epoch": 0.5513878469617405, "grad_norm": 2.6372804641723633, "learning_rate": 1.3721330275229359e-05, "loss": 0.4981, "step": 2940 }, { "epoch": 0.5532633158289573, "grad_norm": 1.8915632963180542, "learning_rate": 1.3663990825688074e-05, "loss": 0.5481, "step": 2950 }, { "epoch": 0.5551387846961741, "grad_norm": 2.0230934619903564, "learning_rate": 1.360665137614679e-05, "loss": 0.5806, "step": 2960 }, { "epoch": 0.5570142535633908, "grad_norm": 2.1508560180664062, "learning_rate": 1.3549311926605505e-05, "loss": 0.6057, "step": 2970 }, { "epoch": 0.5588897224306076, "grad_norm": 1.7368062734603882, "learning_rate": 1.3491972477064222e-05, "loss": 0.5743, "step": 2980 }, { "epoch": 0.5607651912978244, "grad_norm": 1.9738160371780396, "learning_rate": 1.3434633027522937e-05, "loss": 0.6063, "step": 2990 }, { "epoch": 0.5626406601650412, "grad_norm": 1.9070963859558105, "learning_rate": 1.3377293577981652e-05, "loss": 0.5965, "step": 3000 }, { "epoch": 0.5626406601650412, "eval_loss": 0.637257993221283, "eval_runtime": 5.3838, "eval_samples_per_second": 22.289, "eval_steps_per_second": 2.786, "step": 3000 }, { "epoch": 0.5645161290322581, "grad_norm": 1.9798016548156738, "learning_rate": 1.3319954128440368e-05, "loss": 0.5758, "step": 3010 }, { "epoch": 0.5663915978994749, "grad_norm": 1.68988037109375, "learning_rate": 1.3262614678899081e-05, "loss": 0.5435, "step": 3020 }, { "epoch": 0.5682670667666917, "grad_norm": 1.9612882137298584, "learning_rate": 1.3205275229357798e-05, "loss": 0.7064, "step": 3030 }, { "epoch": 0.5701425356339085, "grad_norm": 1.9069509506225586, "learning_rate": 1.3147935779816513e-05, "loss": 0.6531, "step": 3040 }, { "epoch": 0.5720180045011253, "grad_norm": 2.185046434402466, "learning_rate": 1.309059633027523e-05, "loss": 0.548, "step": 3050 }, { "epoch": 0.5738934733683421, "grad_norm": 1.6375807523727417, "learning_rate": 1.3033256880733944e-05, "loss": 0.5555, "step": 3060 }, { "epoch": 0.5757689422355589, "grad_norm": 2.4809699058532715, "learning_rate": 1.2975917431192661e-05, "loss": 0.5071, "step": 3070 }, { "epoch": 0.5776444111027756, "grad_norm": 2.071410894393921, "learning_rate": 1.2918577981651376e-05, "loss": 0.6192, "step": 3080 }, { "epoch": 0.5795198799699925, "grad_norm": 1.9961457252502441, "learning_rate": 1.2861238532110092e-05, "loss": 0.6463, "step": 3090 }, { "epoch": 0.5813953488372093, "grad_norm": 1.7288352251052856, "learning_rate": 1.2803899082568807e-05, "loss": 0.5121, "step": 3100 }, { "epoch": 0.5832708177044261, "grad_norm": 2.855468988418579, "learning_rate": 1.2746559633027522e-05, "loss": 0.5797, "step": 3110 }, { "epoch": 0.5851462865716429, "grad_norm": 2.2987215518951416, "learning_rate": 1.2689220183486239e-05, "loss": 0.5607, "step": 3120 }, { "epoch": 0.5870217554388597, "grad_norm": 1.4077903032302856, "learning_rate": 1.2631880733944954e-05, "loss": 0.6127, "step": 3130 }, { "epoch": 0.5888972243060765, "grad_norm": 2.1426985263824463, "learning_rate": 1.257454128440367e-05, "loss": 0.5774, "step": 3140 }, { "epoch": 0.5907726931732933, "grad_norm": 1.681693196296692, "learning_rate": 1.2517201834862385e-05, "loss": 0.5311, "step": 3150 }, { "epoch": 0.5926481620405101, "grad_norm": 2.1285390853881836, "learning_rate": 1.2459862385321102e-05, "loss": 0.7334, "step": 3160 }, { "epoch": 0.594523630907727, "grad_norm": 1.7066893577575684, "learning_rate": 1.2402522935779817e-05, "loss": 0.4741, "step": 3170 }, { "epoch": 0.5963990997749438, "grad_norm": 2.3069071769714355, "learning_rate": 1.2345183486238533e-05, "loss": 0.6068, "step": 3180 }, { "epoch": 0.5982745686421606, "grad_norm": 1.898915410041809, "learning_rate": 1.2287844036697248e-05, "loss": 0.4881, "step": 3190 }, { "epoch": 0.6001500375093773, "grad_norm": 1.9187260866165161, "learning_rate": 1.2230504587155963e-05, "loss": 0.5603, "step": 3200 }, { "epoch": 0.6001500375093773, "eval_loss": 0.6345093250274658, "eval_runtime": 5.5367, "eval_samples_per_second": 21.674, "eval_steps_per_second": 2.709, "step": 3200 }, { "epoch": 0.6020255063765941, "grad_norm": 1.7056176662445068, "learning_rate": 1.217316513761468e-05, "loss": 0.4889, "step": 3210 }, { "epoch": 0.6039009752438109, "grad_norm": 1.7351319789886475, "learning_rate": 1.2115825688073395e-05, "loss": 0.5233, "step": 3220 }, { "epoch": 0.6057764441110277, "grad_norm": 3.0656421184539795, "learning_rate": 1.2058486238532111e-05, "loss": 0.5427, "step": 3230 }, { "epoch": 0.6076519129782446, "grad_norm": 2.4634621143341064, "learning_rate": 1.2001146788990826e-05, "loss": 0.5901, "step": 3240 }, { "epoch": 0.6095273818454614, "grad_norm": 1.7477375268936157, "learning_rate": 1.1943807339449543e-05, "loss": 0.6393, "step": 3250 }, { "epoch": 0.6114028507126782, "grad_norm": 2.034407377243042, "learning_rate": 1.1886467889908258e-05, "loss": 0.4688, "step": 3260 }, { "epoch": 0.613278319579895, "grad_norm": 1.604066014289856, "learning_rate": 1.1829128440366974e-05, "loss": 0.5928, "step": 3270 }, { "epoch": 0.6151537884471118, "grad_norm": 1.9047834873199463, "learning_rate": 1.1771788990825687e-05, "loss": 0.512, "step": 3280 }, { "epoch": 0.6170292573143286, "grad_norm": 2.166414737701416, "learning_rate": 1.1714449541284404e-05, "loss": 0.6807, "step": 3290 }, { "epoch": 0.6189047261815454, "grad_norm": 2.463648796081543, "learning_rate": 1.1657110091743119e-05, "loss": 0.7143, "step": 3300 }, { "epoch": 0.6207801950487621, "grad_norm": 1.8840951919555664, "learning_rate": 1.1599770642201834e-05, "loss": 0.6137, "step": 3310 }, { "epoch": 0.622655663915979, "grad_norm": 2.49739408493042, "learning_rate": 1.154243119266055e-05, "loss": 0.6415, "step": 3320 }, { "epoch": 0.6245311327831958, "grad_norm": 2.0638840198516846, "learning_rate": 1.1485091743119265e-05, "loss": 0.5118, "step": 3330 }, { "epoch": 0.6264066016504126, "grad_norm": 2.0733895301818848, "learning_rate": 1.1427752293577982e-05, "loss": 0.6573, "step": 3340 }, { "epoch": 0.6282820705176294, "grad_norm": 2.006185293197632, "learning_rate": 1.1370412844036697e-05, "loss": 0.4634, "step": 3350 }, { "epoch": 0.6301575393848462, "grad_norm": 2.2666101455688477, "learning_rate": 1.1313073394495413e-05, "loss": 0.6536, "step": 3360 }, { "epoch": 0.632033008252063, "grad_norm": 2.7148234844207764, "learning_rate": 1.1255733944954128e-05, "loss": 0.7032, "step": 3370 }, { "epoch": 0.6339084771192798, "grad_norm": 1.6289362907409668, "learning_rate": 1.1198394495412845e-05, "loss": 0.5175, "step": 3380 }, { "epoch": 0.6357839459864967, "grad_norm": 2.742385149002075, "learning_rate": 1.114105504587156e-05, "loss": 0.582, "step": 3390 }, { "epoch": 0.6376594148537135, "grad_norm": 2.092541217803955, "learning_rate": 1.1083715596330275e-05, "loss": 0.6501, "step": 3400 }, { "epoch": 0.6376594148537135, "eval_loss": 0.631032407283783, "eval_runtime": 5.5482, "eval_samples_per_second": 21.629, "eval_steps_per_second": 2.704, "step": 3400 }, { "epoch": 0.6395348837209303, "grad_norm": 1.8964581489562988, "learning_rate": 1.1026376146788991e-05, "loss": 0.5905, "step": 3410 }, { "epoch": 0.641410352588147, "grad_norm": 1.6054551601409912, "learning_rate": 1.0969036697247706e-05, "loss": 0.4636, "step": 3420 }, { "epoch": 0.6432858214553638, "grad_norm": 2.0726969242095947, "learning_rate": 1.0911697247706423e-05, "loss": 0.6542, "step": 3430 }, { "epoch": 0.6451612903225806, "grad_norm": 3.5420382022857666, "learning_rate": 1.0854357798165138e-05, "loss": 0.5573, "step": 3440 }, { "epoch": 0.6470367591897974, "grad_norm": 2.462528705596924, "learning_rate": 1.0797018348623854e-05, "loss": 0.5605, "step": 3450 }, { "epoch": 0.6489122280570142, "grad_norm": 2.0307133197784424, "learning_rate": 1.073967889908257e-05, "loss": 0.5594, "step": 3460 }, { "epoch": 0.6507876969242311, "grad_norm": 2.2088277339935303, "learning_rate": 1.0682339449541286e-05, "loss": 0.6143, "step": 3470 }, { "epoch": 0.6526631657914479, "grad_norm": 1.4962677955627441, "learning_rate": 1.0625e-05, "loss": 0.5801, "step": 3480 }, { "epoch": 0.6545386346586647, "grad_norm": 1.796766996383667, "learning_rate": 1.0567660550458716e-05, "loss": 0.6032, "step": 3490 }, { "epoch": 0.6564141035258815, "grad_norm": 2.6135787963867188, "learning_rate": 1.0510321100917432e-05, "loss": 0.5422, "step": 3500 }, { "epoch": 0.6582895723930983, "grad_norm": 2.0830154418945312, "learning_rate": 1.0452981651376147e-05, "loss": 0.5509, "step": 3510 }, { "epoch": 0.660165041260315, "grad_norm": 2.061523675918579, "learning_rate": 1.0395642201834864e-05, "loss": 0.5258, "step": 3520 }, { "epoch": 0.6620405101275318, "grad_norm": 1.8006651401519775, "learning_rate": 1.0338302752293577e-05, "loss": 0.5546, "step": 3530 }, { "epoch": 0.6639159789947486, "grad_norm": 2.187450647354126, "learning_rate": 1.0280963302752294e-05, "loss": 0.598, "step": 3540 }, { "epoch": 0.6657914478619655, "grad_norm": 1.984383463859558, "learning_rate": 1.0223623853211008e-05, "loss": 0.5181, "step": 3550 }, { "epoch": 0.6676669167291823, "grad_norm": 2.5804004669189453, "learning_rate": 1.0166284403669725e-05, "loss": 0.4769, "step": 3560 }, { "epoch": 0.6695423855963991, "grad_norm": 2.4561312198638916, "learning_rate": 1.010894495412844e-05, "loss": 0.5985, "step": 3570 }, { "epoch": 0.6714178544636159, "grad_norm": 2.456256866455078, "learning_rate": 1.0051605504587157e-05, "loss": 0.6127, "step": 3580 }, { "epoch": 0.6732933233308327, "grad_norm": 2.1540181636810303, "learning_rate": 9.994266055045871e-06, "loss": 0.6621, "step": 3590 }, { "epoch": 0.6751687921980495, "grad_norm": 2.0861988067626953, "learning_rate": 9.936926605504586e-06, "loss": 0.5981, "step": 3600 }, { "epoch": 0.6751687921980495, "eval_loss": 0.6278895735740662, "eval_runtime": 5.5329, "eval_samples_per_second": 21.688, "eval_steps_per_second": 2.711, "step": 3600 }, { "epoch": 0.6770442610652663, "grad_norm": 2.0881967544555664, "learning_rate": 9.879587155963303e-06, "loss": 0.5829, "step": 3610 }, { "epoch": 0.6789197299324832, "grad_norm": 1.6255912780761719, "learning_rate": 9.822247706422018e-06, "loss": 0.5327, "step": 3620 }, { "epoch": 0.6807951987997, "grad_norm": 1.970249891281128, "learning_rate": 9.764908256880734e-06, "loss": 0.5841, "step": 3630 }, { "epoch": 0.6826706676669168, "grad_norm": 2.4903528690338135, "learning_rate": 9.70756880733945e-06, "loss": 0.6067, "step": 3640 }, { "epoch": 0.6845461365341335, "grad_norm": 1.9478775262832642, "learning_rate": 9.650229357798166e-06, "loss": 0.5565, "step": 3650 }, { "epoch": 0.6864216054013503, "grad_norm": 1.8559181690216064, "learning_rate": 9.592889908256881e-06, "loss": 0.5934, "step": 3660 }, { "epoch": 0.6882970742685671, "grad_norm": 1.7717185020446777, "learning_rate": 9.535550458715597e-06, "loss": 0.5408, "step": 3670 }, { "epoch": 0.6901725431357839, "grad_norm": 2.0449588298797607, "learning_rate": 9.478211009174312e-06, "loss": 0.5795, "step": 3680 }, { "epoch": 0.6920480120030007, "grad_norm": 2.3706321716308594, "learning_rate": 9.420871559633027e-06, "loss": 0.5183, "step": 3690 }, { "epoch": 0.6939234808702176, "grad_norm": 1.7281607389450073, "learning_rate": 9.363532110091744e-06, "loss": 0.494, "step": 3700 }, { "epoch": 0.6957989497374344, "grad_norm": 3.5256292819976807, "learning_rate": 9.306192660550459e-06, "loss": 0.5757, "step": 3710 }, { "epoch": 0.6976744186046512, "grad_norm": 1.58797025680542, "learning_rate": 9.248853211009175e-06, "loss": 0.5399, "step": 3720 }, { "epoch": 0.699549887471868, "grad_norm": 1.9900200366973877, "learning_rate": 9.19151376146789e-06, "loss": 0.6236, "step": 3730 }, { "epoch": 0.7014253563390848, "grad_norm": 1.7843225002288818, "learning_rate": 9.134174311926607e-06, "loss": 0.549, "step": 3740 }, { "epoch": 0.7033008252063015, "grad_norm": 1.9925148487091064, "learning_rate": 9.076834862385322e-06, "loss": 0.49, "step": 3750 }, { "epoch": 0.7051762940735183, "grad_norm": 2.0657670497894287, "learning_rate": 9.019495412844038e-06, "loss": 0.5305, "step": 3760 }, { "epoch": 0.7070517629407351, "grad_norm": 2.2417612075805664, "learning_rate": 8.962155963302753e-06, "loss": 0.6312, "step": 3770 }, { "epoch": 0.708927231807952, "grad_norm": 2.196537733078003, "learning_rate": 8.904816513761468e-06, "loss": 0.6512, "step": 3780 }, { "epoch": 0.7108027006751688, "grad_norm": 1.830484390258789, "learning_rate": 8.847477064220183e-06, "loss": 0.5919, "step": 3790 }, { "epoch": 0.7126781695423856, "grad_norm": 2.0573606491088867, "learning_rate": 8.790137614678898e-06, "loss": 0.5749, "step": 3800 }, { "epoch": 0.7126781695423856, "eval_loss": 0.6254046559333801, "eval_runtime": 5.5087, "eval_samples_per_second": 21.784, "eval_steps_per_second": 2.723, "step": 3800 }, { "epoch": 0.7145536384096024, "grad_norm": 1.9237205982208252, "learning_rate": 8.732798165137615e-06, "loss": 0.5669, "step": 3810 }, { "epoch": 0.7164291072768192, "grad_norm": 1.9309799671173096, "learning_rate": 8.67545871559633e-06, "loss": 0.4759, "step": 3820 }, { "epoch": 0.718304576144036, "grad_norm": 1.7976388931274414, "learning_rate": 8.618119266055046e-06, "loss": 0.5962, "step": 3830 }, { "epoch": 0.7201800450112528, "grad_norm": 2.3641951084136963, "learning_rate": 8.560779816513761e-06, "loss": 0.627, "step": 3840 }, { "epoch": 0.7220555138784697, "grad_norm": 1.5216801166534424, "learning_rate": 8.503440366972478e-06, "loss": 0.6098, "step": 3850 }, { "epoch": 0.7239309827456865, "grad_norm": 1.8570992946624756, "learning_rate": 8.446100917431192e-06, "loss": 0.5813, "step": 3860 }, { "epoch": 0.7258064516129032, "grad_norm": 3.1294426918029785, "learning_rate": 8.388761467889909e-06, "loss": 0.5876, "step": 3870 }, { "epoch": 0.72768192048012, "grad_norm": 2.678264617919922, "learning_rate": 8.331422018348624e-06, "loss": 0.6336, "step": 3880 }, { "epoch": 0.7295573893473368, "grad_norm": 1.5208237171173096, "learning_rate": 8.274082568807339e-06, "loss": 0.5342, "step": 3890 }, { "epoch": 0.7314328582145536, "grad_norm": 2.246694326400757, "learning_rate": 8.216743119266055e-06, "loss": 0.58, "step": 3900 }, { "epoch": 0.7333083270817704, "grad_norm": 1.5300601720809937, "learning_rate": 8.15940366972477e-06, "loss": 0.5734, "step": 3910 }, { "epoch": 0.7351837959489872, "grad_norm": 2.032264471054077, "learning_rate": 8.102064220183487e-06, "loss": 0.5499, "step": 3920 }, { "epoch": 0.7370592648162041, "grad_norm": 2.2106308937072754, "learning_rate": 8.044724770642202e-06, "loss": 0.5158, "step": 3930 }, { "epoch": 0.7389347336834209, "grad_norm": 1.91170334815979, "learning_rate": 7.987385321100918e-06, "loss": 0.6973, "step": 3940 }, { "epoch": 0.7408102025506377, "grad_norm": 1.750429391860962, "learning_rate": 7.930045871559633e-06, "loss": 0.5268, "step": 3950 }, { "epoch": 0.7426856714178545, "grad_norm": 3.0469017028808594, "learning_rate": 7.87270642201835e-06, "loss": 0.6026, "step": 3960 }, { "epoch": 0.7445611402850713, "grad_norm": 1.8385506868362427, "learning_rate": 7.815366972477065e-06, "loss": 0.6316, "step": 3970 }, { "epoch": 0.746436609152288, "grad_norm": 2.0888671875, "learning_rate": 7.75802752293578e-06, "loss": 0.6271, "step": 3980 }, { "epoch": 0.7483120780195048, "grad_norm": 2.3192808628082275, "learning_rate": 7.700688073394496e-06, "loss": 0.5864, "step": 3990 }, { "epoch": 0.7501875468867217, "grad_norm": 1.7646706104278564, "learning_rate": 7.643348623853211e-06, "loss": 0.5462, "step": 4000 }, { "epoch": 0.7501875468867217, "eval_loss": 0.6223539710044861, "eval_runtime": 5.5777, "eval_samples_per_second": 21.514, "eval_steps_per_second": 2.689, "step": 4000 }, { "epoch": 0.7520630157539385, "grad_norm": 2.0803816318511963, "learning_rate": 7.586009174311928e-06, "loss": 0.7064, "step": 4010 }, { "epoch": 0.7539384846211553, "grad_norm": 2.42698073387146, "learning_rate": 7.528669724770644e-06, "loss": 0.5476, "step": 4020 }, { "epoch": 0.7558139534883721, "grad_norm": 2.320164442062378, "learning_rate": 7.471330275229358e-06, "loss": 0.6527, "step": 4030 }, { "epoch": 0.7576894223555889, "grad_norm": 2.235037088394165, "learning_rate": 7.413990825688073e-06, "loss": 0.6471, "step": 4040 }, { "epoch": 0.7595648912228057, "grad_norm": 2.3369674682617188, "learning_rate": 7.356651376146789e-06, "loss": 0.6455, "step": 4050 }, { "epoch": 0.7614403600900225, "grad_norm": 2.7365052700042725, "learning_rate": 7.299311926605505e-06, "loss": 0.6133, "step": 4060 }, { "epoch": 0.7633158289572393, "grad_norm": 1.987430453300476, "learning_rate": 7.241972477064221e-06, "loss": 0.5892, "step": 4070 }, { "epoch": 0.7651912978244562, "grad_norm": 2.2822089195251465, "learning_rate": 7.184633027522936e-06, "loss": 0.5558, "step": 4080 }, { "epoch": 0.767066766691673, "grad_norm": 2.1317837238311768, "learning_rate": 7.127293577981651e-06, "loss": 0.5939, "step": 4090 }, { "epoch": 0.7689422355588897, "grad_norm": 2.5483336448669434, "learning_rate": 7.069954128440367e-06, "loss": 0.6171, "step": 4100 }, { "epoch": 0.7708177044261065, "grad_norm": 1.9714287519454956, "learning_rate": 7.012614678899083e-06, "loss": 0.6518, "step": 4110 }, { "epoch": 0.7726931732933233, "grad_norm": 1.9111765623092651, "learning_rate": 6.9552752293577985e-06, "loss": 0.5786, "step": 4120 }, { "epoch": 0.7745686421605401, "grad_norm": 1.9817109107971191, "learning_rate": 6.8979357798165134e-06, "loss": 0.5463, "step": 4130 }, { "epoch": 0.7764441110277569, "grad_norm": 1.834665060043335, "learning_rate": 6.840596330275229e-06, "loss": 0.5541, "step": 4140 }, { "epoch": 0.7783195798949737, "grad_norm": 2.018120765686035, "learning_rate": 6.783256880733945e-06, "loss": 0.5399, "step": 4150 }, { "epoch": 0.7801950487621906, "grad_norm": 2.5197436809539795, "learning_rate": 6.725917431192661e-06, "loss": 0.5581, "step": 4160 }, { "epoch": 0.7820705176294074, "grad_norm": 2.2083163261413574, "learning_rate": 6.6685779816513764e-06, "loss": 0.5535, "step": 4170 }, { "epoch": 0.7839459864966242, "grad_norm": 2.2999789714813232, "learning_rate": 6.611238532110092e-06, "loss": 0.5212, "step": 4180 }, { "epoch": 0.785821455363841, "grad_norm": 2.2333500385284424, "learning_rate": 6.553899082568808e-06, "loss": 0.6867, "step": 4190 }, { "epoch": 0.7876969242310577, "grad_norm": 2.5943992137908936, "learning_rate": 6.496559633027524e-06, "loss": 0.4554, "step": 4200 }, { "epoch": 0.7876969242310577, "eval_loss": 0.6219611763954163, "eval_runtime": 5.5648, "eval_samples_per_second": 21.564, "eval_steps_per_second": 2.696, "step": 4200 }, { "epoch": 0.7895723930982745, "grad_norm": 2.9401698112487793, "learning_rate": 6.4392201834862394e-06, "loss": 0.6103, "step": 4210 }, { "epoch": 0.7914478619654913, "grad_norm": 2.275641679763794, "learning_rate": 6.381880733944954e-06, "loss": 0.5384, "step": 4220 }, { "epoch": 0.7933233308327082, "grad_norm": 1.5987143516540527, "learning_rate": 6.324541284403669e-06, "loss": 0.6324, "step": 4230 }, { "epoch": 0.795198799699925, "grad_norm": 1.6601738929748535, "learning_rate": 6.267201834862385e-06, "loss": 0.5049, "step": 4240 }, { "epoch": 0.7970742685671418, "grad_norm": 2.5912208557128906, "learning_rate": 6.209862385321101e-06, "loss": 0.6013, "step": 4250 }, { "epoch": 0.7989497374343586, "grad_norm": 2.051008701324463, "learning_rate": 6.1525229357798165e-06, "loss": 0.6521, "step": 4260 }, { "epoch": 0.8008252063015754, "grad_norm": 2.331805467605591, "learning_rate": 6.095183486238532e-06, "loss": 0.5396, "step": 4270 }, { "epoch": 0.8027006751687922, "grad_norm": 2.048785924911499, "learning_rate": 6.037844036697248e-06, "loss": 0.5918, "step": 4280 }, { "epoch": 0.804576144036009, "grad_norm": 2.387164354324341, "learning_rate": 5.980504587155964e-06, "loss": 0.624, "step": 4290 }, { "epoch": 0.8064516129032258, "grad_norm": 1.7921018600463867, "learning_rate": 5.9231651376146795e-06, "loss": 0.5066, "step": 4300 }, { "epoch": 0.8083270817704427, "grad_norm": 1.3692150115966797, "learning_rate": 5.865825688073395e-06, "loss": 0.509, "step": 4310 }, { "epoch": 0.8102025506376594, "grad_norm": 1.9718056917190552, "learning_rate": 5.80848623853211e-06, "loss": 0.6208, "step": 4320 }, { "epoch": 0.8120780195048762, "grad_norm": 1.9130088090896606, "learning_rate": 5.751146788990826e-06, "loss": 0.5508, "step": 4330 }, { "epoch": 0.813953488372093, "grad_norm": 2.5534584522247314, "learning_rate": 5.693807339449541e-06, "loss": 0.6473, "step": 4340 }, { "epoch": 0.8158289572393098, "grad_norm": 2.3137259483337402, "learning_rate": 5.6364678899082565e-06, "loss": 0.5723, "step": 4350 }, { "epoch": 0.8177044261065266, "grad_norm": 2.2267236709594727, "learning_rate": 5.579128440366972e-06, "loss": 0.516, "step": 4360 }, { "epoch": 0.8195798949737434, "grad_norm": 2.8468329906463623, "learning_rate": 5.521788990825688e-06, "loss": 0.6196, "step": 4370 }, { "epoch": 0.8214553638409603, "grad_norm": 1.7340741157531738, "learning_rate": 5.464449541284404e-06, "loss": 0.5489, "step": 4380 }, { "epoch": 0.8233308327081771, "grad_norm": 1.9742332696914673, "learning_rate": 5.4071100917431195e-06, "loss": 0.585, "step": 4390 }, { "epoch": 0.8252063015753939, "grad_norm": 2.408601999282837, "learning_rate": 5.349770642201835e-06, "loss": 0.5685, "step": 4400 }, { "epoch": 0.8252063015753939, "eval_loss": 0.6204274296760559, "eval_runtime": 5.5455, "eval_samples_per_second": 21.639, "eval_steps_per_second": 2.705, "step": 4400 }, { "epoch": 0.8270817704426107, "grad_norm": 2.1270008087158203, "learning_rate": 5.292431192660551e-06, "loss": 0.5759, "step": 4410 }, { "epoch": 0.8289572393098275, "grad_norm": 2.048781156539917, "learning_rate": 5.235091743119266e-06, "loss": 0.5268, "step": 4420 }, { "epoch": 0.8308327081770442, "grad_norm": 1.643114686012268, "learning_rate": 5.177752293577982e-06, "loss": 0.5481, "step": 4430 }, { "epoch": 0.832708177044261, "grad_norm": 1.9851353168487549, "learning_rate": 5.120412844036697e-06, "loss": 0.6492, "step": 4440 }, { "epoch": 0.8345836459114778, "grad_norm": 2.3454835414886475, "learning_rate": 5.063073394495413e-06, "loss": 0.5475, "step": 4450 }, { "epoch": 0.8364591147786947, "grad_norm": 2.1236870288848877, "learning_rate": 5.005733944954129e-06, "loss": 0.4889, "step": 4460 }, { "epoch": 0.8383345836459115, "grad_norm": 2.490607738494873, "learning_rate": 4.948394495412844e-06, "loss": 0.6648, "step": 4470 }, { "epoch": 0.8402100525131283, "grad_norm": 2.781184434890747, "learning_rate": 4.8910550458715596e-06, "loss": 0.6362, "step": 4480 }, { "epoch": 0.8420855213803451, "grad_norm": 1.488677740097046, "learning_rate": 4.833715596330275e-06, "loss": 0.6213, "step": 4490 }, { "epoch": 0.8439609902475619, "grad_norm": 1.9841208457946777, "learning_rate": 4.776376146788991e-06, "loss": 0.6166, "step": 4500 }, { "epoch": 0.8458364591147787, "grad_norm": 1.4909323453903198, "learning_rate": 4.719036697247707e-06, "loss": 0.4612, "step": 4510 }, { "epoch": 0.8477119279819955, "grad_norm": 1.927198886871338, "learning_rate": 4.661697247706422e-06, "loss": 0.5697, "step": 4520 }, { "epoch": 0.8495873968492123, "grad_norm": 2.1951193809509277, "learning_rate": 4.6043577981651375e-06, "loss": 0.6029, "step": 4530 }, { "epoch": 0.8514628657164292, "grad_norm": 1.6474297046661377, "learning_rate": 4.547018348623853e-06, "loss": 0.5997, "step": 4540 }, { "epoch": 0.8533383345836459, "grad_norm": 2.8692142963409424, "learning_rate": 4.489678899082569e-06, "loss": 0.5052, "step": 4550 }, { "epoch": 0.8552138034508627, "grad_norm": 2.2251393795013428, "learning_rate": 4.432339449541285e-06, "loss": 0.5406, "step": 4560 }, { "epoch": 0.8570892723180795, "grad_norm": 1.9672750234603882, "learning_rate": 4.3750000000000005e-06, "loss": 0.6556, "step": 4570 }, { "epoch": 0.8589647411852963, "grad_norm": 1.9112441539764404, "learning_rate": 4.317660550458716e-06, "loss": 0.6307, "step": 4580 }, { "epoch": 0.8608402100525131, "grad_norm": 2.0552773475646973, "learning_rate": 4.260321100917432e-06, "loss": 0.5682, "step": 4590 }, { "epoch": 0.8627156789197299, "grad_norm": 1.927811622619629, "learning_rate": 4.202981651376147e-06, "loss": 0.5006, "step": 4600 }, { "epoch": 0.8627156789197299, "eval_loss": 0.6183449625968933, "eval_runtime": 5.5214, "eval_samples_per_second": 21.734, "eval_steps_per_second": 2.717, "step": 4600 }, { "epoch": 0.8645911477869468, "grad_norm": 2.3974733352661133, "learning_rate": 4.145642201834863e-06, "loss": 0.6954, "step": 4610 }, { "epoch": 0.8664666166541636, "grad_norm": 2.214097738265991, "learning_rate": 4.0883027522935775e-06, "loss": 0.619, "step": 4620 }, { "epoch": 0.8683420855213804, "grad_norm": 2.094970464706421, "learning_rate": 4.030963302752293e-06, "loss": 0.5883, "step": 4630 }, { "epoch": 0.8702175543885972, "grad_norm": 1.908461570739746, "learning_rate": 3.973623853211009e-06, "loss": 0.4951, "step": 4640 }, { "epoch": 0.872093023255814, "grad_norm": 2.1103639602661133, "learning_rate": 3.916284403669725e-06, "loss": 0.5969, "step": 4650 }, { "epoch": 0.8739684921230307, "grad_norm": 1.8500175476074219, "learning_rate": 3.8589449541284405e-06, "loss": 0.5824, "step": 4660 }, { "epoch": 0.8758439609902475, "grad_norm": 2.222599506378174, "learning_rate": 3.8016055045871563e-06, "loss": 0.654, "step": 4670 }, { "epoch": 0.8777194298574643, "grad_norm": 2.0447375774383545, "learning_rate": 3.744266055045872e-06, "loss": 0.5774, "step": 4680 }, { "epoch": 0.8795948987246812, "grad_norm": 2.4672482013702393, "learning_rate": 3.686926605504587e-06, "loss": 0.6135, "step": 4690 }, { "epoch": 0.881470367591898, "grad_norm": 2.1856000423431396, "learning_rate": 3.6295871559633027e-06, "loss": 0.5876, "step": 4700 }, { "epoch": 0.8833458364591148, "grad_norm": 2.358637809753418, "learning_rate": 3.5722477064220184e-06, "loss": 0.5665, "step": 4710 }, { "epoch": 0.8852213053263316, "grad_norm": 1.8287360668182373, "learning_rate": 3.514908256880734e-06, "loss": 0.5067, "step": 4720 }, { "epoch": 0.8870967741935484, "grad_norm": 2.045971155166626, "learning_rate": 3.45756880733945e-06, "loss": 0.54, "step": 4730 }, { "epoch": 0.8889722430607652, "grad_norm": 2.5090229511260986, "learning_rate": 3.4002293577981652e-06, "loss": 0.6294, "step": 4740 }, { "epoch": 0.890847711927982, "grad_norm": 2.9200639724731445, "learning_rate": 3.3428899082568806e-06, "loss": 0.5443, "step": 4750 }, { "epoch": 0.8927231807951987, "grad_norm": 2.0221188068389893, "learning_rate": 3.2855504587155963e-06, "loss": 0.628, "step": 4760 }, { "epoch": 0.8945986496624156, "grad_norm": 2.6036345958709717, "learning_rate": 3.228211009174312e-06, "loss": 0.6387, "step": 4770 }, { "epoch": 0.8964741185296324, "grad_norm": 3.309267044067383, "learning_rate": 3.170871559633028e-06, "loss": 0.5863, "step": 4780 }, { "epoch": 0.8983495873968492, "grad_norm": 3.4704477787017822, "learning_rate": 3.113532110091743e-06, "loss": 0.5955, "step": 4790 }, { "epoch": 0.900225056264066, "grad_norm": 2.056976556777954, "learning_rate": 3.056192660550459e-06, "loss": 0.5984, "step": 4800 }, { "epoch": 0.900225056264066, "eval_loss": 0.6168529987335205, "eval_runtime": 5.5639, "eval_samples_per_second": 21.568, "eval_steps_per_second": 2.696, "step": 4800 }, { "epoch": 0.9021005251312828, "grad_norm": 2.358440399169922, "learning_rate": 2.9988532110091746e-06, "loss": 0.5657, "step": 4810 }, { "epoch": 0.9039759939984996, "grad_norm": 2.124436140060425, "learning_rate": 2.94151376146789e-06, "loss": 0.642, "step": 4820 }, { "epoch": 0.9058514628657164, "grad_norm": 1.5845674276351929, "learning_rate": 2.8841743119266057e-06, "loss": 0.5054, "step": 4830 }, { "epoch": 0.9077269317329333, "grad_norm": 2.296250820159912, "learning_rate": 2.8325688073394495e-06, "loss": 0.6108, "step": 4840 }, { "epoch": 0.9096024006001501, "grad_norm": 1.7618379592895508, "learning_rate": 2.7752293577981653e-06, "loss": 0.6623, "step": 4850 }, { "epoch": 0.9114778694673669, "grad_norm": 2.3502273559570312, "learning_rate": 2.7178899082568806e-06, "loss": 0.5688, "step": 4860 }, { "epoch": 0.9133533383345837, "grad_norm": 2.141451597213745, "learning_rate": 2.6605504587155964e-06, "loss": 0.6076, "step": 4870 }, { "epoch": 0.9152288072018004, "grad_norm": 2.2488343715667725, "learning_rate": 2.603211009174312e-06, "loss": 0.6399, "step": 4880 }, { "epoch": 0.9171042760690172, "grad_norm": 2.0450565814971924, "learning_rate": 2.545871559633028e-06, "loss": 0.5864, "step": 4890 }, { "epoch": 0.918979744936234, "grad_norm": 2.490226984024048, "learning_rate": 2.488532110091743e-06, "loss": 0.5644, "step": 4900 }, { "epoch": 0.9208552138034508, "grad_norm": 2.630089282989502, "learning_rate": 2.4311926605504585e-06, "loss": 0.6637, "step": 4910 }, { "epoch": 0.9227306826706677, "grad_norm": 2.2584402561187744, "learning_rate": 2.3738532110091743e-06, "loss": 0.5784, "step": 4920 }, { "epoch": 0.9246061515378845, "grad_norm": 2.9330437183380127, "learning_rate": 2.31651376146789e-06, "loss": 0.5286, "step": 4930 }, { "epoch": 0.9264816204051013, "grad_norm": 2.6167702674865723, "learning_rate": 2.2591743119266058e-06, "loss": 0.5273, "step": 4940 }, { "epoch": 0.9283570892723181, "grad_norm": 2.414607286453247, "learning_rate": 2.201834862385321e-06, "loss": 0.5177, "step": 4950 }, { "epoch": 0.9302325581395349, "grad_norm": 2.5905508995056152, "learning_rate": 2.144495412844037e-06, "loss": 0.5818, "step": 4960 }, { "epoch": 0.9321080270067517, "grad_norm": 2.9565694332122803, "learning_rate": 2.087155963302752e-06, "loss": 0.5307, "step": 4970 }, { "epoch": 0.9339834958739685, "grad_norm": 2.3778281211853027, "learning_rate": 2.029816513761468e-06, "loss": 0.5477, "step": 4980 }, { "epoch": 0.9358589647411854, "grad_norm": 2.004302978515625, "learning_rate": 1.9724770642201837e-06, "loss": 0.55, "step": 4990 }, { "epoch": 0.9377344336084021, "grad_norm": 2.098611354827881, "learning_rate": 1.915137614678899e-06, "loss": 0.6077, "step": 5000 }, { "epoch": 0.9377344336084021, "eval_loss": 0.6168031096458435, "eval_runtime": 5.423, "eval_samples_per_second": 22.128, "eval_steps_per_second": 2.766, "step": 5000 }, { "epoch": 0.9396099024756189, "grad_norm": 2.273273229598999, "learning_rate": 1.8577981651376147e-06, "loss": 0.5192, "step": 5010 }, { "epoch": 0.9414853713428357, "grad_norm": 2.40267276763916, "learning_rate": 1.8004587155963303e-06, "loss": 0.6307, "step": 5020 }, { "epoch": 0.9433608402100525, "grad_norm": 2.205829620361328, "learning_rate": 1.743119266055046e-06, "loss": 0.5307, "step": 5030 }, { "epoch": 0.9452363090772693, "grad_norm": 2.208779811859131, "learning_rate": 1.6857798165137616e-06, "loss": 0.5816, "step": 5040 }, { "epoch": 0.9471117779444861, "grad_norm": 2.550372838973999, "learning_rate": 1.628440366972477e-06, "loss": 0.588, "step": 5050 }, { "epoch": 0.9489872468117029, "grad_norm": 2.062358856201172, "learning_rate": 1.5711009174311926e-06, "loss": 0.6606, "step": 5060 }, { "epoch": 0.9508627156789198, "grad_norm": 2.3175814151763916, "learning_rate": 1.5137614678899084e-06, "loss": 0.4878, "step": 5070 }, { "epoch": 0.9527381845461366, "grad_norm": 3.7666046619415283, "learning_rate": 1.456422018348624e-06, "loss": 0.566, "step": 5080 }, { "epoch": 0.9546136534133534, "grad_norm": 2.467745304107666, "learning_rate": 1.3990825688073395e-06, "loss": 0.6029, "step": 5090 }, { "epoch": 0.9564891222805701, "grad_norm": 1.9065784215927124, "learning_rate": 1.3417431192660552e-06, "loss": 0.5577, "step": 5100 }, { "epoch": 0.9583645911477869, "grad_norm": 2.447404623031616, "learning_rate": 1.2844036697247705e-06, "loss": 0.513, "step": 5110 }, { "epoch": 0.9602400600150037, "grad_norm": 2.8181941509246826, "learning_rate": 1.2270642201834863e-06, "loss": 0.4997, "step": 5120 }, { "epoch": 0.9621155288822205, "grad_norm": 2.414186954498291, "learning_rate": 1.169724770642202e-06, "loss": 0.5174, "step": 5130 }, { "epoch": 0.9639909977494373, "grad_norm": 2.9557716846466064, "learning_rate": 1.1123853211009173e-06, "loss": 0.5591, "step": 5140 }, { "epoch": 0.9658664666166542, "grad_norm": 1.9689189195632935, "learning_rate": 1.055045871559633e-06, "loss": 0.5722, "step": 5150 }, { "epoch": 0.967741935483871, "grad_norm": 2.1190686225891113, "learning_rate": 9.977064220183486e-07, "loss": 0.558, "step": 5160 }, { "epoch": 0.9696174043510878, "grad_norm": 2.7399091720581055, "learning_rate": 9.403669724770642e-07, "loss": 0.5672, "step": 5170 }, { "epoch": 0.9714928732183046, "grad_norm": 2.2235541343688965, "learning_rate": 8.830275229357798e-07, "loss": 0.5904, "step": 5180 }, { "epoch": 0.9733683420855214, "grad_norm": 2.248394727706909, "learning_rate": 8.256880733944955e-07, "loss": 0.5505, "step": 5190 }, { "epoch": 0.9752438109527382, "grad_norm": 2.1956896781921387, "learning_rate": 7.68348623853211e-07, "loss": 0.596, "step": 5200 }, { "epoch": 0.9752438109527382, "eval_loss": 0.6171349287033081, "eval_runtime": 5.3961, "eval_samples_per_second": 22.238, "eval_steps_per_second": 2.78, "step": 5200 } ], "logging_steps": 10, "max_steps": 5332, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.618912552812544e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }