|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 3782, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00026441036488630354, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 5.277044854881267e-07, |
|
"loss": 1.8784, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0013220518244315177, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 2.6385224274406333e-06, |
|
"loss": 1.8748, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0026441036488630354, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 5.2770448548812665e-06, |
|
"loss": 1.8755, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.003966155473294554, |
|
"grad_norm": 3.5, |
|
"learning_rate": 7.915567282321901e-06, |
|
"loss": 1.8741, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.005288207297726071, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 1.0554089709762533e-05, |
|
"loss": 1.8644, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.006610259122157589, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 1.3192612137203167e-05, |
|
"loss": 1.8583, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.007932310946589107, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.5831134564643802e-05, |
|
"loss": 1.8416, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.009254362771020624, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.8469656992084432e-05, |
|
"loss": 1.8236, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.010576414595452142, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 2.1108179419525066e-05, |
|
"loss": 1.8066, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.011898466419883659, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 2.37467018469657e-05, |
|
"loss": 1.7951, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.013220518244315178, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 2.6385224274406334e-05, |
|
"loss": 1.7783, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.014542570068746695, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 2.9023746701846964e-05, |
|
"loss": 1.7577, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.015864621893178214, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 3.1662269129287604e-05, |
|
"loss": 1.7413, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.01718667371760973, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 3.430079155672823e-05, |
|
"loss": 1.7217, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.01850872554204125, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 3.6939313984168865e-05, |
|
"loss": 1.7064, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.019830777366472766, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 3.95778364116095e-05, |
|
"loss": 1.6914, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.021152829190904283, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 4.221635883905013e-05, |
|
"loss": 1.679, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0224748810153358, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 4.4854881266490766e-05, |
|
"loss": 1.6632, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.023796932839767318, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 4.74934036939314e-05, |
|
"loss": 1.6472, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.025118984664198835, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 5.0131926121372033e-05, |
|
"loss": 1.6343, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.026441036488630356, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 5.277044854881267e-05, |
|
"loss": 1.6313, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.027763088313061873, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 5.540897097625331e-05, |
|
"loss": 1.6094, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.02908514013749339, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 5.804749340369393e-05, |
|
"loss": 1.5984, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.030407191961924908, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 6.068601583113457e-05, |
|
"loss": 1.5843, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.03172924378635643, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 6.332453825857521e-05, |
|
"loss": 1.5689, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.03305129561078794, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 6.596306068601583e-05, |
|
"loss": 1.5575, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.03437334743521946, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 6.860158311345646e-05, |
|
"loss": 1.5451, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.03569539925965098, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 7.124010554089711e-05, |
|
"loss": 1.5333, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.0370174510840825, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 7.387862796833773e-05, |
|
"loss": 1.5237, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.03833950290851401, |
|
"grad_norm": 0.5, |
|
"learning_rate": 7.651715039577836e-05, |
|
"loss": 1.5105, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.03966155473294553, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 7.9155672823219e-05, |
|
"loss": 1.4946, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.040983606557377046, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 8.179419525065963e-05, |
|
"loss": 1.4843, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.04230565838180857, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 8.443271767810026e-05, |
|
"loss": 1.4732, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.04362771020624009, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 8.70712401055409e-05, |
|
"loss": 1.4611, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.0449497620306716, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 8.970976253298153e-05, |
|
"loss": 1.4485, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.04627181385510312, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 9.234828496042217e-05, |
|
"loss": 1.4406, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.047593865679534636, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 9.49868073878628e-05, |
|
"loss": 1.4236, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.04891591750396616, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 9.762532981530343e-05, |
|
"loss": 1.4107, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.05023796932839767, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.00010026385224274407, |
|
"loss": 1.4052, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.05156002115282919, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.00010290237467018471, |
|
"loss": 1.3859, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.05288207297726071, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00010554089709762533, |
|
"loss": 1.3742, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.054204124801692226, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.00010817941952506597, |
|
"loss": 1.365, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.05552617662612375, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.00011081794195250662, |
|
"loss": 1.3482, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.05684822845055526, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.00011345646437994724, |
|
"loss": 1.3375, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.05817028027498678, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 0.00011609498680738786, |
|
"loss": 1.333, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.059492332099418295, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.00011873350923482852, |
|
"loss": 1.3238, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.060814383923849816, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.00012137203166226914, |
|
"loss": 1.315, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.06213643574828133, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00012401055408970977, |
|
"loss": 1.3028, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.06345848757271286, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.00012664907651715042, |
|
"loss": 1.2869, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.06478053939714437, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.00012928759894459104, |
|
"loss": 1.281, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.06610259122157588, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00013192612137203166, |
|
"loss": 1.2747, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.0674246430460074, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.0001345646437994723, |
|
"loss": 1.2697, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.06874669487043893, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.00013720316622691292, |
|
"loss": 1.2549, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.07006874669487044, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.00013984168865435357, |
|
"loss": 1.246, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.07139079851930195, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.00014248021108179422, |
|
"loss": 1.2412, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.07271285034373347, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.00014511873350923484, |
|
"loss": 1.2343, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.074034902168165, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.00014775725593667546, |
|
"loss": 1.2205, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.07535695399259651, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 0.00015039577836411608, |
|
"loss": 1.2203, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.07667900581702802, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.00015303430079155673, |
|
"loss": 1.2122, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.07800105764145955, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.00015567282321899737, |
|
"loss": 1.2035, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.07932310946589106, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.000158311345646438, |
|
"loss": 1.1933, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.08064516129032258, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.00016094986807387864, |
|
"loss": 1.1926, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.08196721311475409, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.00016358839050131926, |
|
"loss": 1.1833, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.08328926493918562, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.00016622691292875988, |
|
"loss": 1.1753, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.08461131676361713, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.00016886543535620053, |
|
"loss": 1.1717, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.08593336858804865, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.00017150395778364118, |
|
"loss": 1.1684, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.08725542041248018, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.0001741424802110818, |
|
"loss": 1.1608, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.08857747223691169, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.00017678100263852244, |
|
"loss": 1.1563, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.0898995240613432, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.00017941952506596306, |
|
"loss": 1.1501, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.09122157588577472, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00018205804749340368, |
|
"loss": 1.1451, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.09254362771020624, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.00018469656992084433, |
|
"loss": 1.1338, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.09386567953463776, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.00018733509234828498, |
|
"loss": 1.1326, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.09518773135906927, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.0001899736147757256, |
|
"loss": 1.1286, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.0965097831835008, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.00019261213720316625, |
|
"loss": 1.1219, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.09783183500793231, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00019525065963060687, |
|
"loss": 1.1238, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.09915388683236383, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.00019788918205804749, |
|
"loss": 1.1165, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.10047593865679534, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.00019999995738663592, |
|
"loss": 1.1111, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.10179799048122687, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.00019999846592270624, |
|
"loss": 1.0969, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.10312004230565838, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.00019999484382688995, |
|
"loss": 1.1008, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.1044420941300899, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.00019998909117636182, |
|
"loss": 1.0969, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.10576414595452142, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.00019998120809369154, |
|
"loss": 1.0888, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.10708619777895294, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.0001999711947468411, |
|
"loss": 1.0926, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.10840824960338445, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.00019995905134916143, |
|
"loss": 1.08, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.10973030142781597, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.0001999447781593875, |
|
"loss": 1.0778, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.1110523532522475, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.00019992837548163316, |
|
"loss": 1.0771, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.11237440507667901, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.00019990984366538442, |
|
"loss": 1.0722, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.11369645690111052, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.00019988918310549222, |
|
"loss": 1.0628, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.11501850872554203, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 0.0001998663942421637, |
|
"loss": 1.0611, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.11634056054997356, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00019984147756095308, |
|
"loss": 1.0627, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.11766261237440508, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.0001998144335927513, |
|
"loss": 1.0562, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.11898466419883659, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 0.00019978526291377464, |
|
"loss": 1.0543, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.12030671602326812, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.0001997539661455524, |
|
"loss": 1.0532, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.12162876784769963, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 0.00019972054395491368, |
|
"loss": 1.0477, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.12295081967213115, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.00019968499705397331, |
|
"loss": 1.044, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.12427287149656266, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.00019964732620011651, |
|
"loss": 1.0451, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.12559492332099417, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.00019960753219598281, |
|
"loss": 1.0385, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.12691697514542571, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.00019956561588944897, |
|
"loss": 1.0376, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.12823902696985723, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 0.00019952157817361088, |
|
"loss": 1.0396, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.12956107879428874, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.0001994754199867645, |
|
"loss": 1.0342, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.13088313061872026, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.00019942714231238604, |
|
"loss": 1.036, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.13220518244315177, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.00019937674617911077, |
|
"loss": 1.0231, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.13352723426758328, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 0.00019932423266071122, |
|
"loss": 1.0286, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.1348492860920148, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00019926960287607436, |
|
"loss": 1.0238, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.1361713379164463, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 0.0001992128579891777, |
|
"loss": 1.0216, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.13749338974087785, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00019915399920906432, |
|
"loss": 1.0237, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.13881544156530937, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.0001990930277898175, |
|
"loss": 1.0137, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.14013749338974088, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.00019902994503053363, |
|
"loss": 1.0206, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.1414595452141724, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.0001989647522752947, |
|
"loss": 1.0189, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.1427815970386039, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.0001988974509131397, |
|
"loss": 1.0109, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.14410364886303542, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00019882804237803488, |
|
"loss": 1.0069, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.14542570068746694, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 0.00019875652814884326, |
|
"loss": 1.0097, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.14674775251189848, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.00019868290974929328, |
|
"loss": 1.0033, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.14806980433633, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 0.00019860718874794602, |
|
"loss": 1.0034, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.1493918561607615, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.00019852936675816209, |
|
"loss": 1.0019, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.15071390798519302, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.000198449445438067, |
|
"loss": 1.0004, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.15203595980962453, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00019836742649051602, |
|
"loss": 0.998, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.15335801163405605, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00019828331166305785, |
|
"loss": 0.9944, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.15468006345848756, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.00019819710274789727, |
|
"loss": 0.9951, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.1560021152829191, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.0001981088015818571, |
|
"loss": 0.9962, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.15732416710735062, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.00019801841004633906, |
|
"loss": 0.9884, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.15864621893178213, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.00019792593006728347, |
|
"loss": 0.9918, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.15996827075621364, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.00019783136361512858, |
|
"loss": 0.993, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.16129032258064516, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.00019773471270476822, |
|
"loss": 0.9902, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.16261237440507667, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.0001976359793955091, |
|
"loss": 0.9843, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.16393442622950818, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0001975351657910269, |
|
"loss": 0.988, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.16525647805393973, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00019743227403932134, |
|
"loss": 0.9874, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.16657852987837124, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00019732730633267056, |
|
"loss": 0.9782, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.16790058170280275, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.00019722026490758424, |
|
"loss": 0.9813, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.16922263352723427, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.00019711115204475616, |
|
"loss": 0.9793, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.17054468535166578, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.0001969999700690154, |
|
"loss": 0.9749, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.1718667371760973, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.00019688672134927693, |
|
"loss": 0.9768, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.1731887890005288, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.00019677140829849112, |
|
"loss": 0.973, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.17451084082496035, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.00019665403337359227, |
|
"loss": 0.972, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.17583289264939186, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00019653459907544634, |
|
"loss": 0.9759, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.17715494447382338, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.0001964131079487976, |
|
"loss": 0.9691, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.1784769962982549, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.0001962895625822144, |
|
"loss": 0.9681, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.1797990481226864, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.0001961639656080342, |
|
"loss": 0.9695, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.18112109994711792, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.00019603631970230713, |
|
"loss": 0.9695, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.18244315177154943, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.00019590662758473934, |
|
"loss": 0.9685, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.18376520359598097, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 0.0001957748920186348, |
|
"loss": 0.9681, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.1850872554204125, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 0.00019564111581083657, |
|
"loss": 0.9632, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.186409307244844, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00019550530181166692, |
|
"loss": 0.9683, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.18773135906927552, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.0001953674529148666, |
|
"loss": 0.9628, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.18905341089370703, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.00019522757205753323, |
|
"loss": 0.9625, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.19037546271813854, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00019508566222005866, |
|
"loss": 0.9583, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.19169751454257006, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00019494172642606553, |
|
"loss": 0.9634, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.1930195663670016, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 0.00019479576774234284, |
|
"loss": 0.9602, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.1943416181914331, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00019464778927878048, |
|
"loss": 0.967, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.19566367001586463, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.00019449779418830322, |
|
"loss": 0.9549, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.19698572184029614, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0001943457856668033, |
|
"loss": 0.9652, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.19830777366472765, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 0.00019419176695307245, |
|
"loss": 0.9541, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.19962982548915917, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 0.00019403574132873276, |
|
"loss": 0.9506, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.20095187731359068, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.00019387771211816705, |
|
"loss": 0.9572, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.20227392913802222, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 0.00019371768268844762, |
|
"loss": 0.9525, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.20359598096245374, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0001935556564492649, |
|
"loss": 0.9542, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.20491803278688525, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 0.0001933916368528545, |
|
"loss": 0.9532, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.20624008461131677, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00019322562739392394, |
|
"loss": 0.9511, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.20756213643574828, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 0.00019305763160957788, |
|
"loss": 0.9536, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.2088841882601798, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00019288765307924299, |
|
"loss": 0.9511, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.2102062400846113, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00019271569542459165, |
|
"loss": 0.9497, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.21152829190904285, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.00019254176230946462, |
|
"loss": 0.9484, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.21285034373347436, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.00019236585743979322, |
|
"loss": 0.9442, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.21417239555790588, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.00019218798456352012, |
|
"loss": 0.9426, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.2154944473823374, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.00019200814747051976, |
|
"loss": 0.9455, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.2168164992067689, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00019182634999251728, |
|
"loss": 0.9419, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.21813855103120042, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00019164259600300723, |
|
"loss": 0.947, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.21946060285563193, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00019145688941717075, |
|
"loss": 0.9478, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.22078265468006344, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.0001912692341917922, |
|
"loss": 0.9433, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.222104706504495, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00019107963432517505, |
|
"loss": 0.9461, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.2234267583289265, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 0.00019088809385705646, |
|
"loss": 0.9421, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.22474881015335801, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 0.00019069461686852134, |
|
"loss": 0.9436, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.22607086197778953, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.0001904992074819153, |
|
"loss": 0.9381, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.22739291380222104, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 0.00019030186986075703, |
|
"loss": 0.9408, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.22871496562665256, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.0001901026082096492, |
|
"loss": 0.9371, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.23003701745108407, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.00018990142677418923, |
|
"loss": 0.9387, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.2313590692755156, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.00018969832984087873, |
|
"loss": 0.9372, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.23268112109994712, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00018949332173703206, |
|
"loss": 0.936, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.23400317292437864, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0001892864068306843, |
|
"loss": 0.9405, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.23532522474881015, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.00018907758953049805, |
|
"loss": 0.9333, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.23664727657324167, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 0.00018886687428566954, |
|
"loss": 0.9356, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.23796932839767318, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.00018865426558583383, |
|
"loss": 0.9344, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.2392913802221047, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00018843976796096917, |
|
"loss": 0.9355, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.24061343204653624, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.00018822338598130047, |
|
"loss": 0.9336, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.24193548387096775, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 0.0001880051242572019, |
|
"loss": 0.9304, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.24325753569539926, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.00018778498743909873, |
|
"loss": 0.931, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.24457958751983078, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.00018756298021736808, |
|
"loss": 0.934, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.2459016393442623, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00018733910732223925, |
|
"loss": 0.9273, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.2472236911686938, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00018711337352369264, |
|
"loss": 0.9315, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.24854574299312532, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0001868857836313584, |
|
"loss": 0.9316, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.24986779481755686, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.00018665634249441366, |
|
"loss": 0.9319, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.25118984664198835, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.0001864250550014795, |
|
"loss": 0.9297, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.25251189846641986, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.00018619192608051655, |
|
"loss": 0.9237, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.25383395029085143, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.00018595696069872013, |
|
"loss": 0.9293, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.25515600211528294, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00018572016386241442, |
|
"loss": 0.9308, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.25647805393971446, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00018548154061694572, |
|
"loss": 0.9207, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.25780010576414597, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.00018524109604657496, |
|
"loss": 0.9285, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.2591221575885775, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00018499883527436947, |
|
"loss": 0.9268, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.260444209413009, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0001847547634620936, |
|
"loss": 0.9284, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.2617662612374405, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 0.00018450888581009908, |
|
"loss": 0.9264, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.263088313061872, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.00018426120755721386, |
|
"loss": 0.926, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.26441036488630354, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.0001840117339806308, |
|
"loss": 0.9218, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.26573241671073505, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00018376047039579495, |
|
"loss": 0.9245, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.26705446853516657, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.00018350742215629055, |
|
"loss": 0.9221, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.2683765203595981, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.00018325259465372677, |
|
"loss": 0.926, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.2696985721840296, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 0.0001829959933176229, |
|
"loss": 0.9235, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.2710206240084611, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.00018273762361529274, |
|
"loss": 0.9237, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.2723426758328926, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 0.000182477491051728, |
|
"loss": 0.9203, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.2736647276573242, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.00018221560116948103, |
|
"loss": 0.9206, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.2749867794817557, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00018195195954854676, |
|
"loss": 0.924, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.2763088313061872, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.00018168657180624384, |
|
"loss": 0.918, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.27763088313061873, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.0001814194435970949, |
|
"loss": 0.9298, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.27895293495505025, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.00018115058061270598, |
|
"loss": 0.9195, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.28027498677948176, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.0001808799885816455, |
|
"loss": 0.9196, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.2815970386039133, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 0.00018060767326932194, |
|
"loss": 0.9154, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.2829190904283448, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.00018033364047786128, |
|
"loss": 0.9186, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.2842411422527763, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 0.00018005789604598303, |
|
"loss": 0.9173, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.2855631940772078, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 0.0001797804458488762, |
|
"loss": 0.9199, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.28688524590163933, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.00017950129579807374, |
|
"loss": 0.9202, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.28820729772607084, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.00017922045184132698, |
|
"loss": 0.922, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.28952934955050236, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.00017893791996247856, |
|
"loss": 0.9157, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.29085140137493387, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.00017865370618133511, |
|
"loss": 0.9145, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.29217345319936544, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00017836781655353905, |
|
"loss": 0.9114, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.29349550502379695, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00017808025717043938, |
|
"loss": 0.9128, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.29481755684822847, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 0.00017779103415896193, |
|
"loss": 0.9182, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.29613960867266, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.000177500153681479, |
|
"loss": 0.9127, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.2974616604970915, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.00017720762193567787, |
|
"loss": 0.9122, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.298783712321523, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.00017691344515442879, |
|
"loss": 0.9117, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.3001057641459545, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 0.00017661762960565223, |
|
"loss": 0.9097, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.30142781597038604, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.0001763201815921853, |
|
"loss": 0.9085, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.30274986779481755, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 0.0001760211074516474, |
|
"loss": 0.915, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.30407191961924906, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00017572041355630536, |
|
"loss": 0.9145, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.3053939714436806, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 0.00017541810631293742, |
|
"loss": 0.9144, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.3067160232681121, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.00017511419216269695, |
|
"loss": 0.9129, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.3080380750925436, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 0.00017480867758097506, |
|
"loss": 0.9126, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.3093601269169751, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 0.00017450156907726272, |
|
"loss": 0.9139, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.3106821787414067, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 0.00017419287319501197, |
|
"loss": 0.9064, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.3120042305658382, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.00017388259651149673, |
|
"loss": 0.9165, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.3133262823902697, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.00017357074563767225, |
|
"loss": 0.9102, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.31464833421470123, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 0.00017325732721803466, |
|
"loss": 0.9102, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.31597038603913274, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.0001729423479304792, |
|
"loss": 0.9092, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.31729243786356426, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00017262581448615794, |
|
"loss": 0.9076, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.31861448968799577, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00017230773362933687, |
|
"loss": 0.9073, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.3199365415124273, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0001719881121372521, |
|
"loss": 0.9074, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.3212585933368588, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 0.00017166695681996555, |
|
"loss": 0.912, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.3225806451612903, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.00017134427452021974, |
|
"loss": 0.9061, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.3239026969857218, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00017102007211329214, |
|
"loss": 0.9084, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.32522474881015334, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.0001706943565068486, |
|
"loss": 0.9091, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.32654680063458486, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 0.00017036713464079612, |
|
"loss": 0.9092, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.32786885245901637, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.0001700384134871351, |
|
"loss": 0.9069, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.32919090428344794, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00016970820004981067, |
|
"loss": 0.9082, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.33051295610787945, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00016937650136456358, |
|
"loss": 0.9086, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.33183500793231097, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 0.0001690433244987802, |
|
"loss": 0.9041, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.3331570597567425, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.000168708676551342, |
|
"loss": 0.9098, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.334479111581174, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00016837256465247418, |
|
"loss": 0.9024, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.3358011634056055, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 0.00016803499596359392, |
|
"loss": 0.9061, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.337123215230037, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 0.0001676959776771577, |
|
"loss": 0.9094, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.33844526705446853, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 0.00016735551701650803, |
|
"loss": 0.9035, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.33976731887890005, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00016701362123571959, |
|
"loss": 0.9071, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.34108937070333156, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.00016667029761944466, |
|
"loss": 0.9064, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.3424114225277631, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 0.00016632555348275788, |
|
"loss": 0.9043, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.3437334743521946, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00016597939617100046, |
|
"loss": 0.9039, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.3450555261766261, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.00016563183305962363, |
|
"loss": 0.9042, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.3463775780010576, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0001652828715540314, |
|
"loss": 0.9026, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.34769962982548913, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00016493251908942302, |
|
"loss": 0.9043, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.3490216816499207, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.0001645807831306343, |
|
"loss": 0.9035, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.3503437334743522, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 0.00016422767117197867, |
|
"loss": 0.9016, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.35166578529878373, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0001638731907370876, |
|
"loss": 0.8989, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.35298783712321524, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00016351734937875007, |
|
"loss": 0.8979, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.35430988894764676, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00016316015467875188, |
|
"loss": 0.904, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.35563194077207827, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00016280161424771396, |
|
"loss": 0.8991, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.3569539925965098, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0001624417357249302, |
|
"loss": 0.901, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.3582760444209413, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00016208052677820484, |
|
"loss": 0.904, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.3595980962453728, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 0.00016171799510368895, |
|
"loss": 0.8977, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.3609201480698043, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00016135414842571643, |
|
"loss": 0.9002, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.36224219989423584, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0001609889944966396, |
|
"loss": 0.8996, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.36356425171866735, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0001606225410966638, |
|
"loss": 0.9047, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.36488630354309887, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0001602547960336819, |
|
"loss": 0.9004, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.3662083553675304, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00015988576714310755, |
|
"loss": 0.902, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.36753040719196195, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.00015951546228770868, |
|
"loss": 0.8999, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.36885245901639346, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 0.0001591438893574396, |
|
"loss": 0.9047, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.370174510840825, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.0001587710562692731, |
|
"loss": 0.8982, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.3714965626652565, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.00015839697096703176, |
|
"loss": 0.9009, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.372818614489688, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.00015802164142121854, |
|
"loss": 0.9023, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.3741406663141195, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.0001576450756288471, |
|
"loss": 0.9018, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.37546271813855103, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0001572672816132714, |
|
"loss": 0.9006, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.37678476996298255, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 0.00015688826742401465, |
|
"loss": 0.8996, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.37810682178741406, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 0.00015650804113659793, |
|
"loss": 0.8957, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.3794288736118456, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.00015612661085236807, |
|
"loss": 0.901, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.3807509254362771, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 0.00015574398469832493, |
|
"loss": 0.8984, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.3820729772607086, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00015536017082694846, |
|
"loss": 0.8983, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.3833950290851401, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 0.00015497517741602486, |
|
"loss": 0.8992, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.38471708090957163, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 0.00015458901266847232, |
|
"loss": 0.8988, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.3860391327340032, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.0001542016848121663, |
|
"loss": 0.8982, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.3873611845584347, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00015381320209976426, |
|
"loss": 0.9009, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.3886832363828662, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0001534235728085297, |
|
"loss": 0.9006, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.39000528820729774, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 0.00015303280524015585, |
|
"loss": 0.9001, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.39132734003172925, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 0.0001526409077205889, |
|
"loss": 0.894, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.39264939185616077, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 0.00015224788859985043, |
|
"loss": 0.8955, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.3939714436805923, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00015185375625185964, |
|
"loss": 0.902, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.3952934955050238, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00015145851907425484, |
|
"loss": 0.8995, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.3966155473294553, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 0.0001510621854882145, |
|
"loss": 0.8997, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.3979375991538868, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 0.00015066476393827799, |
|
"loss": 0.8915, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.39925965097831834, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 0.00015026626289216542, |
|
"loss": 0.8916, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.40058170280274985, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0001498666908405975, |
|
"loss": 0.8961, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.40190375462718136, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00014946605629711425, |
|
"loss": 0.8991, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.4032258064516129, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00014906436779789402, |
|
"loss": 0.8932, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.40454785827604445, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00014866163390157136, |
|
"loss": 0.8952, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.40586991010047596, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00014825786318905469, |
|
"loss": 0.893, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.4071919619249075, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.00014785306426334354, |
|
"loss": 0.8961, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.408514013749339, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 0.0001474472457493452, |
|
"loss": 0.8916, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.4098360655737705, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.00014704041629369106, |
|
"loss": 0.9021, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.411158117398202, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00014663258456455212, |
|
"loss": 0.8992, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.41248016922263353, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.00014622375925145455, |
|
"loss": 0.8926, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.41380222104706504, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.0001458139490650945, |
|
"loss": 0.894, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.41512427287149656, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0001454031627371524, |
|
"loss": 0.8948, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.41644632469592807, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.000144991409020107, |
|
"loss": 0.8933, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.4177683765203596, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00014457869668704895, |
|
"loss": 0.9002, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.4190904283447911, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 0.0001441650345314936, |
|
"loss": 0.8961, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.4204124801692226, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00014375043136719402, |
|
"loss": 0.8939, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.4217345319936541, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 0.00014333489602795293, |
|
"loss": 0.8915, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.4230565838180857, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.0001429184373674346, |
|
"loss": 0.8952, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.4243786356425172, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 0.0001425010642589762, |
|
"loss": 0.8931, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.4257006874669487, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0001420827855953986, |
|
"loss": 0.891, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.42702273929138024, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00014166361028881716, |
|
"loss": 0.891, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.42834479111581175, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 0.00014124354727045163, |
|
"loss": 0.8917, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.42966684294024327, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 0.00014082260549043592, |
|
"loss": 0.8897, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.4309888947646748, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.0001404007939176274, |
|
"loss": 0.895, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.4323109465891063, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 0.00013997812153941584, |
|
"loss": 0.8965, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.4336329984135378, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 0.00013955459736153183, |
|
"loss": 0.8954, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.4349550502379693, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 0.00013913023040785502, |
|
"loss": 0.8982, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.43627710206240083, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00013870502972022173, |
|
"loss": 0.8906, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.43759915388683235, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00013827900435823236, |
|
"loss": 0.8945, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.43892120571126386, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00013785216339905838, |
|
"loss": 0.8904, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.4402432575356954, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.0001374245159372489, |
|
"loss": 0.8924, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.4415653093601269, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.00013699607108453684, |
|
"loss": 0.8855, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.44288736118455846, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.00013656683796964495, |
|
"loss": 0.8914, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.44420941300899, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.00013613682573809113, |
|
"loss": 0.8931, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.4455314648334215, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.0001357060435519936, |
|
"loss": 0.8926, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.446853516657853, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.00013527450058987585, |
|
"loss": 0.8915, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.4481755684822845, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 0.0001348422060464709, |
|
"loss": 0.8878, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.44949762030671603, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 0.00013440916913252536, |
|
"loss": 0.89, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.45081967213114754, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 0.0001339753990746034, |
|
"loss": 0.8881, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.45214172395557906, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00013354090511488994, |
|
"loss": 0.8911, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.45346377578001057, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00013310569651099388, |
|
"loss": 0.8904, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.4547858276044421, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 0.00013266978253575074, |
|
"loss": 0.8902, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.4561078794288736, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00013223317247702517, |
|
"loss": 0.8923, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.4574299312533051, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 0.000131795875637513, |
|
"loss": 0.8918, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.4587519830777366, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00013135790133454305, |
|
"loss": 0.8906, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.46007403490216814, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00013091925889987865, |
|
"loss": 0.8894, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.4613960867265997, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 0.00013047995767951883, |
|
"loss": 0.8945, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.4627181385510312, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00013004000703349891, |
|
"loss": 0.8918, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.46404019037546274, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00012959941633569153, |
|
"loss": 0.8951, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.46536224219989425, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 0.00012915819497360658, |
|
"loss": 0.8887, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.46668429402432576, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.00012871635234819136, |
|
"loss": 0.8897, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.4680063458487573, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.00012827389787363006, |
|
"loss": 0.892, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.4693283976731888, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 0.00012783084097714347, |
|
"loss": 0.8879, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.4706504494976203, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 0.00012738719109878794, |
|
"loss": 0.891, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.4719725013220518, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00012694295769125424, |
|
"loss": 0.8906, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.47329455314648333, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0001264981502196662, |
|
"loss": 0.8911, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.47461660497091485, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.000126052778161379, |
|
"loss": 0.895, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.47593865679534636, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 0.00012560685100577738, |
|
"loss": 0.8892, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.4772607086197779, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0001251603782540732, |
|
"loss": 0.8869, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.4785827604442094, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 0.0001247133694191032, |
|
"loss": 0.8881, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.47990481226864096, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 0.0001242658340251263, |
|
"loss": 0.8821, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.48122686409307247, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 0.00012381778160762058, |
|
"loss": 0.8936, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.482548915917504, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 0.00012336922171308018, |
|
"loss": 0.8916, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.4838709677419355, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 0.00012292016389881184, |
|
"loss": 0.8879, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.485193019566367, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 0.00012247061773273135, |
|
"loss": 0.8883, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.4865150713907985, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00012202059279315954, |
|
"loss": 0.8888, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.48783712321523004, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.00012157009866861842, |
|
"loss": 0.8884, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.48915917503966155, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 0.00012111914495762669, |
|
"loss": 0.8899, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.49048122686409307, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.00012066774126849529, |
|
"loss": 0.8879, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.4918032786885246, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.00012021589721912277, |
|
"loss": 0.8925, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.4931253305129561, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00011976362243679014, |
|
"loss": 0.8885, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.4944473823373876, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 0.00011931092655795597, |
|
"loss": 0.8869, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.4957694341618191, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.00011885781922805101, |
|
"loss": 0.8892, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.49709148598625064, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 0.0001184043101012726, |
|
"loss": 0.8913, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.4984135378106822, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 0.000117950408840379, |
|
"loss": 0.8877, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.4997355896351137, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 0.00011749612511648362, |
|
"loss": 0.8914, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.5010576414595452, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0001170414686088488, |
|
"loss": 0.8906, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.5023796932839767, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.00011658644900467965, |
|
"loss": 0.8923, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.5037017451084083, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00011613107599891775, |
|
"loss": 0.8924, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.5050237969328397, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.00011567535929403435, |
|
"loss": 0.8869, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.5063458487572713, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 0.00011521930859982382, |
|
"loss": 0.8896, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.5076679005817029, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 0.00011476293363319675, |
|
"loss": 0.8869, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.5089899524061343, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 0.0001143062441179729, |
|
"loss": 0.8869, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.5103120042305659, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 0.00011384924978467397, |
|
"loss": 0.8854, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.5116340560549973, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 0.00011339196037031627, |
|
"loss": 0.8905, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.5129561078794289, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 0.00011293438561820343, |
|
"loss": 0.8885, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.5142781597038604, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 0.00011247653527771847, |
|
"loss": 0.8869, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.5156002115282919, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 0.00011201841910411652, |
|
"loss": 0.8873, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.5169222633527234, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 0.00011156004685831648, |
|
"loss": 0.893, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.518244315177155, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0001111014283066935, |
|
"loss": 0.8841, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.5195663670015864, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0001106425732208705, |
|
"loss": 0.8796, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.520888418826018, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0001101834913775103, |
|
"loss": 0.8917, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.5222104706504495, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.00010972419255810704, |
|
"loss": 0.8918, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.523532522474881, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.000109264686548778, |
|
"loss": 0.8866, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.5248545742993125, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 0.00010880498314005488, |
|
"loss": 0.8911, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.526176626123744, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00010834509212667537, |
|
"loss": 0.8856, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.5274986779481756, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00010788502330737438, |
|
"loss": 0.8899, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.5288207297726071, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 0.00010742478648467522, |
|
"loss": 0.8883, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5301427815970386, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 0.00010696439146468085, |
|
"loss": 0.8899, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.5314648334214701, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 0.00010650384805686482, |
|
"loss": 0.8894, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.5327868852459017, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 0.0001060431660738624, |
|
"loss": 0.8882, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.5341089370703331, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 0.00010558235533126142, |
|
"loss": 0.8855, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.5354309888947647, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 0.00010512142564739302, |
|
"loss": 0.8841, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.5367530407191962, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 0.00010466038684312278, |
|
"loss": 0.8912, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.5380750925436277, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 0.0001041992487416411, |
|
"loss": 0.8883, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.5393971443680592, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 0.00010373802116825418, |
|
"loss": 0.8863, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.5407191961924908, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00010327671395017451, |
|
"loss": 0.8831, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.5420412480169222, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00010281533691631155, |
|
"loss": 0.8831, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.5433632998413538, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 0.00010235389989706232, |
|
"loss": 0.8888, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.5446853516657852, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 0.0001018924127241019, |
|
"loss": 0.893, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.5460074034902168, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 0.000101430885230174, |
|
"loss": 0.8872, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.5473294553146484, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00010096932724888146, |
|
"loss": 0.888, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.5486515071390798, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 0.00010050774861447662, |
|
"loss": 0.8897, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.5499735589635114, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 0.00010004615916165191, |
|
"loss": 0.8865, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.5512956107879429, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 9.958456872533031e-05, |
|
"loss": 0.8856, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.5526176626123744, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 9.91229871404557e-05, |
|
"loss": 0.8828, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.5539397144368059, |
|
"grad_norm": 0.158203125, |
|
"learning_rate": 9.866142424178341e-05, |
|
"loss": 0.8896, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.5552617662612375, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 9.819988986367053e-05, |
|
"loss": 0.8845, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.5565838180856689, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 9.773839383986664e-05, |
|
"loss": 0.8877, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.5579058699101005, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 9.727694600330395e-05, |
|
"loss": 0.8835, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.559227921734532, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 9.681555618588808e-05, |
|
"loss": 0.8855, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.5605499735589635, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 9.635423421828835e-05, |
|
"loss": 0.8838, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.561872025383395, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 9.589298992972857e-05, |
|
"loss": 0.8901, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.5631940772078265, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 9.543183314777732e-05, |
|
"loss": 0.888, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.5645161290322581, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 9.497077369813885e-05, |
|
"loss": 0.8865, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.5658381808566896, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 9.450982140444344e-05, |
|
"loss": 0.8916, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.5671602326811211, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 9.404898608803836e-05, |
|
"loss": 0.8869, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.5684822845055526, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 9.358827756777837e-05, |
|
"loss": 0.8844, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.5698043363299842, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 9.312770565981677e-05, |
|
"loss": 0.8861, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 0.5711263881544156, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 9.266728017739601e-05, |
|
"loss": 0.888, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.5724484399788472, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 9.220701093063868e-05, |
|
"loss": 0.8857, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 0.5737704918032787, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 9.174690772633865e-05, |
|
"loss": 0.8896, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.5750925436277102, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 9.128698036775179e-05, |
|
"loss": 0.884, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.5764145954521417, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 9.082723865438747e-05, |
|
"loss": 0.8903, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.5777366472765733, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 9.036769238179947e-05, |
|
"loss": 0.8927, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 0.5790586991010047, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 8.990835134137744e-05, |
|
"loss": 0.8848, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.5803807509254363, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 8.944922532013811e-05, |
|
"loss": 0.884, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 0.5817028027498677, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 8.899032410051708e-05, |
|
"loss": 0.8832, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.5830248545742993, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 8.853165746015997e-05, |
|
"loss": 0.8851, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.5843469063987309, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 8.807323517171444e-05, |
|
"loss": 0.8899, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.5856689582231623, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 8.761506700262172e-05, |
|
"loss": 0.8875, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 0.5869910100475939, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 8.715716271490877e-05, |
|
"loss": 0.8899, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.5883130618720254, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 8.669953206497994e-05, |
|
"loss": 0.8819, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.5896351136964569, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 8.624218480340942e-05, |
|
"loss": 0.8879, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.5909571655208884, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 8.578513067473322e-05, |
|
"loss": 0.8877, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.59227921734532, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 8.532837941724181e-05, |
|
"loss": 0.8849, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.5936012691697514, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 8.487194076277234e-05, |
|
"loss": 0.8826, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 0.594923320994183, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 8.441582443650161e-05, |
|
"loss": 0.8847, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.5962453728186144, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 8.396004015673853e-05, |
|
"loss": 0.8825, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 0.597567424643046, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 8.35045976347173e-05, |
|
"loss": 0.888, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.5988894764674775, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 8.304950657439033e-05, |
|
"loss": 0.885, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.600211528291909, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 8.259477667222172e-05, |
|
"loss": 0.8857, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.6015335801163406, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 8.214041761698031e-05, |
|
"loss": 0.8876, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.6028556319407721, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 8.168643908953359e-05, |
|
"loss": 0.8846, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.6041776837652036, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 8.12328507626411e-05, |
|
"loss": 0.8863, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 0.6054997355896351, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 8.077966230074872e-05, |
|
"loss": 0.8889, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.6068217874140667, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 8.032688335978247e-05, |
|
"loss": 0.8844, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.6081438392384981, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 7.987452358694278e-05, |
|
"loss": 0.8894, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.6094658910629297, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 7.942259262049911e-05, |
|
"loss": 0.8825, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 0.6107879428873612, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 7.897110008958441e-05, |
|
"loss": 0.8851, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.6121099947117927, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 7.852005561399014e-05, |
|
"loss": 0.8903, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 0.6134320465362242, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 7.806946880396109e-05, |
|
"loss": 0.8873, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.6147540983606558, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 7.761934925999086e-05, |
|
"loss": 0.8862, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.6160761501850872, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 7.716970657261702e-05, |
|
"loss": 0.8846, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.6173982020095188, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 7.672055032221709e-05, |
|
"loss": 0.8897, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 0.6187202538339502, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 7.627189007880413e-05, |
|
"loss": 0.8873, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.6200423056583818, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 7.582373540182304e-05, |
|
"loss": 0.8904, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 0.6213643574828134, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 7.537609583994667e-05, |
|
"loss": 0.8834, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.6226864093072448, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 7.492898093087269e-05, |
|
"loss": 0.8825, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 0.6240084611316764, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 7.448240020111993e-05, |
|
"loss": 0.8837, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.6253305129561079, |
|
"grad_norm": 0.1669921875, |
|
"learning_rate": 7.403636316582594e-05, |
|
"loss": 0.8885, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 0.6266525647805394, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 7.359087932854371e-05, |
|
"loss": 0.8944, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.6279746166049709, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 7.314595818103959e-05, |
|
"loss": 0.889, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.6292966684294025, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 7.270160920309077e-05, |
|
"loss": 0.8869, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.6306187202538339, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 7.225784186228359e-05, |
|
"loss": 0.8829, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 0.6319407720782655, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 7.181466561381152e-05, |
|
"loss": 0.8889, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.633262823902697, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 7.13720899002739e-05, |
|
"loss": 0.8823, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 0.6345848757271285, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 7.093012415147462e-05, |
|
"loss": 0.884, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.63590692755156, |
|
"grad_norm": 0.1591796875, |
|
"learning_rate": 7.048877778422138e-05, |
|
"loss": 0.8848, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 0.6372289793759915, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 7.00480602021248e-05, |
|
"loss": 0.8821, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.638551031200423, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 6.960798079539835e-05, |
|
"loss": 0.8879, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 0.6398730830248546, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 6.916854894065796e-05, |
|
"loss": 0.886, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.6411951348492861, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 6.87297740007225e-05, |
|
"loss": 0.8895, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.6425171866737176, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 6.829166532441421e-05, |
|
"loss": 0.8839, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.6438392384981492, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 6.785423224635941e-05, |
|
"loss": 0.8834, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 0.6451612903225806, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 6.741748408678975e-05, |
|
"loss": 0.8849, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.6464833421470122, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 6.698143015134349e-05, |
|
"loss": 0.8804, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 0.6478053939714437, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 6.65460797308674e-05, |
|
"loss": 0.8845, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.6491274457958752, |
|
"grad_norm": 0.1611328125, |
|
"learning_rate": 6.611144210121861e-05, |
|
"loss": 0.8897, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 0.6504494976203067, |
|
"grad_norm": 0.1630859375, |
|
"learning_rate": 6.567752652306717e-05, |
|
"loss": 0.8913, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.6517715494447383, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 6.52443422416985e-05, |
|
"loss": 0.8884, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 0.6530936012691697, |
|
"grad_norm": 0.17578125, |
|
"learning_rate": 6.48118984868167e-05, |
|
"loss": 0.8894, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.6544156530936013, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 6.438020447234759e-05, |
|
"loss": 0.8837, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.6557377049180327, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 6.394926939624267e-05, |
|
"loss": 0.8864, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.6570597567424643, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 6.351910244028285e-05, |
|
"loss": 0.8859, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 0.6583818085668959, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 6.308971276988313e-05, |
|
"loss": 0.885, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.6597038603913273, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 6.266110953389704e-05, |
|
"loss": 0.8835, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 0.6610259122157589, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 6.223330186442194e-05, |
|
"loss": 0.8834, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.6623479640401904, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 6.180629887660425e-05, |
|
"loss": 0.8839, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 0.6636700158646219, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 6.138010966844538e-05, |
|
"loss": 0.8868, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.6649920676890534, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 6.095474332060776e-05, |
|
"loss": 0.8857, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 0.666314119513485, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 6.0530208896221527e-05, |
|
"loss": 0.8823, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.6676361713379164, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 6.010651544069119e-05, |
|
"loss": 0.8835, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.668958223162348, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 5.968367198150316e-05, |
|
"loss": 0.8826, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.6702802749867794, |
|
"grad_norm": 0.1650390625, |
|
"learning_rate": 5.92616875280332e-05, |
|
"loss": 0.8825, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 0.671602326811211, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 5.8840571071354565e-05, |
|
"loss": 0.8874, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.6729243786356425, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 5.842033158404636e-05, |
|
"loss": 0.8914, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 0.674246430460074, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 5.800097802000257e-05, |
|
"loss": 0.8844, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.6755684822845055, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 5.758251931424089e-05, |
|
"loss": 0.8809, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 0.6768905341089371, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 5.716496438271277e-05, |
|
"loss": 0.8828, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.6782125859333686, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 5.6748322122113264e-05, |
|
"loss": 0.8832, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 0.6795346377578001, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 5.633260140969132e-05, |
|
"loss": 0.885, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.6808566895822317, |
|
"grad_norm": 0.162109375, |
|
"learning_rate": 5.591781110306096e-05, |
|
"loss": 0.8837, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.6821787414066631, |
|
"grad_norm": 0.173828125, |
|
"learning_rate": 5.55039600400122e-05, |
|
"loss": 0.8817, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.6835007932310947, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 5.509105703832313e-05, |
|
"loss": 0.8858, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 0.6848228450555262, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 5.467911089557169e-05, |
|
"loss": 0.8863, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.6861448968799577, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 5.426813038894839e-05, |
|
"loss": 0.8844, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 0.6874669487043892, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 5.3858124275069244e-05, |
|
"loss": 0.8836, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.6887890005288207, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 5.344910128978934e-05, |
|
"loss": 0.8837, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 0.6901110523532522, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 5.304107014801645e-05, |
|
"loss": 0.8845, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.6914331041776838, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 5.263403954352564e-05, |
|
"loss": 0.8878, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 0.6927551560021152, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 5.222801814877369e-05, |
|
"loss": 0.8879, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.6940772078265468, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 5.182301461471474e-05, |
|
"loss": 0.8905, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.6953992596509783, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 5.141903757061555e-05, |
|
"loss": 0.8889, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.6967213114754098, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 5.1016095623871865e-05, |
|
"loss": 0.8805, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 0.6980433632998414, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 5.0614197359824955e-05, |
|
"loss": 0.8848, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.6993654151242729, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 5.021335134157885e-05, |
|
"loss": 0.883, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 0.7006874669487044, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 4.981356610981756e-05, |
|
"loss": 0.8817, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.7020095187731359, |
|
"grad_norm": 0.1591796875, |
|
"learning_rate": 4.9414850182623465e-05, |
|
"loss": 0.8831, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 0.7033315705975675, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 4.901721205529548e-05, |
|
"loss": 0.8845, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.7046536224219989, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 4.8620660200168425e-05, |
|
"loss": 0.8821, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 0.7059756742464305, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 4.822520306643207e-05, |
|
"loss": 0.8835, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.7072977260708619, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 4.783084907995156e-05, |
|
"loss": 0.8831, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.7086197778952935, |
|
"grad_norm": 0.158203125, |
|
"learning_rate": 4.743760664308753e-05, |
|
"loss": 0.8841, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.709941829719725, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 4.704548413451728e-05, |
|
"loss": 0.8843, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 0.7112638815441565, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 4.665448990905613e-05, |
|
"loss": 0.8843, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.712585933368588, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 4.626463229747958e-05, |
|
"loss": 0.8832, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 0.7139079851930196, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 4.587591960634565e-05, |
|
"loss": 0.8852, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.7152300370174511, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 4.54883601178179e-05, |
|
"loss": 0.8844, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 0.7165520888418826, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 4.510196208948909e-05, |
|
"loss": 0.8873, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.7178741406663142, |
|
"grad_norm": 0.17578125, |
|
"learning_rate": 4.4716733754205064e-05, |
|
"loss": 0.8872, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 0.7191961924907456, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 4.433268331988956e-05, |
|
"loss": 0.8891, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.7205182443151772, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 4.394981896936908e-05, |
|
"loss": 0.8861, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 0.7218402961396087, |
|
"grad_norm": 0.17578125, |
|
"learning_rate": 4.356814886019871e-05, |
|
"loss": 0.8884, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.7231623479640402, |
|
"grad_norm": 0.162109375, |
|
"learning_rate": 4.3187681124488196e-05, |
|
"loss": 0.8865, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 0.7244843997884717, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 4.280842386872889e-05, |
|
"loss": 0.8817, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.7258064516129032, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 4.243038517362068e-05, |
|
"loss": 0.8832, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 0.7271285034373347, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 4.205357309390021e-05, |
|
"loss": 0.8856, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.7284505552617663, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 4.167799565816889e-05, |
|
"loss": 0.8862, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 0.7297726070861977, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 4.130366086872215e-05, |
|
"loss": 0.8826, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.7310946589106293, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 4.093057670137865e-05, |
|
"loss": 0.8871, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 0.7324167107350608, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 4.055875110531068e-05, |
|
"loss": 0.8862, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.7337387625594923, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 4.0188192002874404e-05, |
|
"loss": 0.8849, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 0.7350608143839239, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 3.9818907289441375e-05, |
|
"loss": 0.8831, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.7363828662083554, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 3.9450904833230094e-05, |
|
"loss": 0.8879, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 0.7377049180327869, |
|
"grad_norm": 0.162109375, |
|
"learning_rate": 3.908419247513862e-05, |
|
"loss": 0.884, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.7390269698572184, |
|
"grad_norm": 0.173828125, |
|
"learning_rate": 3.871877802857714e-05, |
|
"loss": 0.8931, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 0.74034902168165, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 3.8354669279301914e-05, |
|
"loss": 0.8881, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.7416710735060814, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 3.7991873985248996e-05, |
|
"loss": 0.8866, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 0.742993125330513, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 3.763039987636924e-05, |
|
"loss": 0.8826, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.7443151771549444, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 3.7270254654463356e-05, |
|
"loss": 0.8841, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 0.745637228979376, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 3.6911445993018076e-05, |
|
"loss": 0.8903, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.7469592808038075, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 3.6553981537042406e-05, |
|
"loss": 0.8823, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 0.748281332628239, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 3.619786890290482e-05, |
|
"loss": 0.8846, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.7496033844526705, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 3.5843115678171125e-05, |
|
"loss": 0.881, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 0.7509254362771021, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 3.5489729421442516e-05, |
|
"loss": 0.8845, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.7522474881015336, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 3.513771766219487e-05, |
|
"loss": 0.8857, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 0.7535695399259651, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 3.4787087900617955e-05, |
|
"loss": 0.8882, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.7548915917503967, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 3.443784760745593e-05, |
|
"loss": 0.8853, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 0.7562136435748281, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 3.409000422384797e-05, |
|
"loss": 0.8892, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.7575356953992597, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 3.3743565161169835e-05, |
|
"loss": 0.8842, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 0.7588577472236911, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 3.339853780087584e-05, |
|
"loss": 0.8863, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.7601797990481227, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 3.3054929494341805e-05, |
|
"loss": 0.884, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 0.7615018508725542, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 3.2712747562708115e-05, |
|
"loss": 0.886, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.7628239026969857, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 3.237199929672401e-05, |
|
"loss": 0.8833, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 0.7641459545214172, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 3.203269195659202e-05, |
|
"loss": 0.8824, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.7654680063458488, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 3.169483277181351e-05, |
|
"loss": 0.8864, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 0.7667900581702802, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 3.135842894103436e-05, |
|
"loss": 0.8867, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.7681121099947118, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 3.102348763189188e-05, |
|
"loss": 0.891, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 0.7694341618191433, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 3.069001598086184e-05, |
|
"loss": 0.8902, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.7707562136435748, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 3.0358021093106594e-05, |
|
"loss": 0.885, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 0.7720782654680064, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 3.0027510042323537e-05, |
|
"loss": 0.8866, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.7734003172924379, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 2.9698489870594616e-05, |
|
"loss": 0.8831, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 0.7747223691168694, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 2.937096758823601e-05, |
|
"loss": 0.8869, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.7760444209413009, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 2.9044950173649e-05, |
|
"loss": 0.8816, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 0.7773664727657325, |
|
"grad_norm": 0.16796875, |
|
"learning_rate": 2.872044457317109e-05, |
|
"loss": 0.8875, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.7786885245901639, |
|
"grad_norm": 0.171875, |
|
"learning_rate": 2.839745770092821e-05, |
|
"loss": 0.8828, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 0.7800105764145955, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 2.8075996438687148e-05, |
|
"loss": 0.8818, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.7813326282390269, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 2.7756067635709204e-05, |
|
"loss": 0.8911, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 0.7826546800634585, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 2.7437678108604003e-05, |
|
"loss": 0.8859, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.78397673188789, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 2.712083464118441e-05, |
|
"loss": 0.8892, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 0.7852987837123215, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 2.68055439843219e-05, |
|
"loss": 0.8826, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.786620835536753, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 2.6491812855802834e-05, |
|
"loss": 0.8877, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 0.7879428873611846, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 2.6179647940185238e-05, |
|
"loss": 0.8867, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.789264939185616, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 2.5869055888656336e-05, |
|
"loss": 0.8858, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 0.7905869910100476, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 2.556004331889098e-05, |
|
"loss": 0.8859, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.7919090428344792, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 2.525261681491047e-05, |
|
"loss": 0.8849, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 0.7932310946589106, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 2.494678292694247e-05, |
|
"loss": 0.8823, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.7945531464833422, |
|
"grad_norm": 0.162109375, |
|
"learning_rate": 2.4642548171281244e-05, |
|
"loss": 0.8886, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 0.7958751983077736, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 2.433991903014896e-05, |
|
"loss": 0.8889, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.7971972501322052, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 2.403890195155748e-05, |
|
"loss": 0.8858, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 0.7985193019566367, |
|
"grad_norm": 0.1728515625, |
|
"learning_rate": 2.373950334917111e-05, |
|
"loss": 0.8818, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.7998413537810682, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 2.3441729602169748e-05, |
|
"loss": 0.8859, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 0.8011634056054997, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 2.3145587055113183e-05, |
|
"loss": 0.8876, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.8024854574299313, |
|
"grad_norm": 0.25, |
|
"learning_rate": 2.2851082017805703e-05, |
|
"loss": 0.8843, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 0.8038075092543627, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 2.255822076516184e-05, |
|
"loss": 0.8846, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.8051295610787943, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 2.2267009537072536e-05, |
|
"loss": 0.8877, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 0.8064516129032258, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 2.197745453827226e-05, |
|
"loss": 0.8836, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.8077736647276573, |
|
"grad_norm": 0.171875, |
|
"learning_rate": 2.168956193820676e-05, |
|
"loss": 0.885, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 0.8090957165520889, |
|
"grad_norm": 0.1669921875, |
|
"learning_rate": 2.140333787090172e-05, |
|
"loss": 0.8896, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.8104177683765204, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 2.1118788434831894e-05, |
|
"loss": 0.8819, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 0.8117398202009519, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 2.083591969279136e-05, |
|
"loss": 0.8838, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.8130618720253834, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 2.0554737671764134e-05, |
|
"loss": 0.8825, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 0.814383923849815, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 2.0275248362795995e-05, |
|
"loss": 0.8857, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.8157059756742464, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 1.999745772086655e-05, |
|
"loss": 0.8852, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 0.817028027498678, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 1.9721371664762666e-05, |
|
"loss": 0.8864, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.8183500793231094, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 1.9446996076952094e-05, |
|
"loss": 0.88, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 0.819672131147541, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 1.917433680345829e-05, |
|
"loss": 0.8872, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.8209941829719725, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 1.8903399653735766e-05, |
|
"loss": 0.888, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 0.822316234796404, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 1.8634190400546415e-05, |
|
"loss": 0.8882, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.8236382866208355, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 1.8366714779836434e-05, |
|
"loss": 0.8887, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 0.8249603384452671, |
|
"grad_norm": 0.1728515625, |
|
"learning_rate": 1.8100978490614085e-05, |
|
"loss": 0.8856, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.8262823902696985, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 1.783698719482836e-05, |
|
"loss": 0.886, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 0.8276044420941301, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 1.7574746517248254e-05, |
|
"loss": 0.8807, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.8289264939185617, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 1.7314262045343e-05, |
|
"loss": 0.8861, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 0.8302485457429931, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 1.7055539329162963e-05, |
|
"loss": 0.8825, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.8315705975674247, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 1.6798583881221374e-05, |
|
"loss": 0.8803, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 0.8328926493918561, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 1.6543401176376916e-05, |
|
"loss": 0.8877, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.8342147012162877, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 1.628999665171712e-05, |
|
"loss": 0.8832, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 0.8355367530407192, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 1.603837570644238e-05, |
|
"loss": 0.8854, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.8368588048651507, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 1.578854370175107e-05, |
|
"loss": 0.883, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 0.8381808566895822, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 1.554050596072516e-05, |
|
"loss": 0.8855, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.8395029085140138, |
|
"grad_norm": 0.1611328125, |
|
"learning_rate": 1.5294267768216975e-05, |
|
"loss": 0.8873, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 0.8408249603384452, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 1.5049834370736405e-05, |
|
"loss": 0.8869, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.8421470121628768, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 1.4807210976339291e-05, |
|
"loss": 0.8853, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 0.8434690639873083, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 1.4566402754516328e-05, |
|
"loss": 0.8862, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.8447911158117398, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 1.4327414836082976e-05, |
|
"loss": 0.884, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 0.8461131676361714, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 1.409025231307013e-05, |
|
"loss": 0.8904, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.8474352194606029, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 1.3854920238615688e-05, |
|
"loss": 0.8857, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 0.8487572712850344, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 1.3621423626856756e-05, |
|
"loss": 0.8863, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.8500793231094659, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 1.3389767452822943e-05, |
|
"loss": 0.8851, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 0.8514013749338974, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 1.3159956652330251e-05, |
|
"loss": 0.886, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.8527234267583289, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 1.2931996121876033e-05, |
|
"loss": 0.8822, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 0.8540454785827605, |
|
"grad_norm": 0.1640625, |
|
"learning_rate": 1.2705890718534508e-05, |
|
"loss": 0.8833, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.8553675304071919, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 1.2481645259853436e-05, |
|
"loss": 0.8854, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 0.8566895822316235, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 1.2259264523751335e-05, |
|
"loss": 0.8882, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.858011634056055, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 1.2038753248415768e-05, |
|
"loss": 0.8862, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 0.8593336858804865, |
|
"grad_norm": 0.1572265625, |
|
"learning_rate": 1.1820116132202374e-05, |
|
"loss": 0.8882, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.860655737704918, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 1.16033578335347e-05, |
|
"loss": 0.888, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 0.8619777895293496, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 1.138848297080507e-05, |
|
"loss": 0.887, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.863299841353781, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 1.1175496122276008e-05, |
|
"loss": 0.8841, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 0.8646218931782126, |
|
"grad_norm": 0.1728515625, |
|
"learning_rate": 1.0964401825982895e-05, |
|
"loss": 0.8828, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.8659439450026442, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 1.075520457963708e-05, |
|
"loss": 0.8828, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 0.8672659968270756, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 1.0547908840530197e-05, |
|
"loss": 0.8892, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.8685880486515072, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 1.034251902543908e-05, |
|
"loss": 0.8842, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 0.8699101004759386, |
|
"grad_norm": 0.1669921875, |
|
"learning_rate": 1.01390395105318e-05, |
|
"loss": 0.8858, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.8712321523003702, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 9.937474631274257e-06, |
|
"loss": 0.8896, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 0.8725542041248017, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 9.737828682337969e-06, |
|
"loss": 0.8844, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.8738762559492332, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 9.540105917508391e-06, |
|
"loss": 0.885, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 0.8751983077736647, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 9.34431054959447e-06, |
|
"loss": 0.8878, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.8765203595980963, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 9.150446750338714e-06, |
|
"loss": 0.8867, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 0.8778424114225277, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 8.958518650328429e-06, |
|
"loss": 0.8846, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.8791644632469593, |
|
"grad_norm": 0.169921875, |
|
"learning_rate": 8.768530338907655e-06, |
|
"loss": 0.8856, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 0.8804865150713908, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 8.580485864090015e-06, |
|
"loss": 0.8829, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.8818085668958223, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 8.394389232472499e-06, |
|
"loss": 0.8895, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 0.8831306187202538, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 8.210244409150158e-06, |
|
"loss": 0.8858, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.8844526705446853, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 8.028055317631455e-06, |
|
"loss": 0.8871, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 0.8857747223691169, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 7.847825839754852e-06, |
|
"loss": 0.8837, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.8870967741935484, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 7.669559815605965e-06, |
|
"loss": 0.8839, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 0.88841882601798, |
|
"grad_norm": 0.1533203125, |
|
"learning_rate": 7.493261043435828e-06, |
|
"loss": 0.8802, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.8897408778424114, |
|
"grad_norm": 0.169921875, |
|
"learning_rate": 7.318933279579909e-06, |
|
"loss": 0.8831, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 0.891062929666843, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 7.146580238378131e-06, |
|
"loss": 0.8866, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.8923849814912744, |
|
"grad_norm": 0.1689453125, |
|
"learning_rate": 6.976205592095675e-06, |
|
"loss": 0.886, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 0.893707033315706, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 6.8078129708447516e-06, |
|
"loss": 0.8897, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.8950290851401375, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 6.641405962507307e-06, |
|
"loss": 0.886, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 0.896351136964569, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 6.4769881126584996e-06, |
|
"loss": 0.8848, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.8976731887890005, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 6.314562924491229e-06, |
|
"loss": 0.8863, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 0.8989952406134321, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 6.154133858741407e-06, |
|
"loss": 0.8861, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.9003172924378635, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 5.995704333614327e-06, |
|
"loss": 0.8867, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 0.9016393442622951, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 5.839277724711733e-06, |
|
"loss": 0.8874, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.9029613960867267, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 5.6848573649599834e-06, |
|
"loss": 0.8782, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 0.9042834479111581, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 5.53244654453896e-06, |
|
"loss": 0.8851, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.9056054997355897, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 5.382048510812032e-06, |
|
"loss": 0.8814, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 0.9069275515600211, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 5.233666468256804e-06, |
|
"loss": 0.888, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.9082496033844527, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 5.0873035783969066e-06, |
|
"loss": 0.8862, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 0.9095716552088842, |
|
"grad_norm": 0.16796875, |
|
"learning_rate": 4.942962959734565e-06, |
|
"loss": 0.8835, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.9108937070333157, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 4.800647687684223e-06, |
|
"loss": 0.8893, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 0.9122157588577472, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 4.660360794506946e-06, |
|
"loss": 0.8826, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.9135378106821788, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 4.52210526924588e-06, |
|
"loss": 0.8845, |
|
"step": 3455 |
|
}, |
|
{ |
|
"epoch": 0.9148598625066102, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 4.385884057662515e-06, |
|
"loss": 0.8862, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.9161819143310418, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 4.251700062173947e-06, |
|
"loss": 0.886, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 0.9175039661554732, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 4.119556141791048e-06, |
|
"loss": 0.888, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.9188260179799048, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 3.989455112057483e-06, |
|
"loss": 0.8857, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 0.9201480698043363, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 3.861399744989813e-06, |
|
"loss": 0.8871, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.9214701216287678, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 3.7353927690183867e-06, |
|
"loss": 0.8852, |
|
"step": 3485 |
|
}, |
|
{ |
|
"epoch": 0.9227921734531994, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 3.6114368689291856e-06, |
|
"loss": 0.8834, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.9241142252776309, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 3.4895346858066724e-06, |
|
"loss": 0.8847, |
|
"step": 3495 |
|
}, |
|
{ |
|
"epoch": 0.9254362771020624, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 3.3696888169774677e-06, |
|
"loss": 0.8825, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.9267583289264939, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 3.2519018159550496e-06, |
"loss": 0.8884,
"step": 3505
},
{
"epoch": 0.9280803807509255,
"grad_norm": 0.3671875,
"learning_rate": 3.1361761923853095e-06,
"loss": 0.8816,
"step": 3510
},
{
"epoch": 0.9294024325753569,
"grad_norm": 0.1904296875,
"learning_rate": 3.0225144119931158e-06,
"loss": 0.8828,
"step": 3515
},
{
"epoch": 0.9307244843997885,
"grad_norm": 0.2734375,
"learning_rate": 2.910918896529757e-06,
"loss": 0.8851,
"step": 3520
},
{
"epoch": 0.93204653622422,
"grad_norm": 0.236328125,
"learning_rate": 2.8013920237213277e-06,
"loss": 0.8851,
"step": 3525
},
{
"epoch": 0.9333685880486515,
"grad_norm": 0.2451171875,
"learning_rate": 2.693936127218133e-06,
"loss": 0.8877,
"step": 3530
},
{
"epoch": 0.934690639873083,
"grad_norm": 0.2294921875,
"learning_rate": 2.5885534965448768e-06,
"loss": 0.8848,
"step": 3535
},
{
"epoch": 0.9360126916975146,
"grad_norm": 0.1513671875,
"learning_rate": 2.4852463770519417e-06,
"loss": 0.8823,
"step": 3540
},
{
"epoch": 0.937334743521946,
"grad_norm": 0.2197265625,
"learning_rate": 2.3840169698675087e-06,
"loss": 0.8865,
"step": 3545
},
{
"epoch": 0.9386567953463776,
"grad_norm": 0.2431640625,
"learning_rate": 2.2848674318507125e-06,
"loss": 0.8829,
"step": 3550
},
{
"epoch": 0.9399788471708092,
"grad_norm": 0.2216796875,
"learning_rate": 2.1877998755456266e-06,
"loss": 0.8851,
"step": 3555
},
{
"epoch": 0.9413008989952406,
"grad_norm": 0.1884765625,
"learning_rate": 2.0928163691362856e-06,
"loss": 0.8876,
"step": 3560
},
{
"epoch": 0.9426229508196722,
"grad_norm": 0.169921875,
"learning_rate": 1.9999189364025894e-06,
"loss": 0.8861,
"step": 3565
},
{
"epoch": 0.9439450026441036,
"grad_norm": 0.2275390625,
"learning_rate": 1.909109556677269e-06,
"loss": 0.8874,
"step": 3570
},
{
"epoch": 0.9452670544685352,
"grad_norm": 0.1982421875,
"learning_rate": 1.8203901648035648e-06,
"loss": 0.8865,
"step": 3575
},
{
"epoch": 0.9465891062929667,
"grad_norm": 0.181640625,
"learning_rate": 1.7337626510941619e-06,
"loss": 0.8876,
"step": 3580
},
{
"epoch": 0.9479111581173982,
"grad_norm": 0.3046875,
"learning_rate": 1.6492288612907748e-06,
"loss": 0.8889,
"step": 3585
},
{
"epoch": 0.9492332099418297,
"grad_norm": 0.208984375,
"learning_rate": 1.5667905965249363e-06,
"loss": 0.8842,
"step": 3590
},
{
"epoch": 0.9505552617662613,
"grad_norm": 0.1630859375,
"learning_rate": 1.486449613279539e-06,
"loss": 0.8857,
"step": 3595
},
{
"epoch": 0.9518773135906927,
"grad_norm": 0.208984375,
"learning_rate": 1.408207623351454e-06,
"loss": 0.8847,
"step": 3600
},
{
"epoch": 0.9531993654151243,
"grad_norm": 0.2353515625,
"learning_rate": 1.3320662938150485e-06,
"loss": 0.8828,
"step": 3605
},
{
"epoch": 0.9545214172395557,
"grad_norm": 0.1669921875,
"learning_rate": 1.2580272469866483e-06,
"loss": 0.8787,
"step": 3610
},
{
"epoch": 0.9558434690639873,
"grad_norm": 0.2080078125,
"learning_rate": 1.186092060389976e-06,
"loss": 0.886,
"step": 3615
},
{
"epoch": 0.9571655208884188,
"grad_norm": 0.146484375,
"learning_rate": 1.1162622667226008e-06,
"loss": 0.8837,
"step": 3620
},
{
"epoch": 0.9584875727128503,
"grad_norm": 0.1806640625,
"learning_rate": 1.0485393538231747e-06,
"loss": 0.8846,
"step": 3625
},
{
"epoch": 0.9598096245372819,
"grad_norm": 0.234375,
"learning_rate": 9.829247646398144e-07,
"loss": 0.8867,
"step": 3630
},
{
"epoch": 0.9611316763617134,
"grad_norm": 0.1923828125,
"learning_rate": 9.194198971993362e-07,
"loss": 0.8869,
"step": 3635
},
{
"epoch": 0.9624537281861449,
"grad_norm": 0.265625,
"learning_rate": 8.580261045774474e-07,
"loss": 0.8824,
"step": 3640
},
{
"epoch": 0.9637757800105764,
"grad_norm": 0.16796875,
"learning_rate": 7.98744694869924e-07,
"loss": 0.8875,
"step": 3645
},
{
"epoch": 0.965097831835008,
"grad_norm": 0.291015625,
"learning_rate": 7.41576931164778e-07,
"loss": 0.8856,
"step": 3650
},
{
"epoch": 0.9664198836594394,
"grad_norm": 0.2236328125,
"learning_rate": 6.86524031515301e-07,
"loss": 0.8861,
"step": 3655
},
{
"epoch": 0.967741935483871,
"grad_norm": 0.1806640625,
"learning_rate": 6.335871689140959e-07,
"loss": 0.8868,
"step": 3660
},
{
"epoch": 0.9690639873083025,
"grad_norm": 0.1650390625,
"learning_rate": 5.827674712681752e-07,
"loss": 0.8873,
"step": 3665
},
{
"epoch": 0.970386039132734,
"grad_norm": 0.2138671875,
"learning_rate": 5.340660213748017e-07,
"loss": 0.8833,
"step": 3670
},
{
"epoch": 0.9717080909571655,
"grad_norm": 0.2041015625,
"learning_rate": 4.874838568985296e-07,
"loss": 0.8803,
"step": 3675
},
{
"epoch": 0.973030142781597,
"grad_norm": 0.1962890625,
"learning_rate": 4.430219703490335e-07,
"loss": 0.8846,
"step": 3680
},
{
"epoch": 0.9743521946060285,
"grad_norm": 0.28515625,
"learning_rate": 4.0068130905996924e-07,
"loss": 0.8873,
"step": 3685
},
{
"epoch": 0.9756742464304601,
"grad_norm": 0.2392578125,
"learning_rate": 3.604627751688239e-07,
"loss": 0.8817,
"step": 3690
},
{
"epoch": 0.9769962982548915,
"grad_norm": 0.1845703125,
"learning_rate": 3.2236722559764176e-07,
"loss": 0.8877,
"step": 3695
},
{
"epoch": 0.9783183500793231,
"grad_norm": 0.19140625,
"learning_rate": 2.86395472034795e-07,
"loss": 0.8879,
"step": 3700
},
{
"epoch": 0.9796404019037547,
"grad_norm": 0.2578125,
"learning_rate": 2.525482809177082e-07,
"loss": 0.8908,
"step": 3705
},
{
"epoch": 0.9809624537281861,
"grad_norm": 0.173828125,
"learning_rate": 2.2082637341647173e-07,
"loss": 0.881,
"step": 3710
},
{
"epoch": 0.9822845055526177,
"grad_norm": 0.2080078125,
"learning_rate": 1.9123042541854265e-07,
"loss": 0.8863,
"step": 3715
},
{
"epoch": 0.9836065573770492,
"grad_norm": 0.16796875,
"learning_rate": 1.6376106751430087e-07,
"loss": 0.8909,
"step": 3720
},
{
"epoch": 0.9849286092014807,
"grad_norm": 0.21484375,
"learning_rate": 1.3841888498361544e-07,
"loss": 0.8815,
"step": 3725
},
{
"epoch": 0.9862506610259122,
"grad_norm": 0.23828125,
"learning_rate": 1.1520441778339885e-07,
"loss": 0.8851,
"step": 3730
},
{
"epoch": 0.9875727128503438,
"grad_norm": 0.259765625,
"learning_rate": 9.411816053608302e-08,
"loss": 0.8855,
"step": 3735
},
{
"epoch": 0.9888947646747752,
"grad_norm": 0.197265625,
"learning_rate": 7.51605625190721e-08,
"loss": 0.8903,
"step": 3740
},
{
"epoch": 0.9902168164992068,
"grad_norm": 0.1845703125,
"learning_rate": 5.833202765519463e-08,
"loss": 0.8867,
"step": 3745
},
{
"epoch": 0.9915388683236382,
"grad_norm": 0.1943359375,
"learning_rate": 4.3632914504077026e-08,
"loss": 0.8839,
"step": 3750
},
{
"epoch": 0.9928609201480698,
"grad_norm": 0.193359375,
"learning_rate": 3.106353625451641e-08,
"loss": 0.8824,
"step": 3755
},
{
"epoch": 0.9941829719725013,
"grad_norm": 0.1708984375,
"learning_rate": 2.062416071780815e-08,
"loss": 0.8881,
"step": 3760
},
{
"epoch": 0.9955050237969328,
"grad_norm": 0.1962890625,
"learning_rate": 1.2315010322028198e-08,
"loss": 0.8869,
"step": 3765
},
{
"epoch": 0.9968270756213644,
"grad_norm": 0.19921875,
"learning_rate": 6.136262107292456e-09,
"loss": 0.8892,
"step": 3770
},
{
"epoch": 0.9981491274457959,
"grad_norm": 0.23828125,
"learning_rate": 2.0880477220042163e-09,
"loss": 0.8853,
"step": 3775
},
{
"epoch": 0.9994711792702274,
"grad_norm": 0.234375,
"learning_rate": 1.7045342003418895e-10,
"loss": 0.8854,
"step": 3780
},
{
"epoch": 1.0,
"step": 3782,
"total_flos": 1.011923420184576e+18,
"train_loss": 0.9601045426144794,
"train_runtime": 898.0661,
"train_samples_per_second": 2156.094,
"train_steps_per_second": 4.211
}
],
"logging_steps": 5,
"max_steps": 3782,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.011923420184576e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}