{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.966144517433047,
  "eval_steps": 500,
  "global_step": 478,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00202122283981809,
      "grad_norm": 4.962096691131592,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 1.6441,
      "step": 1
    },
    {
      "epoch": 0.00404244567963618,
      "grad_norm": 5.593231678009033,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.8613,
      "step": 2
    },
    {
      "epoch": 0.00606366851945427,
      "grad_norm": 5.743273735046387,
      "learning_rate": 3e-06,
      "loss": 1.9149,
      "step": 3
    },
    {
      "epoch": 0.00808489135927236,
      "grad_norm": 5.530357360839844,
      "learning_rate": 4.000000000000001e-06,
      "loss": 2.0057,
      "step": 4
    },
    {
      "epoch": 0.01010611419909045,
      "grad_norm": 6.651333332061768,
      "learning_rate": 5e-06,
      "loss": 1.9692,
      "step": 5
    },
    {
      "epoch": 0.01212733703890854,
      "grad_norm": 6.602941513061523,
      "learning_rate": 6e-06,
      "loss": 2.4343,
      "step": 6
    },
    {
      "epoch": 0.01414855987872663,
      "grad_norm": 6.895396709442139,
      "learning_rate": 7.000000000000001e-06,
      "loss": 2.26,
      "step": 7
    },
    {
      "epoch": 0.01616978271854472,
      "grad_norm": 7.525021553039551,
      "learning_rate": 8.000000000000001e-06,
      "loss": 2.2767,
      "step": 8
    },
    {
      "epoch": 0.01819100555836281,
      "grad_norm": 7.5351762771606445,
      "learning_rate": 9e-06,
      "loss": 2.7438,
      "step": 9
    },
    {
      "epoch": 0.0202122283981809,
      "grad_norm": 7.658970832824707,
      "learning_rate": 1e-05,
      "loss": 2.764,
      "step": 10
    },
    {
      "epoch": 0.02223345123799899,
      "grad_norm": 8.046220779418945,
      "learning_rate": 1.1000000000000001e-05,
      "loss": 2.3894,
      "step": 11
    },
    {
      "epoch": 0.02425467407781708,
      "grad_norm": 8.3847017288208,
      "learning_rate": 1.2e-05,
      "loss": 2.5517,
      "step": 12
    },
    {
      "epoch": 0.02627589691763517,
      "grad_norm": 8.96577262878418,
      "learning_rate": 1.3000000000000001e-05,
      "loss": 2.2152,
      "step": 13
    },
    {
      "epoch": 0.02829711975745326,
      "grad_norm": 8.063103675842285,
      "learning_rate": 1.4000000000000001e-05,
      "loss": 2.1623,
      "step": 14
    },
    {
      "epoch": 0.03031834259727135,
      "grad_norm": 8.5758638381958,
      "learning_rate": 1.5e-05,
      "loss": 2.4497,
      "step": 15
    },
    {
      "epoch": 0.03233956543708944,
      "grad_norm": 8.477540969848633,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 2.3183,
      "step": 16
    },
    {
      "epoch": 0.03436078827690753,
      "grad_norm": 8.865395545959473,
      "learning_rate": 1.7000000000000003e-05,
      "loss": 2.2435,
      "step": 17
    },
    {
      "epoch": 0.03638201111672562,
      "grad_norm": 8.725611686706543,
      "learning_rate": 1.8e-05,
      "loss": 2.1894,
      "step": 18
    },
    {
      "epoch": 0.03840323395654371,
      "grad_norm": 8.353998184204102,
      "learning_rate": 1.9e-05,
      "loss": 2.0811,
      "step": 19
    },
    {
      "epoch": 0.0404244567963618,
      "grad_norm": 8.999526977539062,
      "learning_rate": 2e-05,
      "loss": 2.2778,
      "step": 20
    },
    {
      "epoch": 0.04244567963617989,
      "grad_norm": 8.592598915100098,
      "learning_rate": 2.1e-05,
      "loss": 2.3943,
      "step": 21
    },
    {
      "epoch": 0.04446690247599798,
      "grad_norm": 7.57433557510376,
      "learning_rate": 2.2000000000000003e-05,
      "loss": 2.1125,
      "step": 22
    },
    {
      "epoch": 0.046488125315816066,
      "grad_norm": 8.0515775680542,
      "learning_rate": 2.3000000000000003e-05,
      "loss": 1.9887,
      "step": 23
    },
    {
      "epoch": 0.04850934815563416,
      "grad_norm": 7.530181884765625,
      "learning_rate": 2.4e-05,
      "loss": 2.2077,
      "step": 24
    },
    {
      "epoch": 0.050530570995452245,
      "grad_norm": 6.949326992034912,
      "learning_rate": 2.5e-05,
      "loss": 2.049,
      "step": 25
    },
    {
      "epoch": 0.05255179383527034,
      "grad_norm": 7.002259254455566,
      "learning_rate": 2.6000000000000002e-05,
      "loss": 1.9994,
      "step": 26
    },
    {
      "epoch": 0.05457301667508843,
      "grad_norm": 7.145877838134766,
      "learning_rate": 2.7000000000000002e-05,
      "loss": 1.7604,
      "step": 27
    },
    {
      "epoch": 0.05659423951490652,
      "grad_norm": 7.082208156585693,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 2.0095,
      "step": 28
    },
    {
      "epoch": 0.05861546235472461,
      "grad_norm": 6.70477294921875,
      "learning_rate": 2.9e-05,
      "loss": 1.6048,
      "step": 29
    },
    {
      "epoch": 0.0606366851945427,
      "grad_norm": 8.728182792663574,
      "learning_rate": 3e-05,
      "loss": 2.2502,
      "step": 30
    },
    {
      "epoch": 0.06265790803436079,
      "grad_norm": 8.69613265991211,
      "learning_rate": 3.1e-05,
      "loss": 2.0332,
      "step": 31
    },
    {
      "epoch": 0.06467913087417888,
      "grad_norm": 8.603922843933105,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 2.3491,
      "step": 32
    },
    {
      "epoch": 0.06670035371399696,
      "grad_norm": 7.335165977478027,
      "learning_rate": 3.3e-05,
      "loss": 1.7337,
      "step": 33
    },
    {
      "epoch": 0.06872157655381506,
      "grad_norm": 8.186851501464844,
      "learning_rate": 3.4000000000000007e-05,
      "loss": 1.9627,
      "step": 34
    },
    {
      "epoch": 0.07074279939363315,
      "grad_norm": 7.595352649688721,
      "learning_rate": 3.5e-05,
      "loss": 1.5682,
      "step": 35
    },
    {
      "epoch": 0.07276402223345124,
      "grad_norm": 7.205020904541016,
      "learning_rate": 3.6e-05,
      "loss": 1.7703,
      "step": 36
    },
    {
      "epoch": 0.07478524507326932,
      "grad_norm": 7.933116436004639,
      "learning_rate": 3.7e-05,
      "loss": 1.8315,
      "step": 37
    },
    {
      "epoch": 0.07680646791308741,
      "grad_norm": 7.590288162231445,
      "learning_rate": 3.8e-05,
      "loss": 1.8305,
      "step": 38
    },
    {
      "epoch": 0.07882769075290551,
      "grad_norm": 7.468386650085449,
      "learning_rate": 3.9000000000000006e-05,
      "loss": 1.6923,
      "step": 39
    },
    {
      "epoch": 0.0808489135927236,
      "grad_norm": 8.244772911071777,
      "learning_rate": 4e-05,
      "loss": 2.0191,
      "step": 40
    },
    {
      "epoch": 0.0828701364325417,
      "grad_norm": 8.714116096496582,
      "learning_rate": 4.1e-05,
      "loss": 1.9665,
      "step": 41
    },
    {
      "epoch": 0.08489135927235977,
      "grad_norm": 8.570602416992188,
      "learning_rate": 4.2e-05,
      "loss": 1.9418,
      "step": 42
    },
    {
      "epoch": 0.08691258211217787,
      "grad_norm": 7.338136196136475,
      "learning_rate": 4.3e-05,
      "loss": 1.1443,
      "step": 43
    },
    {
      "epoch": 0.08893380495199596,
      "grad_norm": 8.277491569519043,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 1.5882,
      "step": 44
    },
    {
      "epoch": 0.09095502779181405,
      "grad_norm": 7.305893421173096,
      "learning_rate": 4.5e-05,
      "loss": 1.1692,
      "step": 45
    },
    {
      "epoch": 0.09297625063163213,
      "grad_norm": 7.610684871673584,
      "learning_rate": 4.600000000000001e-05,
      "loss": 1.3468,
      "step": 46
    },
    {
      "epoch": 0.09499747347145023,
      "grad_norm": 7.890575885772705,
      "learning_rate": 4.7e-05,
      "loss": 1.1566,
      "step": 47
    },
    {
      "epoch": 0.09701869631126832,
      "grad_norm": 8.964077949523926,
      "learning_rate": 4.8e-05,
      "loss": 1.635,
      "step": 48
    },
    {
      "epoch": 0.09903991915108641,
      "grad_norm": 9.524826049804688,
      "learning_rate": 4.9e-05,
      "loss": 1.4733,
      "step": 49
    },
    {
      "epoch": 0.10106114199090449,
      "grad_norm": 9.499811172485352,
      "learning_rate": 5e-05,
      "loss": 1.5931,
      "step": 50
    },
    {
      "epoch": 0.10308236483072258,
      "grad_norm": 5.5525898933410645,
      "learning_rate": 5.1000000000000006e-05,
      "loss": 1.5699,
      "step": 51
    },
    {
      "epoch": 0.10510358767054068,
      "grad_norm": 4.883670330047607,
      "learning_rate": 5.2000000000000004e-05,
      "loss": 1.3653,
      "step": 52
    },
    {
      "epoch": 0.10712481051035877,
      "grad_norm": 3.8409433364868164,
      "learning_rate": 5.300000000000001e-05,
      "loss": 1.2819,
      "step": 53
    },
    {
      "epoch": 0.10914603335017686,
      "grad_norm": 4.300581932067871,
      "learning_rate": 5.4000000000000005e-05,
      "loss": 1.5477,
      "step": 54
    },
    {
      "epoch": 0.11116725618999494,
      "grad_norm": 3.8485517501831055,
      "learning_rate": 5.500000000000001e-05,
      "loss": 1.4893,
      "step": 55
    },
    {
      "epoch": 0.11318847902981304,
      "grad_norm": 4.364621639251709,
      "learning_rate": 5.6000000000000006e-05,
      "loss": 1.6671,
      "step": 56
    },
    {
      "epoch": 0.11520970186963113,
      "grad_norm": 4.246096134185791,
      "learning_rate": 5.6999999999999996e-05,
      "loss": 1.6163,
      "step": 57
    },
    {
      "epoch": 0.11723092470944922,
      "grad_norm": 4.382571697235107,
      "learning_rate": 5.8e-05,
      "loss": 1.6223,
      "step": 58
    },
    {
      "epoch": 0.1192521475492673,
      "grad_norm": 4.406397819519043,
      "learning_rate": 5.9e-05,
      "loss": 1.5827,
      "step": 59
    },
    {
      "epoch": 0.1212733703890854,
      "grad_norm": 5.563169002532959,
      "learning_rate": 6e-05,
      "loss": 2.1763,
      "step": 60
    },
    {
      "epoch": 0.12329459322890349,
      "grad_norm": 5.388707160949707,
      "learning_rate": 6.1e-05,
      "loss": 1.9962,
      "step": 61
    },
    {
      "epoch": 0.12531581606872158,
      "grad_norm": 4.910810947418213,
      "learning_rate": 6.2e-05,
      "loss": 1.6651,
      "step": 62
    },
    {
      "epoch": 0.12733703890853967,
      "grad_norm": 5.668425559997559,
      "learning_rate": 6.3e-05,
      "loss": 1.8851,
      "step": 63
    },
    {
      "epoch": 0.12935826174835777,
      "grad_norm": 5.245799541473389,
      "learning_rate": 6.400000000000001e-05,
      "loss": 1.7111,
      "step": 64
    },
    {
      "epoch": 0.13137948458817586,
      "grad_norm": 5.701318264007568,
      "learning_rate": 6.500000000000001e-05,
      "loss": 2.0466,
      "step": 65
    },
    {
      "epoch": 0.13340070742799393,
      "grad_norm": 6.002028942108154,
      "learning_rate": 6.6e-05,
      "loss": 1.9546,
      "step": 66
    },
    {
      "epoch": 0.13542193026781202,
      "grad_norm": 5.405800819396973,
      "learning_rate": 6.7e-05,
      "loss": 1.6626,
      "step": 67
    },
    {
      "epoch": 0.1374431531076301,
      "grad_norm": 5.076318740844727,
      "learning_rate": 6.800000000000001e-05,
      "loss": 1.726,
      "step": 68
    },
    {
      "epoch": 0.1394643759474482,
      "grad_norm": 5.462904930114746,
      "learning_rate": 6.9e-05,
      "loss": 1.6945,
      "step": 69
    },
    {
      "epoch": 0.1414855987872663,
      "grad_norm": 5.7171783447265625,
      "learning_rate": 7e-05,
      "loss": 1.9263,
      "step": 70
    },
    {
      "epoch": 0.1435068216270844,
      "grad_norm": 5.716061592102051,
      "learning_rate": 7.1e-05,
      "loss": 1.4844,
      "step": 71
    },
    {
      "epoch": 0.14552804446690248,
      "grad_norm": 5.982063293457031,
      "learning_rate": 7.2e-05,
      "loss": 1.8287,
      "step": 72
    },
    {
      "epoch": 0.14754926730672058,
      "grad_norm": 5.261101722717285,
      "learning_rate": 7.3e-05,
      "loss": 1.5783,
      "step": 73
    },
    {
      "epoch": 0.14957049014653864,
      "grad_norm": 5.717907428741455,
      "learning_rate": 7.4e-05,
      "loss": 1.4726,
      "step": 74
    },
    {
      "epoch": 0.15159171298635674,
      "grad_norm": 5.534896373748779,
      "learning_rate": 7.500000000000001e-05,
      "loss": 1.5899,
      "step": 75
    },
    {
      "epoch": 0.15361293582617483,
      "grad_norm": 6.794299125671387,
      "learning_rate": 7.6e-05,
      "loss": 1.7261,
      "step": 76
    },
    {
      "epoch": 0.15563415866599292,
      "grad_norm": 6.486598014831543,
      "learning_rate": 7.7e-05,
      "loss": 1.7126,
      "step": 77
    },
    {
      "epoch": 0.15765538150581102,
      "grad_norm": 6.078768730163574,
      "learning_rate": 7.800000000000001e-05,
      "loss": 1.5131,
      "step": 78
    },
    {
      "epoch": 0.1596766043456291,
      "grad_norm": 7.305526256561279,
      "learning_rate": 7.900000000000001e-05,
      "loss": 2.1216,
      "step": 79
    },
    {
      "epoch": 0.1616978271854472,
      "grad_norm": 6.43522310256958,
      "learning_rate": 8e-05,
      "loss": 1.5078,
      "step": 80
    },
    {
      "epoch": 0.1637190500252653,
      "grad_norm": 6.868276119232178,
      "learning_rate": 8.1e-05,
      "loss": 1.6478,
      "step": 81
    },
    {
      "epoch": 0.1657402728650834,
      "grad_norm": 6.5684051513671875,
      "learning_rate": 8.2e-05,
      "loss": 1.554,
      "step": 82
    },
    {
      "epoch": 0.16776149570490145,
      "grad_norm": 7.237800121307373,
      "learning_rate": 8.3e-05,
      "loss": 1.776,
      "step": 83
    },
    {
      "epoch": 0.16978271854471955,
      "grad_norm": 10.40848445892334,
      "learning_rate": 8.4e-05,
      "loss": 1.3637,
      "step": 84
    },
    {
      "epoch": 0.17180394138453764,
      "grad_norm": 7.5290846824646,
      "learning_rate": 8.5e-05,
      "loss": 1.8149,
      "step": 85
    },
    {
      "epoch": 0.17382516422435573,
      "grad_norm": 6.535577297210693,
      "learning_rate": 8.6e-05,
      "loss": 1.5878,
      "step": 86
    },
    {
      "epoch": 0.17584638706417383,
      "grad_norm": 6.797990322113037,
      "learning_rate": 8.7e-05,
      "loss": 1.6962,
      "step": 87
    },
    {
      "epoch": 0.17786760990399192,
      "grad_norm": 8.046355247497559,
      "learning_rate": 8.800000000000001e-05,
      "loss": 1.6756,
      "step": 88
    },
    {
      "epoch": 0.17988883274381,
      "grad_norm": 6.245670318603516,
      "learning_rate": 8.900000000000001e-05,
      "loss": 1.4684,
      "step": 89
    },
    {
      "epoch": 0.1819100555836281,
      "grad_norm": 6.456711769104004,
      "learning_rate": 9e-05,
      "loss": 1.4074,
      "step": 90
    },
    {
      "epoch": 0.1839312784234462,
      "grad_norm": 6.714746475219727,
      "learning_rate": 9.1e-05,
      "loss": 1.4863,
      "step": 91
    },
    {
      "epoch": 0.18595250126326426,
      "grad_norm": 8.266717910766602,
      "learning_rate": 9.200000000000001e-05,
      "loss": 1.8342,
      "step": 92
    },
    {
      "epoch": 0.18797372410308236,
      "grad_norm": 7.780879497528076,
      "learning_rate": 9.300000000000001e-05,
      "loss": 1.9541,
      "step": 93
    },
    {
      "epoch": 0.18999494694290045,
      "grad_norm": 6.307599067687988,
      "learning_rate": 9.4e-05,
      "loss": 1.2528,
      "step": 94
    },
    {
      "epoch": 0.19201616978271854,
      "grad_norm": 7.502289295196533,
      "learning_rate": 9.5e-05,
      "loss": 1.5187,
      "step": 95
    },
    {
      "epoch": 0.19403739262253664,
      "grad_norm": 6.638027667999268,
      "learning_rate": 9.6e-05,
      "loss": 1.2167,
      "step": 96
    },
    {
      "epoch": 0.19605861546235473,
      "grad_norm": 7.040843963623047,
      "learning_rate": 9.7e-05,
      "loss": 1.3433,
      "step": 97
    },
    {
      "epoch": 0.19807983830217282,
      "grad_norm": 6.591531753540039,
      "learning_rate": 9.8e-05,
      "loss": 1.1483,
      "step": 98
    },
    {
      "epoch": 0.20010106114199092,
      "grad_norm": 8.779806137084961,
      "learning_rate": 9.900000000000001e-05,
      "loss": 1.8501,
      "step": 99
    },
    {
      "epoch": 0.20212228398180898,
      "grad_norm": 8.384221076965332,
      "learning_rate": 0.0001,
      "loss": 1.5389,
      "step": 100
    },
    {
      "epoch": 0.20414350682162707,
      "grad_norm": 4.096580982208252,
      "learning_rate": 9.999841055681184e-05,
      "loss": 1.4128,
      "step": 101
    },
    {
      "epoch": 0.20616472966144517,
      "grad_norm": 4.011407375335693,
      "learning_rate": 9.999364232830052e-05,
      "loss": 1.2588,
      "step": 102
    },
    {
      "epoch": 0.20818595250126326,
      "grad_norm": 3.9498050212860107,
      "learning_rate": 9.99856956176192e-05,
      "loss": 1.4214,
      "step": 103
    },
    {
      "epoch": 0.21020717534108135,
      "grad_norm": 3.8423962593078613,
      "learning_rate": 9.997457093000164e-05,
      "loss": 1.7436,
      "step": 104
    },
    {
      "epoch": 0.21222839818089945,
      "grad_norm": 3.859107255935669,
      "learning_rate": 9.996026897273024e-05,
      "loss": 1.5557,
      "step": 105
    },
    {
      "epoch": 0.21424962102071754,
      "grad_norm": 4.101138591766357,
      "learning_rate": 9.994279065509093e-05,
      "loss": 1.6637,
      "step": 106
    },
    {
      "epoch": 0.21627084386053563,
      "grad_norm": 4.0843963623046875,
      "learning_rate": 9.992213708831543e-05,
      "loss": 1.6069,
      "step": 107
    },
    {
      "epoch": 0.21829206670035373,
      "grad_norm": 4.65712833404541,
      "learning_rate": 9.989830958551057e-05,
      "loss": 1.8503,
      "step": 108
    },
    {
      "epoch": 0.2203132895401718,
      "grad_norm": 5.231738090515137,
      "learning_rate": 9.987130966157486e-05,
      "loss": 1.7857,
      "step": 109
    },
    {
      "epoch": 0.22233451237998988,
      "grad_norm": 5.475664138793945,
      "learning_rate": 9.984113903310206e-05,
      "loss": 2.1317,
      "step": 110
    },
    {
      "epoch": 0.22435573521980798,
      "grad_norm": 5.313016414642334,
      "learning_rate": 9.98077996182722e-05,
      "loss": 2.0104,
      "step": 111
    },
    {
      "epoch": 0.22637695805962607,
      "grad_norm": 4.913154125213623,
      "learning_rate": 9.97712935367295e-05,
      "loss": 1.6801,
      "step": 112
    },
    {
      "epoch": 0.22839818089944416,
      "grad_norm": 5.323668479919434,
      "learning_rate": 9.973162310944768e-05,
      "loss": 1.8818,
      "step": 113
    },
    {
      "epoch": 0.23041940373926226,
      "grad_norm": 5.191682815551758,
      "learning_rate": 9.968879085858234e-05,
      "loss": 1.6902,
      "step": 114
    },
    {
      "epoch": 0.23244062657908035,
      "grad_norm": 5.358806133270264,
      "learning_rate": 9.964279950731066e-05,
      "loss": 1.5777,
      "step": 115
    },
    {
      "epoch": 0.23446184941889844,
      "grad_norm": 4.940889358520508,
      "learning_rate": 9.959365197965824e-05,
      "loss": 1.4829,
      "step": 116
    },
    {
      "epoch": 0.23648307225871654,
      "grad_norm": 5.8027191162109375,
      "learning_rate": 9.954135140031321e-05,
      "loss": 1.743,
      "step": 117
    },
    {
      "epoch": 0.2385042950985346,
      "grad_norm": 6.13554048538208,
      "learning_rate": 9.948590109442754e-05,
      "loss": 1.6588,
      "step": 118
    },
    {
      "epoch": 0.2405255179383527,
      "grad_norm": 5.018206596374512,
      "learning_rate": 9.942730458740568e-05,
      "loss": 1.4198,
      "step": 119
    },
    {
      "epoch": 0.2425467407781708,
      "grad_norm": 6.798793792724609,
      "learning_rate": 9.936556560468037e-05,
      "loss": 1.9842,
      "step": 120
    },
    {
      "epoch": 0.24456796361798888,
      "grad_norm": 5.7152323722839355,
      "learning_rate": 9.930068807147584e-05,
      "loss": 1.739,
      "step": 121
    },
    {
      "epoch": 0.24658918645780697,
      "grad_norm": 5.72061824798584,
      "learning_rate": 9.923267611255825e-05,
      "loss": 1.5452,
      "step": 122
    },
    {
      "epoch": 0.24861040929762507,
      "grad_norm": 5.586998462677002,
      "learning_rate": 9.916153405197332e-05,
      "loss": 1.5731,
      "step": 123
    },
    {
      "epoch": 0.25063163213744316,
      "grad_norm": 5.702105522155762,
      "learning_rate": 9.908726641277167e-05,
      "loss": 1.8093,
      "step": 124
    },
    {
      "epoch": 0.25265285497726125,
      "grad_norm": 6.596114158630371,
      "learning_rate": 9.9009877916721e-05,
      "loss": 1.6054,
      "step": 125
    },
    {
      "epoch": 0.25467407781707935,
      "grad_norm": 5.10860013961792,
      "learning_rate": 9.892937348400601e-05,
      "loss": 1.4648,
      "step": 126
    },
    {
      "epoch": 0.25669530065689744,
      "grad_norm": 6.096838474273682,
      "learning_rate": 9.88457582329156e-05,
      "loss": 1.5438,
      "step": 127
    },
    {
      "epoch": 0.25871652349671553,
      "grad_norm": 6.351302146911621,
      "learning_rate": 9.875903747951742e-05,
      "loss": 1.7681,
      "step": 128
    },
    {
      "epoch": 0.2607377463365336,
      "grad_norm": 5.202528476715088,
      "learning_rate": 9.866921673731992e-05,
      "loss": 1.2863,
      "step": 129
    },
    {
      "epoch": 0.2627589691763517,
      "grad_norm": 6.953995227813721,
      "learning_rate": 9.857630171692174e-05,
      "loss": 2.0508,
      "step": 130
    },
    {
      "epoch": 0.26478019201616976,
      "grad_norm": 6.509955883026123,
      "learning_rate": 9.848029832564875e-05,
      "loss": 1.6179,
      "step": 131
    },
    {
      "epoch": 0.26680141485598785,
      "grad_norm": 6.635447025299072,
      "learning_rate": 9.838121266717839e-05,
      "loss": 1.7545,
      "step": 132
    },
    {
      "epoch": 0.26882263769580594,
      "grad_norm": 7.2970099449157715,
      "learning_rate": 9.827905104115166e-05,
      "loss": 1.7671,
      "step": 133
    },
    {
      "epoch": 0.27084386053562404,
      "grad_norm": 5.802951335906982,
      "learning_rate": 9.817381994277261e-05,
      "loss": 1.4303,
      "step": 134
    },
    {
      "epoch": 0.27286508337544213,
      "grad_norm": 5.940200328826904,
      "learning_rate": 9.80655260623953e-05,
      "loss": 1.6777,
      "step": 135
    },
    {
      "epoch": 0.2748863062152602,
      "grad_norm": 6.702271938323975,
      "learning_rate": 9.795417628509857e-05,
      "loss": 1.7169,
      "step": 136
    },
    {
      "epoch": 0.2769075290550783,
      "grad_norm": 5.698646068572998,
      "learning_rate": 9.783977769024821e-05,
      "loss": 1.4602,
      "step": 137
    },
    {
      "epoch": 0.2789287518948964,
      "grad_norm": 7.149135589599609,
      "learning_rate": 9.772233755104694e-05,
      "loss": 1.6813,
      "step": 138
    },
    {
      "epoch": 0.2809499747347145,
      "grad_norm": 6.527937412261963,
      "learning_rate": 9.760186333407189e-05,
      "loss": 1.5575,
      "step": 139
    },
    {
      "epoch": 0.2829711975745326,
      "grad_norm": 9.34347152709961,
      "learning_rate": 9.747836269880003e-05,
      "loss": 1.6409,
      "step": 140
    },
    {
      "epoch": 0.2849924204143507,
      "grad_norm": 5.21145486831665,
      "learning_rate": 9.735184349712109e-05,
      "loss": 1.0628,
      "step": 141
    },
    {
      "epoch": 0.2870136432541688,
      "grad_norm": 5.601922988891602,
      "learning_rate": 9.722231377283841e-05,
      "loss": 1.2334,
      "step": 142
    },
    {
      "epoch": 0.2890348660939869,
      "grad_norm": 6.080234050750732,
      "learning_rate": 9.708978176115751e-05,
      "loss": 1.1516,
      "step": 143
    },
    {
      "epoch": 0.29105608893380497,
      "grad_norm": 6.611027717590332,
      "learning_rate": 9.695425588816249e-05,
      "loss": 1.3611,
      "step": 144
    },
    {
      "epoch": 0.29307731177362306,
      "grad_norm": 5.696514129638672,
      "learning_rate": 9.681574477028039e-05,
      "loss": 1.0987,
      "step": 145
    },
    {
      "epoch": 0.29509853461344115,
      "grad_norm": 8.035621643066406,
      "learning_rate": 9.667425721373332e-05,
      "loss": 1.7412,
      "step": 146
    },
    {
      "epoch": 0.29711975745325925,
      "grad_norm": 8.804828643798828,
      "learning_rate": 9.65298022139786e-05,
      "loss": 1.7839,
      "step": 147
    },
    {
      "epoch": 0.2991409802930773,
      "grad_norm": 6.8422064781188965,
      "learning_rate": 9.638238895513687e-05,
      "loss": 1.3999,
      "step": 148
    },
    {
      "epoch": 0.3011622031328954,
      "grad_norm": 9.455986022949219,
      "learning_rate": 9.623202680940811e-05,
      "loss": 1.8304,
      "step": 149
    },
    {
      "epoch": 0.30318342597271347,
      "grad_norm": 7.243097305297852,
      "learning_rate": 9.607872533647584e-05,
      "loss": 1.2319,
      "step": 150
    },
    {
      "epoch": 0.30520464881253156,
      "grad_norm": 3.3011507987976074,
      "learning_rate": 9.592249428289934e-05,
      "loss": 1.2937,
      "step": 151
    },
    {
      "epoch": 0.30722587165234966,
      "grad_norm": 3.4213240146636963,
      "learning_rate": 9.5763343581494e-05,
      "loss": 1.4224,
      "step": 152
    },
    {
      "epoch": 0.30924709449216775,
      "grad_norm": 2.8826541900634766,
      "learning_rate": 9.56012833506997e-05,
      "loss": 1.0413,
      "step": 153
    },
    {
      "epoch": 0.31126831733198584,
      "grad_norm": 3.137408494949341,
      "learning_rate": 9.543632389393767e-05,
      "loss": 1.3462,
      "step": 154
    },
    {
      "epoch": 0.31328954017180394,
      "grad_norm": 3.6072700023651123,
      "learning_rate": 9.52684756989553e-05,
      "loss": 1.4831,
      "step": 155
    },
    {
      "epoch": 0.31531076301162203,
      "grad_norm": 3.7660746574401855,
      "learning_rate": 9.509774943715939e-05,
      "loss": 1.6384,
      "step": 156
    },
    {
      "epoch": 0.3173319858514401,
      "grad_norm": 4.279458522796631,
      "learning_rate": 9.492415596293769e-05,
      "loss": 1.8637,
      "step": 157
    },
    {
      "epoch": 0.3193532086912582,
      "grad_norm": 3.9801113605499268,
      "learning_rate": 9.474770631296881e-05,
      "loss": 1.47,
      "step": 158
    },
    {
      "epoch": 0.3213744315310763,
      "grad_norm": 4.086119174957275,
      "learning_rate": 9.456841170552053e-05,
      "loss": 1.7968,
      "step": 159
    },
    {
      "epoch": 0.3233956543708944,
      "grad_norm": 3.968665838241577,
      "learning_rate": 9.438628353973653e-05,
      "loss": 1.4681,
      "step": 160
    },
    {
      "epoch": 0.3254168772107125,
      "grad_norm": 4.774650573730469,
      "learning_rate": 9.420133339491171e-05,
      "loss": 1.9598,
      "step": 161
    },
    {
      "epoch": 0.3274381000505306,
      "grad_norm": 4.956033706665039,
      "learning_rate": 9.401357302975599e-05,
      "loss": 1.7753,
      "step": 162
    },
    {
      "epoch": 0.3294593228903487,
      "grad_norm": 4.571745872497559,
      "learning_rate": 9.382301438164672e-05,
      "loss": 1.5018,
      "step": 163
    },
    {
      "epoch": 0.3314805457301668,
      "grad_norm": 4.879904747009277,
      "learning_rate": 9.362966956586969e-05,
      "loss": 1.795,
      "step": 164
    },
    {
      "epoch": 0.3335017685699848,
      "grad_norm": 4.576776027679443,
      "learning_rate": 9.343355087484894e-05,
      "loss": 1.6005,
      "step": 165
    },
    {
      "epoch": 0.3355229914098029,
      "grad_norm": 4.379726409912109,
      "learning_rate": 9.323467077736511e-05,
      "loss": 1.3807,
      "step": 166
    },
    {
      "epoch": 0.337544214249621,
      "grad_norm": 5.608097553253174,
      "learning_rate": 9.303304191776291e-05,
      "loss": 1.7641,
      "step": 167
    },
    {
      "epoch": 0.3395654370894391,
      "grad_norm": 4.803214073181152,
      "learning_rate": 9.282867711514702e-05,
      "loss": 1.5029,
      "step": 168
    },
    {
      "epoch": 0.3415866599292572,
      "grad_norm": 5.842015266418457,
      "learning_rate": 9.262158936256717e-05,
      "loss": 1.9833,
      "step": 169
    },
    {
      "epoch": 0.3436078827690753,
      "grad_norm": 5.395236968994141,
      "learning_rate": 9.241179182619206e-05,
      "loss": 1.6689,
      "step": 170
    },
    {
      "epoch": 0.34562910560889337,
      "grad_norm": 6.057033061981201,
      "learning_rate": 9.219929784447231e-05,
      "loss": 1.9053,
      "step": 171
    },
    {
      "epoch": 0.34765032844871147,
      "grad_norm": 5.41602897644043,
      "learning_rate": 9.19841209272924e-05,
      "loss": 1.56,
      "step": 172
    },
    {
      "epoch": 0.34967155128852956,
      "grad_norm": 5.450588703155518,
      "learning_rate": 9.17662747551117e-05,
      "loss": 1.5914,
      "step": 173
    },
    {
      "epoch": 0.35169277412834765,
      "grad_norm": 5.975580215454102,
      "learning_rate": 9.154577317809482e-05,
      "loss": 1.7152,
      "step": 174
    },
    {
      "epoch": 0.35371399696816574,
      "grad_norm": 5.361056804656982,
      "learning_rate": 9.132263021523096e-05,
      "loss": 1.804,
      "step": 175
    },
    {
      "epoch": 0.35573521980798384,
      "grad_norm": 5.825102806091309,
      "learning_rate": 9.109686005344258e-05,
      "loss": 1.5358,
      "step": 176
    },
    {
      "epoch": 0.35775644264780193,
      "grad_norm": 5.946747779846191,
      "learning_rate": 9.086847704668351e-05,
      "loss": 1.964,
      "step": 177
    },
    {
      "epoch": 0.35977766548762,
      "grad_norm": 5.494548320770264,
      "learning_rate": 9.063749571502634e-05,
      "loss": 1.4712,
      "step": 178
    },
    {
      "epoch": 0.3617988883274381,
      "grad_norm": 5.114996433258057,
      "learning_rate": 9.040393074373921e-05,
      "loss": 1.5819,
      "step": 179
    },
    {
      "epoch": 0.3638201111672562,
      "grad_norm": 5.512078285217285,
      "learning_rate": 9.016779698235227e-05,
      "loss": 1.6159,
      "step": 180
    },
    {
      "epoch": 0.3658413340070743,
      "grad_norm": 5.07801628112793,
      "learning_rate": 8.992910944371342e-05,
      "loss": 1.4474,
      "step": 181
    },
    {
      "epoch": 0.3678625568468924,
      "grad_norm": 6.879641056060791,
      "learning_rate": 8.9687883303034e-05,
      "loss": 1.7858,
      "step": 182
    },
    {
      "epoch": 0.36988377968671043,
      "grad_norm": 5.092156887054443,
      "learning_rate": 8.94441338969238e-05,
      "loss": 1.4168,
      "step": 183
    },
    {
      "epoch": 0.3719050025265285,
      "grad_norm": 5.373355865478516,
      "learning_rate": 8.919787672241619e-05,
      "loss": 1.5532,
      "step": 184
    },
    {
      "epoch": 0.3739262253663466,
      "grad_norm": 5.264693737030029,
      "learning_rate": 8.894912743598268e-05,
      "loss": 1.3256,
      "step": 185
    },
    {
      "epoch": 0.3759474482061647,
      "grad_norm": 5.930539608001709,
      "learning_rate": 8.869790185253766e-05,
      "loss": 1.6318,
      "step": 186
    },
    {
      "epoch": 0.3779686710459828,
      "grad_norm": 6.245593547821045,
      "learning_rate": 8.84442159444328e-05,
      "loss": 1.6553,
      "step": 187
    },
    {
      "epoch": 0.3799898938858009,
      "grad_norm": 6.605845928192139,
      "learning_rate": 8.818808584044162e-05,
      "loss": 1.1972,
      "step": 188
    },
    {
      "epoch": 0.382011116725619,
      "grad_norm": 5.590306758880615,
      "learning_rate": 8.792952782473413e-05,
      "loss": 1.2016,
      "step": 189
    },
    {
      "epoch": 0.3840323395654371,
      "grad_norm": 6.213796138763428,
      "learning_rate": 8.76685583358414e-05,
      "loss": 1.393,
      "step": 190
    },
    {
      "epoch": 0.3860535624052552,
      "grad_norm": 6.904628753662109,
      "learning_rate": 8.740519396561044e-05,
      "loss": 1.5803,
      "step": 191
    },
    {
      "epoch": 0.3880747852450733,
      "grad_norm": 6.069091796875,
      "learning_rate": 8.713945145814946e-05,
      "loss": 1.4332,
      "step": 192
    },
    {
      "epoch": 0.39009600808489137,
      "grad_norm": 6.582734107971191,
      "learning_rate": 8.687134770876319e-05,
      "loss": 1.3049,
      "step": 193
    },
    {
      "epoch": 0.39211723092470946,
      "grad_norm": 5.841890811920166,
      "learning_rate": 8.660089976287875e-05,
      "loss": 1.3647,
      "step": 194
    },
    {
      "epoch": 0.39413845376452755,
      "grad_norm": 5.880384922027588,
      "learning_rate": 8.632812481496195e-05,
      "loss": 1.37,
      "step": 195
    },
    {
      "epoch": 0.39615967660434565,
      "grad_norm": 5.35994291305542,
      "learning_rate": 8.60530402074241e-05,
      "loss": 1.103,
      "step": 196
    },
    {
      "epoch": 0.39818089944416374,
      "grad_norm": 5.617462635040283,
      "learning_rate": 8.577566342951943e-05,
      "loss": 0.9969,
      "step": 197
    },
    {
      "epoch": 0.40020212228398183,
      "grad_norm": 5.336188793182373,
      "learning_rate": 8.549601211623316e-05,
      "loss": 0.9578,
      "step": 198
    },
    {
      "epoch": 0.4022233451237999,
      "grad_norm": 7.3241047859191895,
      "learning_rate": 8.521410404716028e-05,
      "loss": 1.4591,
      "step": 199
    },
    {
      "epoch": 0.40424456796361796,
      "grad_norm": 5.509421348571777,
      "learning_rate": 8.492995714537518e-05,
      "loss": 1.113,
      "step": 200
    },
    {
      "epoch": 0.40626579080343606,
      "grad_norm": 2.5618975162506104,
      "learning_rate": 8.464358947629218e-05,
      "loss": 1.4167,
      "step": 201
    },
    {
      "epoch": 0.40828701364325415,
      "grad_norm": 3.311715602874756,
      "learning_rate": 8.435501924651691e-05,
      "loss": 1.3956,
      "step": 202
    },
    {
      "epoch": 0.41030823648307224,
      "grad_norm": 2.9615299701690674,
      "learning_rate": 8.406426480268881e-05,
      "loss": 1.1263,
      "step": 203
    },
    {
      "epoch": 0.41232945932289033,
      "grad_norm": 3.3075735569000244,
      "learning_rate": 8.377134463031469e-05,
      "loss": 1.3577,
      "step": 204
    },
    {
      "epoch": 0.41435068216270843,
      "grad_norm": 3.4304211139678955,
      "learning_rate": 8.347627735259343e-05,
      "loss": 1.6109,
      "step": 205
    },
    {
      "epoch": 0.4163719050025265,
      "grad_norm": 3.1346213817596436,
      "learning_rate": 8.317908172923205e-05,
      "loss": 1.1557,
      "step": 206
    },
    {
      "epoch": 0.4183931278423446,
      "grad_norm": 3.6796648502349854,
      "learning_rate": 8.287977665525292e-05,
      "loss": 1.3844,
      "step": 207
    },
    {
      "epoch": 0.4204143506821627,
      "grad_norm": 3.5582165718078613,
      "learning_rate": 8.257838115979244e-05,
      "loss": 1.4043,
      "step": 208
    },
    {
      "epoch": 0.4224355735219808,
      "grad_norm": 4.690045356750488,
      "learning_rate": 8.227491440489133e-05,
      "loss": 1.9057,
      "step": 209
    },
    {
      "epoch": 0.4244567963617989,
      "grad_norm": 4.603220462799072,
      "learning_rate": 8.196939568427624e-05,
      "loss": 2.0138,
      "step": 210
    },
    {
      "epoch": 0.426478019201617,
      "grad_norm": 4.161036014556885,
      "learning_rate": 8.166184442213313e-05,
      "loss": 1.5959,
      "step": 211
    },
    {
      "epoch": 0.4284992420414351,
      "grad_norm": 4.600170612335205,
      "learning_rate": 8.135228017187237e-05,
      "loss": 1.656,
      "step": 212
    },
    {
      "epoch": 0.4305204648812532,
      "grad_norm": 4.487064838409424,
      "learning_rate": 8.10407226148855e-05,
      "loss": 1.8356,
      "step": 213
    },
    {
      "epoch": 0.43254168772107127,
      "grad_norm": 4.709319591522217,
      "learning_rate": 8.0727191559294e-05,
      "loss": 1.6691,
      "step": 214
    },
    {
      "epoch": 0.43456291056088936,
      "grad_norm": 4.594611644744873,
      "learning_rate": 8.041170693868985e-05,
      "loss": 1.5736,
      "step": 215
    },
    {
      "epoch": 0.43658413340070745,
      "grad_norm": 4.373988628387451,
      "learning_rate": 8.009428881086835e-05,
      "loss": 1.5365,
      "step": 216
    },
    {
      "epoch": 0.4386053562405255,
      "grad_norm": 4.379314422607422,
      "learning_rate": 7.977495735655272e-05,
      "loss": 1.3149,
      "step": 217
    },
    {
      "epoch": 0.4406265790803436,
      "grad_norm": 5.018416881561279,
      "learning_rate": 7.945373287811116e-05,
      "loss": 1.6653,
      "step": 218
    },
    {
      "epoch": 0.4426478019201617,
      "grad_norm": 5.0816969871521,
      "learning_rate": 7.913063579826601e-05,
      "loss": 1.6499,
      "step": 219
    },
    {
      "epoch": 0.44466902475997977,
      "grad_norm": 4.877917289733887,
      "learning_rate": 7.880568665879542e-05,
      "loss": 1.6308,
      "step": 220
    },
    {
      "epoch": 0.44669024759979786,
      "grad_norm": 4.950286388397217,
      "learning_rate": 7.847890611922721e-05,
      "loss": 1.4406,
      "step": 221
    },
    {
      "epoch": 0.44871147043961596,
      "grad_norm": 5.2699480056762695,
      "learning_rate": 7.815031495552549e-05,
      "loss": 1.5366,
      "step": 222
    },
    {
      "epoch": 0.45073269327943405,
      "grad_norm": 5.018312931060791,
      "learning_rate": 7.781993405876972e-05,
      "loss": 1.3646,
      "step": 223
    },
    {
      "epoch": 0.45275391611925214,
      "grad_norm": 5.903739929199219,
      "learning_rate": 7.748778443382658e-05,
      "loss": 1.713,
      "step": 224
    },
    {
      "epoch": 0.45477513895907024,
      "grad_norm": 5.679375648498535,
      "learning_rate": 7.715388719801438e-05,
      "loss": 1.6164,
      "step": 225
    },
    {
      "epoch": 0.45679636179888833,
      "grad_norm": 5.779827117919922,
      "learning_rate": 7.68182635797606e-05,
      "loss": 1.8518,
      "step": 226
    },
    {
      "epoch": 0.4588175846387064,
      "grad_norm": 5.680889129638672,
      "learning_rate": 7.648093491725223e-05,
      "loss": 1.6413,
      "step": 227
    },
    {
      "epoch": 0.4608388074785245,
      "grad_norm": 6.258683681488037,
      "learning_rate": 7.614192265707905e-05,
      "loss": 1.4253,
      "step": 228
    },
    {
      "epoch": 0.4628600303183426,
      "grad_norm": 6.006860733032227,
      "learning_rate": 7.580124835287013e-05,
      "loss": 1.5382,
      "step": 229
    },
    {
      "epoch": 0.4648812531581607,
      "grad_norm": 5.656060695648193,
      "learning_rate": 7.545893366392358e-05,
      "loss": 1.491,
      "step": 230
    },
    {
      "epoch": 0.4669024759979788,
      "grad_norm": 6.140045642852783,
      "learning_rate": 7.511500035382942e-05,
      "loss": 1.754,
      "step": 231
    },
    {
      "epoch": 0.4689236988377969,
      "grad_norm": 6.063406467437744,
      "learning_rate": 7.476947028908594e-05,
      "loss": 1.5948,
      "step": 232
    },
    {
      "epoch": 0.470944921677615,
      "grad_norm": 5.987018585205078,
      "learning_rate": 7.442236543770944e-05,
      "loss": 1.6062,
      "step": 233
    },
    {
      "epoch": 0.4729661445174331,
      "grad_norm": 5.543860912322998,
      "learning_rate": 7.407370786783757e-05,
      "loss": 1.5026,
      "step": 234
    },
    {
      "epoch": 0.4749873673572511,
      "grad_norm": 5.393790245056152,
      "learning_rate": 7.372351974632634e-05,
      "loss": 1.5174,
      "step": 235
    },
    {
      "epoch": 0.4770085901970692,
      "grad_norm": 6.32440710067749,
      "learning_rate": 7.33718233373407e-05,
      "loss": 1.6588,
      "step": 236
    },
    {
      "epoch": 0.4790298130368873,
      "grad_norm": 5.640255928039551,
      "learning_rate": 7.301864100093912e-05,
      "loss": 1.3006,
      "step": 237
    },
    {
      "epoch": 0.4810510358767054,
      "grad_norm": 5.789175033569336,
      "learning_rate": 7.266399519165192e-05,
      "loss": 1.5062,
      "step": 238
    },
    {
      "epoch": 0.4830722587165235,
      "grad_norm": 5.77946662902832,
      "learning_rate": 7.230790845705379e-05,
      "loss": 1.3896,
      "step": 239
    },
    {
      "epoch": 0.4850934815563416,
      "grad_norm": 5.353186130523682,
      "learning_rate": 7.195040343633007e-05,
      "loss": 1.321,
      "step": 240
    },
    {
      "epoch": 0.48711470439615967,
      "grad_norm": 7.649440765380859,
      "learning_rate": 7.159150285883756e-05,
      "loss": 2.033,
      "step": 241
    },
    {
      "epoch": 0.48913592723597776,
      "grad_norm": 5.809901237487793,
      "learning_rate": 7.123122954265941e-05,
      "loss": 1.2447,
      "step": 242
    },
    {
      "epoch": 0.49115715007579586,
      "grad_norm": 5.7666215896606445,
      "learning_rate": 7.086960639315436e-05,
      "loss": 1.3123,
      "step": 243
    },
    {
      "epoch": 0.49317837291561395,
      "grad_norm": 6.335599422454834,
      "learning_rate": 7.050665640150045e-05,
      "loss": 1.6371,
      "step": 244
    },
    {
      "epoch": 0.49519959575543204,
      "grad_norm": 5.487225532531738,
      "learning_rate": 7.014240264323334e-05,
      "loss": 1.2528,
      "step": 245
    },
    {
      "epoch": 0.49722081859525014,
      "grad_norm": 5.477118968963623,
      "learning_rate": 6.977686827677926e-05,
      "loss": 1.1761,
      "step": 246
    },
    {
      "epoch": 0.49924204143506823,
      "grad_norm": 4.509613037109375,
      "learning_rate": 6.941007654198254e-05,
      "loss": 1.0277,
      "step": 247
    },
    {
      "epoch": 0.5012632642748863,
      "grad_norm": 7.54964542388916,
      "learning_rate": 6.904205075862816e-05,
      "loss": 1.6264,
      "step": 248
    },
    {
      "epoch": 0.5032844871147044,
      "grad_norm": 7.848708152770996,
      "learning_rate": 6.867281432495912e-05,
      "loss": 1.4215,
      "step": 249
    },
    {
      "epoch": 0.5053057099545225,
      "grad_norm": 7.514161109924316,
      "learning_rate": 6.830239071618873e-05,
      "loss": 1.4708,
      "step": 250
    },
    {
      "epoch": 0.5073269327943406,
      "grad_norm": 2.8156673908233643,
      "learning_rate": 6.793080348300833e-05,
      "loss": 1.4503,
      "step": 251
    },
    {
      "epoch": 0.5093481556341587,
      "grad_norm": 2.978626012802124,
      "learning_rate": 6.755807625008974e-05,
      "loss": 1.17,
      "step": 252
    },
    {
      "epoch": 0.5113693784739768,
      "grad_norm": 3.320791482925415,
      "learning_rate": 6.718423271458343e-05,
      "loss": 1.4699,
      "step": 253
    },
    {
      "epoch": 0.5133906013137949,
      "grad_norm": 3.1808319091796875,
      "learning_rate": 6.680929664461185e-05,
      "loss": 1.2698,
      "step": 254
    },
    {
      "epoch": 0.515411824153613,
      "grad_norm": 3.3711514472961426,
      "learning_rate": 6.643329187775827e-05,
      "loss": 1.5507,
      "step": 255
    },
    {
      "epoch": 0.5174330469934311,
      "grad_norm": 3.6619150638580322,
      "learning_rate": 6.605624231955131e-05,
      "loss": 1.6664,
      "step": 256
    },
    {
      "epoch": 0.5194542698332492,
      "grad_norm": 4.01786994934082,
      "learning_rate": 6.567817194194507e-05,
      "loss": 1.7517,
      "step": 257
    },
    {
      "epoch": 0.5214754926730673,
      "grad_norm": 4.117989540100098,
      "learning_rate": 6.529910478179499e-05,
      "loss": 1.7831,
      "step": 258
    },
    {
      "epoch": 0.5234967155128853,
      "grad_norm": 3.7679736614227295,
      "learning_rate": 6.491906493932968e-05,
      "loss": 1.4514,
      "step": 259
    },
    {
      "epoch": 0.5255179383527034,
      "grad_norm": 4.316635608673096,
      "learning_rate": 6.45380765766187e-05,
      "loss": 1.6405,
      "step": 260
    },
    {
      "epoch": 0.5275391611925214,
      "grad_norm": 3.958988904953003,
      "learning_rate": 6.415616391603638e-05,
      "loss": 1.5774,
      "step": 261
    },
    {
      "epoch": 0.5295603840323395,
      "grad_norm": 4.332874774932861,
      "learning_rate": 6.377335123872177e-05,
      "loss": 1.8736,
      "step": 262
    },
    {
      "epoch": 0.5315816068721576,
      "grad_norm": 4.065393447875977,
      "learning_rate": 6.338966288303499e-05,
      "loss": 1.5071,
      "step": 263
    },
    {
      "epoch": 0.5336028297119757,
      "grad_norm": 4.553988456726074,
      "learning_rate": 6.300512324300975e-05,
      "loss": 1.806,
      "step": 264
    },
    {
      "epoch": 0.5356240525517938,
      "grad_norm": 4.563177108764648,
      "learning_rate": 6.261975676680252e-05,
      "loss": 1.567,
      "step": 265
    },
    {
      "epoch": 0.5376452753916119,
      "grad_norm": 4.3816142082214355,
      "learning_rate": 6.223358795513812e-05,
      "loss": 1.6037,
      "step": 266
    },
    {
      "epoch": 0.53966649823143,
      "grad_norm": 4.977108001708984,
      "learning_rate": 6.184664135975203e-05,
      "loss": 1.8076,
      "step": 267
    },
    {
      "epoch": 0.5416877210712481,
      "grad_norm": 4.56311559677124,
      "learning_rate": 6.145894158182944e-05,
      "loss": 1.6309,
      "step": 268
    },
    {
      "epoch": 0.5437089439110662,
      "grad_norm": 5.014670372009277,
      "learning_rate": 6.107051327044124e-05,
      "loss": 1.6022,
      "step": 269
    },
    {
      "epoch": 0.5457301667508843,
      "grad_norm": 4.538066864013672,
      "learning_rate": 6.068138112097674e-05,
      "loss": 1.494,
      "step": 270
    },
    {
      "epoch": 0.5477513895907024,
      "grad_norm": 5.387009143829346,
      "learning_rate": 6.029156987357373e-05,
      "loss": 1.7367,
      "step": 271
    },
    {
      "epoch": 0.5497726124305204,
      "grad_norm": 4.673946857452393,
      "learning_rate": 5.9901104311545487e-05,
      "loss": 1.6585,
      "step": 272
    },
    {
      "epoch": 0.5517938352703385,
      "grad_norm": 5.331272125244141,
      "learning_rate": 5.9510009259805085e-05,
      "loss": 1.7205,
      "step": 273
    },
    {
      "epoch": 0.5538150581101566,
      "grad_norm": 5.224307537078857,
      "learning_rate": 5.91183095832872e-05,
      "loss": 1.8472,
      "step": 274
    },
    {
      "epoch": 0.5558362809499747,
      "grad_norm": 4.787731170654297,
      "learning_rate": 5.872603018536713e-05,
      "loss": 1.5981,
      "step": 275
    },
    {
      "epoch": 0.5578575037897928,
      "grad_norm": 5.033946990966797,
      "learning_rate": 5.833319600627753e-05,
      "loss": 1.5519,
      "step": 276
    },
    {
      "epoch": 0.5598787266296109,
      "grad_norm": 5.653214931488037,
      "learning_rate": 5.793983202152282e-05,
      "loss": 1.9657,
      "step": 277
    },
    {
      "epoch": 0.561899949469429,
      "grad_norm": 5.225715160369873,
      "learning_rate": 5.7545963240291246e-05,
      "loss": 1.4663,
      "step": 278
    },
    {
      "epoch": 0.5639211723092471,
      "grad_norm": 5.952142715454102,
      "learning_rate": 5.715161470386485e-05,
      "loss": 1.8356,
      "step": 279
    },
    {
      "epoch": 0.5659423951490652,
      "grad_norm": 5.057675361633301,
      "learning_rate": 5.6756811484027425e-05,
      "loss": 1.6058,
      "step": 280
    },
    {
      "epoch": 0.5679636179888833,
      "grad_norm": 4.865301132202148,
      "learning_rate": 5.636157868147054e-05,
      "loss": 1.2382,
      "step": 281
    },
    {
      "epoch": 0.5699848408287014,
      "grad_norm": 5.245824337005615,
      "learning_rate": 5.596594142419759e-05,
      "loss": 1.4634,
      "step": 282
    },
    {
      "epoch": 0.5720060636685195,
      "grad_norm": 5.30856990814209,
      "learning_rate": 5.556992486592634e-05,
      "loss": 1.5013,
      "step": 283
    },
    {
      "epoch": 0.5740272865083376,
      "grad_norm": 6.301365375518799,
      "learning_rate": 5.517355418448961e-05,
      "loss": 1.683,
      "step": 284
    },
    {
      "epoch": 0.5760485093481557,
      "grad_norm": 5.439041614532471,
      "learning_rate": 5.477685458023459e-05,
      "loss": 1.4477,
      "step": 285
    },
    {
      "epoch": 0.5780697321879738,
      "grad_norm": 5.788546085357666,
      "learning_rate": 5.437985127442065e-05,
      "loss": 1.5466,
      "step": 286
    },
    {
      "epoch": 0.5800909550277918,
      "grad_norm": 4.990469932556152,
      "learning_rate": 5.3982569507615775e-05,
      "loss": 1.4082,
      "step": 287
    },
    {
      "epoch": 0.5821121778676099,
      "grad_norm": 6.371885776519775,
      "learning_rate": 5.3585034538091885e-05,
      "loss": 1.5582,
      "step": 288
    },
    {
      "epoch": 0.584133400707428,
      "grad_norm": 5.770207405090332,
      "learning_rate": 5.318727164021896e-05,
      "loss": 1.6081,
      "step": 289
    },
    {
      "epoch": 0.5861546235472461,
      "grad_norm": 5.396596908569336,
      "learning_rate": 5.278930610285813e-05,
      "loss": 1.1804,
      "step": 290
    },
    {
      "epoch": 0.5881758463870642,
      "grad_norm": 5.193579196929932,
      "learning_rate": 5.239116322775391e-05,
      "loss": 1.1155,
      "step": 291
    },
    {
      "epoch": 0.5901970692268823,
      "grad_norm": 6.787877559661865,
      "learning_rate": 5.1992868327925526e-05,
      "loss": 1.7875,
      "step": 292
    },
    {
      "epoch": 0.5922182920667004,
      "grad_norm": 5.3459696769714355,
      "learning_rate": 5.159444672605759e-05,
      "loss": 1.3469,
      "step": 293
    },
    {
      "epoch": 0.5942395149065185,
      "grad_norm": 5.520552635192871,
      "learning_rate": 5.119592375289015e-05,
      "loss": 1.187,
      "step": 294
    },
    {
      "epoch": 0.5962607377463366,
      "grad_norm": 5.6787519454956055,
      "learning_rate": 5.079732474560821e-05,
      "loss": 1.4493,
      "step": 295
    },
    {
      "epoch": 0.5982819605861546,
      "grad_norm": 5.608784198760986,
      "learning_rate": 5.0398675046230835e-05,
      "loss": 1.2803,
      "step": 296
    },
    {
      "epoch": 0.6003031834259727,
      "grad_norm": 4.6050872802734375,
      "learning_rate": 5e-05,
      "loss": 0.8699,
      "step": 297
    },
    {
      "epoch": 0.6023244062657908,
      "grad_norm": 5.878370761871338,
      "learning_rate": 4.960132495376918e-05,
      "loss": 1.3753,
      "step": 298
    },
    {
      "epoch": 0.6043456291056089,
      "grad_norm": 6.23378324508667,
      "learning_rate": 4.92026752543918e-05,
      "loss": 1.3375,
      "step": 299
    },
    {
      "epoch": 0.6063668519454269,
      "grad_norm": 8.689998626708984,
      "learning_rate": 4.8804076247109865e-05,
      "loss": 1.3833,
      "step": 300
    },
    {
      "epoch": 0.608388074785245,
      "grad_norm": 2.740164279937744,
      "learning_rate": 4.840555327394241e-05,
      "loss": 1.2242,
      "step": 301
    },
    {
      "epoch": 0.6104092976250631,
      "grad_norm": 2.697038173675537,
      "learning_rate": 4.800713167207449e-05,
      "loss": 1.2152,
      "step": 302
    },
    {
      "epoch": 0.6124305204648812,
      "grad_norm": 3.111619234085083,
      "learning_rate": 4.760883677224609e-05,
      "loss": 1.4117,
      "step": 303
    },
    {
      "epoch": 0.6144517433046993,
      "grad_norm": 2.8779137134552,
      "learning_rate": 4.721069389714188e-05,
      "loss": 1.1105,
      "step": 304
    },
    {
      "epoch": 0.6164729661445174,
      "grad_norm": 3.4639029502868652,
      "learning_rate": 4.681272835978107e-05,
      "loss": 1.4196,
      "step": 305
    },
    {
      "epoch": 0.6184941889843355,
      "grad_norm": 4.024080753326416,
      "learning_rate": 4.6414965461908126e-05,
      "loss": 1.9051,
      "step": 306
    },
    {
      "epoch": 0.6205154118241536,
      "grad_norm": 3.319389581680298,
      "learning_rate": 4.601743049238424e-05,
      "loss": 1.3579,
      "step": 307
    },
    {
      "epoch": 0.6225366346639717,
      "grad_norm": 4.286203384399414,
      "learning_rate": 4.562014872557935e-05,
      "loss": 1.8763,
      "step": 308
    },
    {
      "epoch": 0.6245578575037898,
      "grad_norm": 4.204199314117432,
      "learning_rate": 4.522314541976541e-05,
      "loss": 1.6859,
      "step": 309
    },
    {
      "epoch": 0.6265790803436079,
      "grad_norm": 4.099395751953125,
      "learning_rate": 4.482644581551039e-05,
      "loss": 1.5438,
      "step": 310
    },
    {
      "epoch": 0.628600303183426,
      "grad_norm": 4.58848237991333,
      "learning_rate": 4.443007513407368e-05,
      "loss": 1.9432,
      "step": 311
    },
    {
      "epoch": 0.6306215260232441,
      "grad_norm": 4.800745010375977,
      "learning_rate": 4.4034058575802424e-05,
      "loss": 1.7121,
      "step": 312
    },
    {
      "epoch": 0.6326427488630622,
      "grad_norm": 4.463362216949463,
      "learning_rate": 4.3638421318529474e-05,
      "loss": 1.6288,
      "step": 313
    },
    {
      "epoch": 0.6346639717028802,
      "grad_norm": 4.3990864753723145,
      "learning_rate": 4.324318851597258e-05,
      "loss": 1.5733,
      "step": 314
    },
    {
      "epoch": 0.6366851945426983,
      "grad_norm": 4.997748851776123,
      "learning_rate": 4.284838529613516e-05,
      "loss": 1.9203,
      "step": 315
    },
    {
      "epoch": 0.6387064173825164,
      "grad_norm": 5.128242015838623,
      "learning_rate": 4.2454036759708765e-05,
      "loss": 1.8486,
      "step": 316
    },
    {
      "epoch": 0.6407276402223345,
      "grad_norm": 4.875668525695801,
      "learning_rate": 4.2060167978477184e-05,
      "loss": 1.5951,
      "step": 317
    },
    {
      "epoch": 0.6427488630621526,
      "grad_norm": 4.408964157104492,
      "learning_rate": 4.166680399372248e-05,
      "loss": 1.3977,
      "step": 318
    },
    {
      "epoch": 0.6447700859019707,
      "grad_norm": 4.5807905197143555,
      "learning_rate": 4.1273969814632894e-05,
      "loss": 1.4649,
      "step": 319
    },
    {
      "epoch": 0.6467913087417888,
      "grad_norm": 5.094422817230225,
      "learning_rate": 4.0881690416712805e-05,
      "loss": 1.6607,
      "step": 320
    },
    {
      "epoch": 0.6488125315816069,
      "grad_norm": 4.936136722564697,
      "learning_rate": 4.0489990740194926e-05,
      "loss": 1.6117,
      "step": 321
    },
    {
      "epoch": 0.650833754421425,
      "grad_norm": 5.264697074890137,
      "learning_rate": 4.009889568845453e-05,
      "loss": 1.6412,
      "step": 322
    },
    {
      "epoch": 0.6528549772612431,
      "grad_norm": 4.067869663238525,
      "learning_rate": 3.9708430126426284e-05,
      "loss": 1.2319,
      "step": 323
    },
    {
      "epoch": 0.6548762001010612,
      "grad_norm": 5.502519607543945,
      "learning_rate": 3.9318618879023256e-05,
      "loss": 1.6435,
      "step": 324
    },
    {
      "epoch": 0.6568974229408793,
      "grad_norm": 5.370153903961182,
      "learning_rate": 3.892948672955877e-05,
      "loss": 1.6528,
      "step": 325
    },
    {
      "epoch": 0.6589186457806974,
      "grad_norm": 4.501315593719482,
      "learning_rate": 3.854105841817056e-05,
      "loss": 1.4033,
      "step": 326
    },
    {
      "epoch": 0.6609398686205155,
      "grad_norm": 5.458628177642822,
      "learning_rate": 3.815335864024799e-05,
      "loss": 1.7448,
      "step": 327
    },
    {
      "epoch": 0.6629610914603336,
      "grad_norm": 5.266726016998291,
      "learning_rate": 3.776641204486191e-05,
      "loss": 1.6844,
      "step": 328
    },
    {
      "epoch": 0.6649823143001516,
      "grad_norm": 4.972016334533691,
      "learning_rate": 3.738024323319749e-05,
      "loss": 1.66,
      "step": 329
    },
    {
      "epoch": 0.6670035371399696,
      "grad_norm": 4.560551643371582,
      "learning_rate": 3.699487675699026e-05,
      "loss": 1.2507,
      "step": 330
    },
    {
      "epoch": 0.6690247599797877,
      "grad_norm": 5.447690010070801,
      "learning_rate": 3.661033711696501e-05,
      "loss": 1.4381,
      "step": 331
    },
    {
      "epoch": 0.6710459828196058,
      "grad_norm": 6.1798787117004395,
      "learning_rate": 3.6226648761278235e-05,
      "loss": 1.6519,
      "step": 332
    },
    {
      "epoch": 0.6730672056594239,
      "grad_norm": 8.073100090026855,
      "learning_rate": 3.584383608396362e-05,
      "loss": 1.5615,
      "step": 333
    },
    {
      "epoch": 0.675088428499242,
      "grad_norm": 6.568238735198975,
      "learning_rate": 3.546192342338131e-05,
      "loss": 1.5244,
      "step": 334
    },
    {
      "epoch": 0.6771096513390601,
      "grad_norm": 5.141592979431152,
      "learning_rate": 3.508093506067034e-05,
      "loss": 1.3669,
      "step": 335
    },
    {
      "epoch": 0.6791308741788782,
      "grad_norm": 5.7515950202941895,
      "learning_rate": 3.470089521820502e-05,
      "loss": 1.4939,
      "step": 336
    },
    {
      "epoch": 0.6811520970186963,
      "grad_norm": 5.398025035858154,
      "learning_rate": 3.432182805805495e-05,
      "loss": 1.3243,
      "step": 337
    },
    {
      "epoch": 0.6831733198585144,
      "grad_norm": 5.3287272453308105,
      "learning_rate": 3.394375768044869e-05,
      "loss": 1.3026,
      "step": 338
    },
    {
      "epoch": 0.6851945426983325,
      "grad_norm": 5.701461315155029,
      "learning_rate": 3.3566708122241756e-05,
      "loss": 1.5187,
      "step": 339
    },
    {
      "epoch": 0.6872157655381506,
      "grad_norm": 6.3350067138671875,
      "learning_rate": 3.3190703355388166e-05,
      "loss": 1.9201,
      "step": 340
    },
    {
      "epoch": 0.6892369883779687,
      "grad_norm": 6.057320594787598,
      "learning_rate": 3.2815767285416576e-05,
      "loss": 1.3178,
      "step": 341
    },
    {
      "epoch": 0.6912582112177867,
      "grad_norm": 4.327114105224609,
      "learning_rate": 3.244192374991027e-05,
      "loss": 1.0027,
      "step": 342
    },
    {
      "epoch": 0.6932794340576048,
      "grad_norm": 5.175032615661621,
      "learning_rate": 3.2069196516991686e-05,
      "loss": 1.1705,
      "step": 343
    },
    {
      "epoch": 0.6953006568974229,
      "grad_norm": 5.2318644523620605,
      "learning_rate": 3.169760928381127e-05,
      "loss": 1.1488,
      "step": 344
    },
    {
      "epoch": 0.697321879737241,
      "grad_norm": 4.282010555267334,
      "learning_rate": 3.13271856750409e-05,
      "loss": 0.9302,
      "step": 345
    },
    {
      "epoch": 0.6993431025770591,
      "grad_norm": 6.124838829040527,
      "learning_rate": 3.095794924137184e-05,
      "loss": 1.328,
      "step": 346
    },
    {
      "epoch": 0.7013643254168772,
      "grad_norm": 4.892329216003418,
      "learning_rate": 3.058992345801747e-05,
      "loss": 1.0958,
      "step": 347
    },
    {
      "epoch": 0.7033855482566953,
      "grad_norm": 5.745382308959961,
      "learning_rate": 3.0223131723220756e-05,
      "loss": 1.224,
      "step": 348
    },
    {
      "epoch": 0.7054067710965134,
      "grad_norm": 7.191976547241211,
      "learning_rate": 2.9857597356766674e-05,
      "loss": 1.1652,
      "step": 349
    },
    {
      "epoch": 0.7074279939363315,
      "grad_norm": 5.166662693023682,
      "learning_rate": 2.9493343598499567e-05,
      "loss": 1.0203,
      "step": 350
    },
    {
      "epoch": 0.7094492167761496,
      "grad_norm": 2.4570610523223877,
      "learning_rate": 2.913039360684565e-05,
      "loss": 1.2315,
      "step": 351
    },
    {
      "epoch": 0.7114704396159677,
      "grad_norm": 2.5633747577667236,
      "learning_rate": 2.8768770457340575e-05,
      "loss": 1.2238,
      "step": 352
    },
    {
      "epoch": 0.7134916624557858,
      "grad_norm": 2.9392590522766113,
      "learning_rate": 2.8408497141162438e-05,
      "loss": 1.2994,
      "step": 353
    },
    {
      "epoch": 0.7155128852956039,
      "grad_norm": 2.7536635398864746,
      "learning_rate": 2.8049596563669932e-05,
      "loss": 1.0344,
      "step": 354
    },
    {
      "epoch": 0.717534108135422,
      "grad_norm": 3.7724685668945312,
      "learning_rate": 2.769209154294623e-05,
      "loss": 1.6256,
      "step": 355
    },
    {
      "epoch": 0.71955533097524,
      "grad_norm": 3.661170721054077,
      "learning_rate": 2.7336004808348093e-05,
      "loss": 1.4207,
      "step": 356
    },
    {
      "epoch": 0.7215765538150581,
      "grad_norm": 4.080316066741943,
      "learning_rate": 2.69813589990609e-05,
      "loss": 1.6689,
      "step": 357
    },
    {
      "epoch": 0.7235977766548762,
      "grad_norm": 4.936508655548096,
      "learning_rate": 2.662817666265932e-05,
      "loss": 1.996,
      "step": 358
    },
    {
      "epoch": 0.7256189994946943,
      "grad_norm": 4.0,
      "learning_rate": 2.6276480253673662e-05,
      "loss": 1.6679,
      "step": 359
    },
    {
      "epoch": 0.7276402223345124,
      "grad_norm": 4.3267903327941895,
      "learning_rate": 2.5926292132162433e-05,
      "loss": 1.6934,
      "step": 360
    },
{ |
|
"epoch": 0.7296614451743305, |
|
"grad_norm": 4.0602593421936035, |
|
"learning_rate": 2.5577634562290564e-05, |
|
"loss": 1.5712, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.7316826680141486, |
|
"grad_norm": 4.487820148468018, |
|
"learning_rate": 2.5230529710914076e-05, |
|
"loss": 1.5552, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.7337038908539667, |
|
"grad_norm": 4.205357074737549, |
|
"learning_rate": 2.4884999646170597e-05, |
|
"loss": 1.5065, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.7357251136937848, |
|
"grad_norm": 4.995065689086914, |
|
"learning_rate": 2.4541066336076434e-05, |
|
"loss": 1.7604, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.7377463365336028, |
|
"grad_norm": 4.657031059265137, |
|
"learning_rate": 2.4198751647129897e-05, |
|
"loss": 1.6113, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.7397675593734209, |
|
"grad_norm": 4.953658103942871, |
|
"learning_rate": 2.3858077342920972e-05, |
|
"loss": 1.4499, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.741788782213239, |
|
"grad_norm": 4.663423538208008, |
|
"learning_rate": 2.3519065082747778e-05, |
|
"loss": 1.6878, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.743810005053057, |
|
"grad_norm": 4.570845127105713, |
|
"learning_rate": 2.3181736420239385e-05, |
|
"loss": 1.5128, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.7458312278928751, |
|
"grad_norm": 4.937878608703613, |
|
"learning_rate": 2.2846112801985632e-05, |
|
"loss": 1.7156, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.7478524507326932, |
|
"grad_norm": 4.761300086975098, |
|
"learning_rate": 2.251221556617344e-05, |
|
"loss": 1.7288, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.7498736735725113, |
|
"grad_norm": 4.429453372955322, |
|
"learning_rate": 2.2180065941230277e-05, |
|
"loss": 1.4495, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.7518948964123294, |
|
"grad_norm": 3.9287309646606445, |
|
"learning_rate": 2.1849685044474533e-05, |
|
"loss": 1.2037, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.7539161192521475, |
|
"grad_norm": 5.036333084106445, |
|
"learning_rate": 2.15210938807728e-05, |
|
"loss": 1.4006, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.7559373420919656, |
|
"grad_norm": 4.479243755340576, |
|
"learning_rate": 2.1194313341204597e-05, |
|
"loss": 1.3916, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.7579585649317837, |
|
"grad_norm": 4.996969699859619, |
|
"learning_rate": 2.0869364201733987e-05, |
|
"loss": 1.2482, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.7599797877716018, |
|
"grad_norm": 5.1381449699401855, |
|
"learning_rate": 2.054626712188886e-05, |
|
"loss": 1.6205, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.7620010106114199, |
|
"grad_norm": 5.011663913726807, |
|
"learning_rate": 2.0225042643447283e-05, |
|
"loss": 1.6553, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.764022233451238, |
|
"grad_norm": 4.932290554046631, |
|
"learning_rate": 1.990571118913166e-05, |
|
"loss": 1.3811, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.7660434562910561, |
|
"grad_norm": 5.215028285980225, |
|
"learning_rate": 1.9588293061310163e-05, |
|
"loss": 1.4943, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.7680646791308742, |
|
"grad_norm": 4.604588985443115, |
|
"learning_rate": 1.9272808440706026e-05, |
|
"loss": 1.1947, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.7700859019706923, |
|
"grad_norm": 5.850764274597168, |
|
"learning_rate": 1.8959277385114514e-05, |
|
"loss": 1.4795, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.7721071248105104, |
|
"grad_norm": 5.08169412612915, |
|
"learning_rate": 1.864771982812763e-05, |
|
"loss": 1.4163, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.7741283476503285, |
|
"grad_norm": 5.118016719818115, |
|
"learning_rate": 1.8338155577866873e-05, |
|
"loss": 1.4816, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.7761495704901465, |
|
"grad_norm": 5.448619842529297, |
|
"learning_rate": 1.8030604315723766e-05, |
|
"loss": 1.3162, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.7781707933299646, |
|
"grad_norm": 6.636441707611084, |
|
"learning_rate": 1.7725085595108682e-05, |
|
"loss": 1.4221, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.7801920161697827, |
|
"grad_norm": 5.057902812957764, |
|
"learning_rate": 1.7421618840207578e-05, |
|
"loss": 1.4411, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.7822132390096008, |
|
"grad_norm": 4.244833946228027, |
|
"learning_rate": 1.71202233447471e-05, |
|
"loss": 1.0515, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.7842344618494189, |
|
"grad_norm": 4.543421745300293, |
|
"learning_rate": 1.682091827076796e-05, |
|
"loss": 1.3154, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.786255684689237, |
|
"grad_norm": 5.357529163360596, |
|
"learning_rate": 1.6523722647406576e-05, |
|
"loss": 1.3857, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.7882769075290551, |
|
"grad_norm": 5.082853317260742, |
|
"learning_rate": 1.622865536968534e-05, |
|
"loss": 1.2635, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.7902981303688732, |
|
"grad_norm": 5.57720947265625, |
|
"learning_rate": 1.5935735197311202e-05, |
|
"loss": 1.422, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.7923193532086913, |
|
"grad_norm": 5.431817054748535, |
|
"learning_rate": 1.5644980753483107e-05, |
|
"loss": 1.2788, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.7943405760485094, |
|
"grad_norm": 5.892661094665527, |
|
"learning_rate": 1.5356410523707825e-05, |
|
"loss": 1.5827, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.7963617988883275, |
|
"grad_norm": 5.396627902984619, |
|
"learning_rate": 1.5070042854624834e-05, |
|
"loss": 1.2314, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.7983830217281456, |
|
"grad_norm": 5.037806510925293, |
|
"learning_rate": 1.4785895952839734e-05, |
|
"loss": 1.2281, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.8004042445679637, |
|
"grad_norm": 6.198902130126953, |
|
"learning_rate": 1.4503987883766857e-05, |
|
"loss": 1.3708, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.8024254674077818, |
|
"grad_norm": 4.604989051818848, |
|
"learning_rate": 1.4224336570480573e-05, |
|
"loss": 0.9869, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.8044466902475998, |
|
"grad_norm": 6.081021785736084, |
|
"learning_rate": 1.3946959792575915e-05, |
|
"loss": 1.3921, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.8064679130874179, |
|
"grad_norm": 4.676353931427002, |
|
"learning_rate": 1.3671875185038063e-05, |
|
"loss": 0.9632, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.8084891359272359, |
|
"grad_norm": 6.441154956817627, |
|
"learning_rate": 1.3399100237121265e-05, |
|
"loss": 1.4048, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.810510358767054, |
|
"grad_norm": 2.4489996433258057, |
|
"learning_rate": 1.312865229123681e-05, |
|
"loss": 1.2112, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.8125315816068721, |
|
"grad_norm": 2.628634214401245, |
|
"learning_rate": 1.2860548541850542e-05, |
|
"loss": 1.1693, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.8145528044466902, |
|
"grad_norm": 3.1994330883026123, |
|
"learning_rate": 1.2594806034389556e-05, |
|
"loss": 1.3584, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.8165740272865083, |
|
"grad_norm": 2.7979207038879395, |
|
"learning_rate": 1.2331441664158611e-05, |
|
"loss": 1.1963, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.8185952501263264, |
|
"grad_norm": 3.172201156616211, |
|
"learning_rate": 1.2070472175265856e-05, |
|
"loss": 1.308, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.8206164729661445, |
|
"grad_norm": 3.279130458831787, |
|
"learning_rate": 1.1811914159558374e-05, |
|
"loss": 1.4214, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.8226376958059626, |
|
"grad_norm": 3.5428879261016846, |
|
"learning_rate": 1.155578405556722e-05, |
|
"loss": 1.5145, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.8246589186457807, |
|
"grad_norm": 4.0169830322265625, |
|
"learning_rate": 1.1302098147462347e-05, |
|
"loss": 1.6019, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.8266801414855988, |
|
"grad_norm": 4.363892078399658, |
|
"learning_rate": 1.1050872564017328e-05, |
|
"loss": 1.9656, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.8287013643254169, |
|
"grad_norm": 4.119546890258789, |
|
"learning_rate": 1.0802123277583819e-05, |
|
"loss": 1.6649, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.830722587165235, |
|
"grad_norm": 4.996504783630371, |
|
"learning_rate": 1.0555866103076212e-05, |
|
"loss": 1.8195, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.832743810005053, |
|
"grad_norm": 4.646332263946533, |
|
"learning_rate": 1.0312116696966012e-05, |
|
"loss": 1.9406, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.8347650328448711, |
|
"grad_norm": 4.677328109741211, |
|
"learning_rate": 1.0070890556286577e-05, |
|
"loss": 1.6832, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.8367862556846892, |
|
"grad_norm": 5.113035202026367, |
|
"learning_rate": 9.832203017647745e-06, |
|
"loss": 1.7782, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.8388074785245073, |
|
"grad_norm": 4.52169942855835, |
|
"learning_rate": 9.596069256260792e-06, |
|
"loss": 1.5927, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.8408287013643254, |
|
"grad_norm": 4.952767848968506, |
|
"learning_rate": 9.362504284973683e-06, |
|
"loss": 1.9372, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.8428499242041435, |
|
"grad_norm": 5.01066255569458, |
|
"learning_rate": 9.131522953316501e-06, |
|
"loss": 1.5843, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.8448711470439616, |
|
"grad_norm": 4.628442764282227, |
|
"learning_rate": 8.903139946557438e-06, |
|
"loss": 1.4746, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.8468923698837797, |
|
"grad_norm": 4.917454242706299, |
|
"learning_rate": 8.67736978476904e-06, |
|
"loss": 1.6411, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.8489135927235978, |
|
"grad_norm": 5.071280002593994, |
|
"learning_rate": 8.45422682190517e-06, |
|
"loss": 1.6055, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.8509348155634159, |
|
"grad_norm": 4.860715866088867, |
|
"learning_rate": 8.233725244888291e-06, |
|
"loss": 1.4997, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.852956038403234, |
|
"grad_norm": 4.802791595458984, |
|
"learning_rate": 8.01587907270761e-06, |
|
"loss": 1.3955, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.8549772612430521, |
|
"grad_norm": 4.843109607696533, |
|
"learning_rate": 7.800702155527696e-06, |
|
"loss": 1.4801, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.8569984840828702, |
|
"grad_norm": 5.050382137298584, |
|
"learning_rate": 7.588208173807943e-06, |
|
"loss": 1.5216, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.8590197069226883, |
|
"grad_norm": 4.331174373626709, |
|
"learning_rate": 7.378410637432847e-06, |
|
"loss": 1.4275, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.8610409297625063, |
|
"grad_norm": 5.057369709014893, |
|
"learning_rate": 7.171322884852988e-06, |
|
"loss": 1.6479, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.8630621526023244, |
|
"grad_norm": 5.058509826660156, |
|
"learning_rate": 6.966958082237096e-06, |
|
"loss": 1.483, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.8650833754421425, |
|
"grad_norm": 5.39819860458374, |
|
"learning_rate": 6.765329222634892e-06, |
|
"loss": 1.6475, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.8671045982819606, |
|
"grad_norm": 5.942454814910889, |
|
"learning_rate": 6.566449125151086e-06, |
|
"loss": 1.4512, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.8691258211217787, |
|
"grad_norm": 5.583020210266113, |
|
"learning_rate": 6.370330434130317e-06, |
|
"loss": 1.5802, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.8711470439615968, |
|
"grad_norm": 5.76262903213501, |
|
"learning_rate": 6.176985618353282e-06, |
|
"loss": 1.7253, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.8731682668014149, |
|
"grad_norm": 5.057263374328613, |
|
"learning_rate": 5.9864269702440075e-06, |
|
"loss": 1.424, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.875189489641233, |
|
"grad_norm": 7.444010257720947, |
|
"learning_rate": 5.798666605088293e-06, |
|
"loss": 1.4872, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.877210712481051, |
|
"grad_norm": 4.746302127838135, |
|
"learning_rate": 5.613716460263485e-06, |
|
"loss": 1.1225, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.8792319353208691, |
|
"grad_norm": 4.994941234588623, |
|
"learning_rate": 5.431588294479478e-06, |
|
"loss": 1.2954, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.8812531581606872, |
|
"grad_norm": 5.951868057250977, |
|
"learning_rate": 5.2522936870311955e-06, |
|
"loss": 1.4107, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.8832743810005053, |
|
"grad_norm": 5.421668529510498, |
|
"learning_rate": 5.0758440370623214e-06, |
|
"loss": 1.8054, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.8852956038403234, |
|
"grad_norm": 6.007269382476807, |
|
"learning_rate": 4.902250562840621e-06, |
|
"loss": 1.6518, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.8873168266801414, |
|
"grad_norm": 4.964114189147949, |
|
"learning_rate": 4.731524301044715e-06, |
|
"loss": 1.1705, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.8893380495199595, |
|
"grad_norm": 5.869405269622803, |
|
"learning_rate": 4.563676106062331e-06, |
|
"loss": 1.6333, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.8913592723597776, |
|
"grad_norm": 5.429330348968506, |
|
"learning_rate": 4.398716649300311e-06, |
|
"loss": 1.277, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.8933804951995957, |
|
"grad_norm": 6.709824562072754, |
|
"learning_rate": 4.236656418506013e-06, |
|
"loss": 1.4216, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.8954017180394138, |
|
"grad_norm": 5.117186546325684, |
|
"learning_rate": 4.077505717100666e-06, |
|
"loss": 1.2206, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.8974229408792319, |
|
"grad_norm": 4.522552967071533, |
|
"learning_rate": 3.921274663524182e-06, |
|
"loss": 0.9311, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.89944416371905, |
|
"grad_norm": 6.130990505218506, |
|
"learning_rate": 3.767973190591906e-06, |
|
"loss": 1.3967, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.9014653865588681, |
|
"grad_norm": 4.977992057800293, |
|
"learning_rate": 3.6176110448631394e-06, |
|
"loss": 1.062, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.9034866093986862, |
|
"grad_norm": 4.500341892242432, |
|
"learning_rate": 3.4701977860213953e-06, |
|
"loss": 0.9393, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.9055078322385043, |
|
"grad_norm": 5.150080680847168, |
|
"learning_rate": 3.325742786266689e-06, |
|
"loss": 1.1028, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.9075290550783224, |
|
"grad_norm": 5.322256088256836, |
|
"learning_rate": 3.184255229719624e-06, |
|
"loss": 1.0733, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.9095502779181405, |
|
"grad_norm": 7.01747989654541, |
|
"learning_rate": 3.0457441118375283e-06, |
|
"loss": 1.5135, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.9115715007579586, |
|
"grad_norm": 2.3920202255249023, |
|
"learning_rate": 2.91021823884251e-06, |
|
"loss": 1.0922, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.9135927235977767, |
|
"grad_norm": 2.5058305263519287, |
|
"learning_rate": 2.7776862271615912e-06, |
|
"loss": 1.1713, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.9156139464375948, |
|
"grad_norm": 2.848264217376709, |
|
"learning_rate": 2.6481565028789067e-06, |
|
"loss": 1.1941, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.9176351692774128, |
|
"grad_norm": 3.537458896636963, |
|
"learning_rate": 2.5216373011999695e-06, |
|
"loss": 1.5756, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.9196563921172309, |
|
"grad_norm": 3.5460147857666016, |
|
"learning_rate": 2.3981366659281134e-06, |
|
"loss": 1.3457, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.921677614957049, |
|
"grad_norm": 4.265406608581543, |
|
"learning_rate": 2.277662448953066e-06, |
|
"loss": 1.7724, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.9236988377968671, |
|
"grad_norm": 3.961560010910034, |
|
"learning_rate": 2.1602223097517913e-06, |
|
"loss": 1.7004, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.9257200606366852, |
|
"grad_norm": 4.308675289154053, |
|
"learning_rate": 2.0458237149014347e-06, |
|
"loss": 1.7709, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.9277412834765033, |
|
"grad_norm": 3.9854753017425537, |
|
"learning_rate": 1.9344739376047083e-06, |
|
"loss": 1.6308, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.9297625063163214, |
|
"grad_norm": 4.271021366119385, |
|
"learning_rate": 1.8261800572274001e-06, |
|
"loss": 1.65, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.9317837291561395, |
|
"grad_norm": 4.231322288513184, |
|
"learning_rate": 1.7209489588483395e-06, |
|
"loss": 1.529, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.9338049519959576, |
|
"grad_norm": 3.8486905097961426, |
|
"learning_rate": 1.6187873328216142e-06, |
|
"loss": 1.3554, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.9358261748357757, |
|
"grad_norm": 3.996654510498047, |
|
"learning_rate": 1.519701674351265e-06, |
|
"loss": 1.4264, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.9378473976755938, |
|
"grad_norm": 4.656818389892578, |
|
"learning_rate": 1.4236982830782674e-06, |
|
"loss": 1.5068, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.9398686205154119, |
|
"grad_norm": 4.660806179046631, |
|
"learning_rate": 1.3307832626800964e-06, |
|
"loss": 1.5453, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.94188984335523, |
|
"grad_norm": 4.372664928436279, |
|
"learning_rate": 1.2409625204825803e-06, |
|
"loss": 1.6273, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.943911066195048, |
|
"grad_norm": 4.731912612915039, |
|
"learning_rate": 1.1542417670844074e-06, |
|
"loss": 1.5745, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.9459322890348661, |
|
"grad_norm": 4.093106269836426, |
|
"learning_rate": 1.0706265159939943e-06, |
|
"loss": 1.2102, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.9479535118746841, |
|
"grad_norm": 4.671316623687744, |
|
"learning_rate": 9.901220832790103e-07, |
|
"loss": 1.3927, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.9499747347145022, |
|
"grad_norm": 5.522085666656494, |
|
"learning_rate": 9.12733587228326e-07, |
|
"loss": 1.7408, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.9519959575543203, |
|
"grad_norm": 5.045165061950684, |
|
"learning_rate": 8.384659480266732e-07, |
|
"loss": 1.4933, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.9540171803941384, |
|
"grad_norm": 5.388870716094971, |
|
"learning_rate": 7.673238874417677e-07, |
|
"loss": 1.6984, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.9560384032339565, |
|
"grad_norm": 4.820789813995361, |
|
"learning_rate": 6.993119285241601e-07, |
|
"loss": 1.5436, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.9580596260737746, |
|
"grad_norm": 5.232425212860107, |
|
"learning_rate": 6.344343953196385e-07, |
|
"loss": 1.5351, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.9600808489135927, |
|
"grad_norm": 4.9825639724731445, |
|
"learning_rate": 5.726954125943318e-07, |
|
"loss": 1.5842, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.9621020717534108, |
|
"grad_norm": 5.142207145690918, |
|
"learning_rate": 5.140989055724687e-07, |
|
"loss": 1.6371, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.9641232945932289, |
|
"grad_norm": 5.354437828063965, |
|
"learning_rate": 4.5864859968679506e-07, |
|
"loss": 1.4612, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.966144517433047, |
|
"grad_norm": 5.071149826049805, |
|
"learning_rate": 4.0634802034176244e-07, |
|
"loss": 1.3784, |
|
"step": 478 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 494, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 239, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.1989875311968256e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|