|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 29459, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0033945483553413217, |
|
"grad_norm": 1.450656533241272, |
|
"learning_rate": 4.983027258223293e-05, |
|
"loss": 1.3353, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0067890967106826435, |
|
"grad_norm": 1.7437301874160767, |
|
"learning_rate": 4.966224243864354e-05, |
|
"loss": 1.2966, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.010183645066023966, |
|
"grad_norm": 1.5103371143341064, |
|
"learning_rate": 4.949251502087647e-05, |
|
"loss": 1.2919, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.013578193421365287, |
|
"grad_norm": 1.2713204622268677, |
|
"learning_rate": 4.932278760310941e-05, |
|
"loss": 1.417, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.01697274177670661, |
|
"grad_norm": 1.9554837942123413, |
|
"learning_rate": 4.915306018534234e-05, |
|
"loss": 1.3327, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.02036729013204793, |
|
"grad_norm": 5.13068962097168, |
|
"learning_rate": 4.898333276757528e-05, |
|
"loss": 1.3058, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.023761838487389254, |
|
"grad_norm": 1.6552410125732422, |
|
"learning_rate": 4.881360534980821e-05, |
|
"loss": 1.3226, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.027156386842730574, |
|
"grad_norm": 1.807737946510315, |
|
"learning_rate": 4.864387793204114e-05, |
|
"loss": 1.334, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.030550935198071897, |
|
"grad_norm": 1.570865273475647, |
|
"learning_rate": 4.847415051427408e-05, |
|
"loss": 1.3456, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.03394548355341322, |
|
"grad_norm": 1.8917468786239624, |
|
"learning_rate": 4.830442309650701e-05, |
|
"loss": 1.3255, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.03734003190875454, |
|
"grad_norm": 1.649667739868164, |
|
"learning_rate": 4.813469567873995e-05, |
|
"loss": 1.3013, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.04073458026409586, |
|
"grad_norm": 2.6449484825134277, |
|
"learning_rate": 4.796666553515055e-05, |
|
"loss": 1.3369, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.044129128619437186, |
|
"grad_norm": 1.874516487121582, |
|
"learning_rate": 4.779693811738348e-05, |
|
"loss": 1.27, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.04752367697477851, |
|
"grad_norm": 1.7284377813339233, |
|
"learning_rate": 4.762721069961642e-05, |
|
"loss": 1.3931, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.050918225330119825, |
|
"grad_norm": 3.4324212074279785, |
|
"learning_rate": 4.745748328184935e-05, |
|
"loss": 1.3457, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.05431277368546115, |
|
"grad_norm": 1.3490582704544067, |
|
"learning_rate": 4.7287755864082286e-05, |
|
"loss": 1.2487, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.05770732204080247, |
|
"grad_norm": 1.493403673171997, |
|
"learning_rate": 4.711802844631522e-05, |
|
"loss": 1.227, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.061101870396143794, |
|
"grad_norm": 1.8711298704147339, |
|
"learning_rate": 4.694830102854815e-05, |
|
"loss": 1.2983, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.06449641875148511, |
|
"grad_norm": 1.6430071592330933, |
|
"learning_rate": 4.677857361078109e-05, |
|
"loss": 1.3099, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.06789096710682643, |
|
"grad_norm": 1.425639033317566, |
|
"learning_rate": 4.660884619301402e-05, |
|
"loss": 1.2607, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.07128551546216776, |
|
"grad_norm": 1.7377012968063354, |
|
"learning_rate": 4.6439118775246956e-05, |
|
"loss": 1.2527, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.07468006381750908, |
|
"grad_norm": 1.5430407524108887, |
|
"learning_rate": 4.626939135747989e-05, |
|
"loss": 1.2877, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.0780746121728504, |
|
"grad_norm": 1.3293898105621338, |
|
"learning_rate": 4.6101361213890495e-05, |
|
"loss": 1.1781, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.08146916052819173, |
|
"grad_norm": 2.114269733428955, |
|
"learning_rate": 4.593163379612343e-05, |
|
"loss": 1.3405, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.08486370888353305, |
|
"grad_norm": 1.370626449584961, |
|
"learning_rate": 4.576190637835636e-05, |
|
"loss": 1.303, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.08825825723887437, |
|
"grad_norm": 1.6202032566070557, |
|
"learning_rate": 4.5592178960589295e-05, |
|
"loss": 1.2427, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.0916528055942157, |
|
"grad_norm": 1.6252601146697998, |
|
"learning_rate": 4.542245154282223e-05, |
|
"loss": 1.2768, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.09504735394955702, |
|
"grad_norm": 1.4289278984069824, |
|
"learning_rate": 4.5252724125055165e-05, |
|
"loss": 1.2522, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.09844190230489833, |
|
"grad_norm": 1.1954665184020996, |
|
"learning_rate": 4.50829967072881e-05, |
|
"loss": 1.3008, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.10183645066023965, |
|
"grad_norm": 2.3695414066314697, |
|
"learning_rate": 4.491326928952103e-05, |
|
"loss": 1.3439, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.10523099901558097, |
|
"grad_norm": 1.5015544891357422, |
|
"learning_rate": 4.4743541871753965e-05, |
|
"loss": 1.2042, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.1086255473709223, |
|
"grad_norm": 1.3509881496429443, |
|
"learning_rate": 4.45738144539869e-05, |
|
"loss": 1.2834, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.11202009572626362, |
|
"grad_norm": 1.1645573377609253, |
|
"learning_rate": 4.4404087036219835e-05, |
|
"loss": 1.3342, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.11541464408160494, |
|
"grad_norm": 1.3376731872558594, |
|
"learning_rate": 4.4234359618452764e-05, |
|
"loss": 1.2501, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.11880919243694626, |
|
"grad_norm": 1.184652328491211, |
|
"learning_rate": 4.40646322006857e-05, |
|
"loss": 1.2067, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.12220374079228759, |
|
"grad_norm": 1.5623388290405273, |
|
"learning_rate": 4.3894904782918635e-05, |
|
"loss": 1.2109, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.1255982891476289, |
|
"grad_norm": 1.537017583847046, |
|
"learning_rate": 4.372517736515157e-05, |
|
"loss": 1.2207, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.12899283750297022, |
|
"grad_norm": 1.5127204656600952, |
|
"learning_rate": 4.3555449947384505e-05, |
|
"loss": 1.3273, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.13238738585831156, |
|
"grad_norm": 1.504813313484192, |
|
"learning_rate": 4.3385722529617434e-05, |
|
"loss": 1.2347, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.13578193421365287, |
|
"grad_norm": 1.5462582111358643, |
|
"learning_rate": 4.321599511185037e-05, |
|
"loss": 1.3142, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.1391764825689942, |
|
"grad_norm": 1.377742886543274, |
|
"learning_rate": 4.3046267694083305e-05, |
|
"loss": 1.2685, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.1425710309243355, |
|
"grad_norm": 1.5139976739883423, |
|
"learning_rate": 4.287654027631624e-05, |
|
"loss": 1.2718, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.14596557927967685, |
|
"grad_norm": 1.2067663669586182, |
|
"learning_rate": 4.2706812858549175e-05, |
|
"loss": 1.2405, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.14936012763501816, |
|
"grad_norm": 1.2739530801773071, |
|
"learning_rate": 4.2537085440782104e-05, |
|
"loss": 1.2186, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.1527546759903595, |
|
"grad_norm": 1.4101356267929077, |
|
"learning_rate": 4.236735802301504e-05, |
|
"loss": 1.222, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.1561492243457008, |
|
"grad_norm": 1.8474892377853394, |
|
"learning_rate": 4.2197630605247975e-05, |
|
"loss": 1.2685, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.1595437727010421, |
|
"grad_norm": 1.5274946689605713, |
|
"learning_rate": 4.202790318748091e-05, |
|
"loss": 1.2161, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.16293832105638345, |
|
"grad_norm": 1.8831485509872437, |
|
"learning_rate": 4.1859873043891514e-05, |
|
"loss": 1.2828, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.16633286941172476, |
|
"grad_norm": 2.1567959785461426, |
|
"learning_rate": 4.169014562612444e-05, |
|
"loss": 1.3027, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.1697274177670661, |
|
"grad_norm": 1.4506981372833252, |
|
"learning_rate": 4.152041820835738e-05, |
|
"loss": 1.2094, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.1731219661224074, |
|
"grad_norm": 1.2342296838760376, |
|
"learning_rate": 4.1350690790590314e-05, |
|
"loss": 1.2523, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.17651651447774874, |
|
"grad_norm": 1.6375709772109985, |
|
"learning_rate": 4.118096337282325e-05, |
|
"loss": 1.2418, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.17991106283309005, |
|
"grad_norm": 1.3406407833099365, |
|
"learning_rate": 4.1011235955056184e-05, |
|
"loss": 1.2777, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.1833056111884314, |
|
"grad_norm": 1.2170027494430542, |
|
"learning_rate": 4.084150853728911e-05, |
|
"loss": 1.1458, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.1867001595437727, |
|
"grad_norm": 1.4051603078842163, |
|
"learning_rate": 4.067178111952205e-05, |
|
"loss": 1.2022, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.19009470789911403, |
|
"grad_norm": 1.3835875988006592, |
|
"learning_rate": 4.0502053701754984e-05, |
|
"loss": 1.2601, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.19348925625445534, |
|
"grad_norm": 1.7600008249282837, |
|
"learning_rate": 4.033232628398791e-05, |
|
"loss": 1.2172, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.19688380460979665, |
|
"grad_norm": 1.4803307056427002, |
|
"learning_rate": 4.0162598866220854e-05, |
|
"loss": 1.3049, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.200278352965138, |
|
"grad_norm": 1.2911592721939087, |
|
"learning_rate": 3.999287144845378e-05, |
|
"loss": 1.2018, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.2036729013204793, |
|
"grad_norm": 1.5094434022903442, |
|
"learning_rate": 3.982314403068672e-05, |
|
"loss": 1.1798, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.20706744967582064, |
|
"grad_norm": 1.44119131565094, |
|
"learning_rate": 3.9653416612919654e-05, |
|
"loss": 1.2012, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.21046199803116195, |
|
"grad_norm": 1.3899762630462646, |
|
"learning_rate": 3.948368919515258e-05, |
|
"loss": 1.1851, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.21385654638650328, |
|
"grad_norm": 2.047968864440918, |
|
"learning_rate": 3.9313961777385524e-05, |
|
"loss": 1.2239, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.2172510947418446, |
|
"grad_norm": 1.827493667602539, |
|
"learning_rate": 3.914423435961845e-05, |
|
"loss": 1.2358, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.22064564309718593, |
|
"grad_norm": 1.4631316661834717, |
|
"learning_rate": 3.8976204216029064e-05, |
|
"loss": 1.2058, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.22404019145252724, |
|
"grad_norm": 1.5218262672424316, |
|
"learning_rate": 3.880647679826199e-05, |
|
"loss": 1.1717, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.22743473980786857, |
|
"grad_norm": 1.3896803855895996, |
|
"learning_rate": 3.863674938049492e-05, |
|
"loss": 1.1912, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.23082928816320988, |
|
"grad_norm": 1.587547779083252, |
|
"learning_rate": 3.846702196272786e-05, |
|
"loss": 1.2109, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.23422383651855122, |
|
"grad_norm": 1.35820472240448, |
|
"learning_rate": 3.829729454496079e-05, |
|
"loss": 1.1828, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.23761838487389253, |
|
"grad_norm": 1.2581636905670166, |
|
"learning_rate": 3.8127567127193734e-05, |
|
"loss": 1.1701, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.24101293322923384, |
|
"grad_norm": 1.617680549621582, |
|
"learning_rate": 3.795783970942666e-05, |
|
"loss": 1.2159, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.24440748158457518, |
|
"grad_norm": 1.3621796369552612, |
|
"learning_rate": 3.778811229165959e-05, |
|
"loss": 1.1951, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.24780202993991648, |
|
"grad_norm": 1.8783783912658691, |
|
"learning_rate": 3.761838487389253e-05, |
|
"loss": 1.2664, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.2511965782952578, |
|
"grad_norm": 1.1315891742706299, |
|
"learning_rate": 3.744865745612546e-05, |
|
"loss": 1.206, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.25459112665059913, |
|
"grad_norm": 1.3531254529953003, |
|
"learning_rate": 3.7278930038358404e-05, |
|
"loss": 1.152, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.25798567500594044, |
|
"grad_norm": 1.7136415243148804, |
|
"learning_rate": 3.710920262059133e-05, |
|
"loss": 1.3282, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.2613802233612818, |
|
"grad_norm": 1.5798516273498535, |
|
"learning_rate": 3.693947520282426e-05, |
|
"loss": 1.2192, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.2647747717166231, |
|
"grad_norm": 2.0638535022735596, |
|
"learning_rate": 3.67697477850572e-05, |
|
"loss": 1.2233, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.2681693200719644, |
|
"grad_norm": 1.6473902463912964, |
|
"learning_rate": 3.660002036729013e-05, |
|
"loss": 1.2384, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.27156386842730573, |
|
"grad_norm": 1.4174180030822754, |
|
"learning_rate": 3.643029294952307e-05, |
|
"loss": 1.3018, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.27495841678264704, |
|
"grad_norm": 1.470323920249939, |
|
"learning_rate": 3.6260565531756e-05, |
|
"loss": 1.1662, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.2783529651379884, |
|
"grad_norm": 1.1814874410629272, |
|
"learning_rate": 3.609083811398893e-05, |
|
"loss": 1.1641, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.2817475134933297, |
|
"grad_norm": 1.496795892715454, |
|
"learning_rate": 3.592111069622187e-05, |
|
"loss": 1.208, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.285142061848671, |
|
"grad_norm": 1.602959394454956, |
|
"learning_rate": 3.57513832784548e-05, |
|
"loss": 1.2323, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.28853661020401233, |
|
"grad_norm": 1.4096314907073975, |
|
"learning_rate": 3.558165586068774e-05, |
|
"loss": 1.1804, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.2919311585593537, |
|
"grad_norm": 1.2292312383651733, |
|
"learning_rate": 3.541192844292067e-05, |
|
"loss": 1.1747, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.295325706914695, |
|
"grad_norm": 1.3961174488067627, |
|
"learning_rate": 3.52422010251536e-05, |
|
"loss": 1.1459, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.2987202552700363, |
|
"grad_norm": 1.2199640274047852, |
|
"learning_rate": 3.507247360738654e-05, |
|
"loss": 1.1992, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.3021148036253776, |
|
"grad_norm": 1.316805362701416, |
|
"learning_rate": 3.490274618961947e-05, |
|
"loss": 1.2202, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.305509351980719, |
|
"grad_norm": 1.3120840787887573, |
|
"learning_rate": 3.473301877185241e-05, |
|
"loss": 1.1095, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.3089039003360603, |
|
"grad_norm": 1.14743971824646, |
|
"learning_rate": 3.456329135408534e-05, |
|
"loss": 1.2154, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.3122984486914016, |
|
"grad_norm": 1.6754459142684937, |
|
"learning_rate": 3.439356393631827e-05, |
|
"loss": 1.1927, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.3156929970467429, |
|
"grad_norm": 1.2429569959640503, |
|
"learning_rate": 3.422383651855121e-05, |
|
"loss": 1.2623, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.3190875454020842, |
|
"grad_norm": 1.5485316514968872, |
|
"learning_rate": 3.405410910078414e-05, |
|
"loss": 1.176, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.3224820937574256, |
|
"grad_norm": 1.3292936086654663, |
|
"learning_rate": 3.388438168301708e-05, |
|
"loss": 1.1637, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.3258766421127669, |
|
"grad_norm": 1.4114725589752197, |
|
"learning_rate": 3.371465426525001e-05, |
|
"loss": 1.151, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.3292711904681082, |
|
"grad_norm": 1.6183195114135742, |
|
"learning_rate": 3.354492684748294e-05, |
|
"loss": 1.1929, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.3326657388234495, |
|
"grad_norm": 1.7640340328216553, |
|
"learning_rate": 3.3375199429715876e-05, |
|
"loss": 1.1472, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.3360602871787909, |
|
"grad_norm": 1.300631046295166, |
|
"learning_rate": 3.320547201194881e-05, |
|
"loss": 1.2158, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.3394548355341322, |
|
"grad_norm": 1.5510449409484863, |
|
"learning_rate": 3.303574459418175e-05, |
|
"loss": 1.231, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.3428493838894735, |
|
"grad_norm": 1.7029348611831665, |
|
"learning_rate": 3.286601717641468e-05, |
|
"loss": 1.152, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.3462439322448148, |
|
"grad_norm": 2.094801902770996, |
|
"learning_rate": 3.269628975864761e-05, |
|
"loss": 1.1792, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.3496384806001562, |
|
"grad_norm": 1.2476887702941895, |
|
"learning_rate": 3.2526562340880546e-05, |
|
"loss": 1.1297, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.3530330289554975, |
|
"grad_norm": 1.2222412824630737, |
|
"learning_rate": 3.235683492311348e-05, |
|
"loss": 1.2194, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.3564275773108388, |
|
"grad_norm": 1.2689149379730225, |
|
"learning_rate": 3.218710750534642e-05, |
|
"loss": 1.109, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.3598221256661801, |
|
"grad_norm": 1.1400436162948608, |
|
"learning_rate": 3.201738008757935e-05, |
|
"loss": 1.1778, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.3632166740215214, |
|
"grad_norm": 1.5304007530212402, |
|
"learning_rate": 3.184765266981228e-05, |
|
"loss": 1.2015, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.3666112223768628, |
|
"grad_norm": 1.4382191896438599, |
|
"learning_rate": 3.1677925252045216e-05, |
|
"loss": 1.2023, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.3700057707322041, |
|
"grad_norm": 1.2539787292480469, |
|
"learning_rate": 3.150819783427815e-05, |
|
"loss": 1.1627, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.3734003190875454, |
|
"grad_norm": 1.6526975631713867, |
|
"learning_rate": 3.133847041651109e-05, |
|
"loss": 1.1909, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.3767948674428867, |
|
"grad_norm": 1.477150559425354, |
|
"learning_rate": 3.1168742998744016e-05, |
|
"loss": 1.1767, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.38018941579822807, |
|
"grad_norm": 1.655372142791748, |
|
"learning_rate": 3.100071285515462e-05, |
|
"loss": 1.1715, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.3835839641535694, |
|
"grad_norm": 1.237518310546875, |
|
"learning_rate": 3.0830985437387555e-05, |
|
"loss": 1.1148, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.3869785125089107, |
|
"grad_norm": 2.0262339115142822, |
|
"learning_rate": 3.066125801962049e-05, |
|
"loss": 1.056, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.390373060864252, |
|
"grad_norm": 1.4669376611709595, |
|
"learning_rate": 3.0491530601853423e-05, |
|
"loss": 1.1773, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.3937676092195933, |
|
"grad_norm": 1.6047866344451904, |
|
"learning_rate": 3.032180318408636e-05, |
|
"loss": 1.1846, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.39716215757493467, |
|
"grad_norm": 1.5415077209472656, |
|
"learning_rate": 3.0152075766319293e-05, |
|
"loss": 1.1481, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.400556705930276, |
|
"grad_norm": 1.2356903553009033, |
|
"learning_rate": 2.9982348348552225e-05, |
|
"loss": 1.1914, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.4039512542856173, |
|
"grad_norm": 1.691815733909607, |
|
"learning_rate": 2.9814318204962833e-05, |
|
"loss": 1.2595, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 0.4073458026409586, |
|
"grad_norm": 1.3964107036590576, |
|
"learning_rate": 2.9644590787195765e-05, |
|
"loss": 1.137, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.41074035099629996, |
|
"grad_norm": 1.4641882181167603, |
|
"learning_rate": 2.94748633694287e-05, |
|
"loss": 1.2194, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 0.4141348993516413, |
|
"grad_norm": 1.2686254978179932, |
|
"learning_rate": 2.9305135951661632e-05, |
|
"loss": 1.1666, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 0.4175294477069826, |
|
"grad_norm": 1.5064525604248047, |
|
"learning_rate": 2.9135408533894564e-05, |
|
"loss": 1.2265, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 0.4209239960623239, |
|
"grad_norm": 1.3071587085723877, |
|
"learning_rate": 2.8965681116127503e-05, |
|
"loss": 1.0625, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 0.42431854441766526, |
|
"grad_norm": 1.4859912395477295, |
|
"learning_rate": 2.8795953698360435e-05, |
|
"loss": 1.1239, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.42771309277300656, |
|
"grad_norm": 1.4131548404693604, |
|
"learning_rate": 2.862622628059337e-05, |
|
"loss": 1.2125, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 0.4311076411283479, |
|
"grad_norm": 1.1708953380584717, |
|
"learning_rate": 2.8456498862826302e-05, |
|
"loss": 1.145, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 0.4345021894836892, |
|
"grad_norm": 1.4931575059890747, |
|
"learning_rate": 2.8286771445059234e-05, |
|
"loss": 1.102, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 0.4378967378390305, |
|
"grad_norm": 1.6308887004852295, |
|
"learning_rate": 2.8117044027292173e-05, |
|
"loss": 1.1574, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 0.44129128619437186, |
|
"grad_norm": 1.532914638519287, |
|
"learning_rate": 2.7947316609525105e-05, |
|
"loss": 1.1901, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.44468583454971317, |
|
"grad_norm": 1.5746792554855347, |
|
"learning_rate": 2.7777589191758037e-05, |
|
"loss": 1.2077, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 0.4480803829050545, |
|
"grad_norm": 1.7640366554260254, |
|
"learning_rate": 2.7607861773990972e-05, |
|
"loss": 1.2147, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 0.4514749312603958, |
|
"grad_norm": 1.4942810535430908, |
|
"learning_rate": 2.7438134356223904e-05, |
|
"loss": 1.2471, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 0.45486947961573715, |
|
"grad_norm": 1.449723243713379, |
|
"learning_rate": 2.7268406938456843e-05, |
|
"loss": 1.1991, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 0.45826402797107846, |
|
"grad_norm": 1.0219964981079102, |
|
"learning_rate": 2.7098679520689775e-05, |
|
"loss": 1.0989, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.46165857632641977, |
|
"grad_norm": 1.4733655452728271, |
|
"learning_rate": 2.6928952102922707e-05, |
|
"loss": 1.1652, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 0.4650531246817611, |
|
"grad_norm": 1.4748992919921875, |
|
"learning_rate": 2.6759224685155642e-05, |
|
"loss": 1.1522, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 0.46844767303710244, |
|
"grad_norm": 1.918239712715149, |
|
"learning_rate": 2.6589497267388574e-05, |
|
"loss": 1.0624, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 0.47184222139244375, |
|
"grad_norm": 1.4620022773742676, |
|
"learning_rate": 2.6419769849621513e-05, |
|
"loss": 1.2269, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 0.47523676974778506, |
|
"grad_norm": 1.647291898727417, |
|
"learning_rate": 2.6250042431854445e-05, |
|
"loss": 1.0928, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.47863131810312637, |
|
"grad_norm": 1.4002645015716553, |
|
"learning_rate": 2.6080315014087377e-05, |
|
"loss": 1.1475, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 0.4820258664584677, |
|
"grad_norm": 1.329160451889038, |
|
"learning_rate": 2.5910587596320312e-05, |
|
"loss": 1.1787, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 0.48542041481380904, |
|
"grad_norm": 1.0468798875808716, |
|
"learning_rate": 2.5740860178553244e-05, |
|
"loss": 1.1257, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 0.48881496316915035, |
|
"grad_norm": 1.1814810037612915, |
|
"learning_rate": 2.5571132760786176e-05, |
|
"loss": 1.2252, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 0.49220951152449166, |
|
"grad_norm": 1.442358136177063, |
|
"learning_rate": 2.5401405343019115e-05, |
|
"loss": 1.1474, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.49560405987983297, |
|
"grad_norm": 1.2082366943359375, |
|
"learning_rate": 2.5231677925252044e-05, |
|
"loss": 1.1271, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 0.49899860823517433, |
|
"grad_norm": 1.3044782876968384, |
|
"learning_rate": 2.5061950507484982e-05, |
|
"loss": 1.1204, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 0.5023931565905156, |
|
"grad_norm": 1.257338047027588, |
|
"learning_rate": 2.4893920363895583e-05, |
|
"loss": 1.1891, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 0.505787704945857, |
|
"grad_norm": 1.6963568925857544, |
|
"learning_rate": 2.472419294612852e-05, |
|
"loss": 1.0711, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 0.5091822533011983, |
|
"grad_norm": 1.4593158960342407, |
|
"learning_rate": 2.4554465528361454e-05, |
|
"loss": 1.1764, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.5125768016565396, |
|
"grad_norm": 1.2803332805633545, |
|
"learning_rate": 2.438473811059439e-05, |
|
"loss": 1.1213, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 0.5159713500118809, |
|
"grad_norm": 1.0880329608917236, |
|
"learning_rate": 2.421501069282732e-05, |
|
"loss": 1.0686, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 0.5193658983672222, |
|
"grad_norm": 1.350434422492981, |
|
"learning_rate": 2.4045283275060253e-05, |
|
"loss": 1.1244, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 0.5227604467225636, |
|
"grad_norm": 1.4851505756378174, |
|
"learning_rate": 2.387555585729319e-05, |
|
"loss": 1.1519, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 0.5261549950779049, |
|
"grad_norm": 1.4524593353271484, |
|
"learning_rate": 2.3705828439526124e-05, |
|
"loss": 1.1139, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.5295495434332462, |
|
"grad_norm": 1.3715015649795532, |
|
"learning_rate": 2.3536101021759056e-05, |
|
"loss": 1.1176, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 0.5329440917885875, |
|
"grad_norm": 1.3227180242538452, |
|
"learning_rate": 2.3366373603991988e-05, |
|
"loss": 1.1547, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 0.5363386401439288, |
|
"grad_norm": 1.742480754852295, |
|
"learning_rate": 2.3196646186224923e-05, |
|
"loss": 1.2338, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 0.5397331884992702, |
|
"grad_norm": 1.3990530967712402, |
|
"learning_rate": 2.302691876845786e-05, |
|
"loss": 1.1808, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 0.5431277368546115, |
|
"grad_norm": 1.6087653636932373, |
|
"learning_rate": 2.285719135069079e-05, |
|
"loss": 1.2029, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.5465222852099528, |
|
"grad_norm": 1.3504618406295776, |
|
"learning_rate": 2.2687463932923726e-05, |
|
"loss": 1.138, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 0.5499168335652941, |
|
"grad_norm": 1.226248025894165, |
|
"learning_rate": 2.2517736515156658e-05, |
|
"loss": 1.1006, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 0.5533113819206354, |
|
"grad_norm": 1.0794544219970703, |
|
"learning_rate": 2.2348009097389593e-05, |
|
"loss": 1.111, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 0.5567059302759768, |
|
"grad_norm": 1.3800761699676514, |
|
"learning_rate": 2.217828167962253e-05, |
|
"loss": 1.1554, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 0.5601004786313181, |
|
"grad_norm": 1.1783385276794434, |
|
"learning_rate": 2.200855426185546e-05, |
|
"loss": 1.157, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.5634950269866594, |
|
"grad_norm": 1.483588457107544, |
|
"learning_rate": 2.1838826844088396e-05, |
|
"loss": 1.1443, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 0.5668895753420008, |
|
"grad_norm": 1.847670555114746, |
|
"learning_rate": 2.1669099426321328e-05, |
|
"loss": 1.1667, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 0.570284123697342, |
|
"grad_norm": 1.524003028869629, |
|
"learning_rate": 2.1499372008554263e-05, |
|
"loss": 1.1555, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 0.5736786720526834, |
|
"grad_norm": 1.6308820247650146, |
|
"learning_rate": 2.1329644590787195e-05, |
|
"loss": 1.0674, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 0.5770732204080247, |
|
"grad_norm": 1.4396891593933105, |
|
"learning_rate": 2.115991717302013e-05, |
|
"loss": 1.1481, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.580467768763366, |
|
"grad_norm": 1.6904021501541138, |
|
"learning_rate": 2.0990189755253066e-05, |
|
"loss": 1.044, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 0.5838623171187074, |
|
"grad_norm": 1.8386590480804443, |
|
"learning_rate": 2.0820462337485998e-05, |
|
"loss": 1.0662, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 0.5872568654740487, |
|
"grad_norm": 1.3602131605148315, |
|
"learning_rate": 2.0650734919718933e-05, |
|
"loss": 1.072, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 0.59065141382939, |
|
"grad_norm": 1.2853094339370728, |
|
"learning_rate": 2.0481007501951865e-05, |
|
"loss": 1.1799, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 0.5940459621847313, |
|
"grad_norm": 1.418142557144165, |
|
"learning_rate": 2.03112800841848e-05, |
|
"loss": 1.1163, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.5974405105400726, |
|
"grad_norm": 1.3810557126998901, |
|
"learning_rate": 2.0141552666417736e-05, |
|
"loss": 1.1246, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 0.600835058895414, |
|
"grad_norm": 1.3166576623916626, |
|
"learning_rate": 1.9971825248650668e-05, |
|
"loss": 1.0635, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 0.6042296072507553, |
|
"grad_norm": 1.2918510437011719, |
|
"learning_rate": 1.98020978308836e-05, |
|
"loss": 1.1338, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 0.6076241556060966, |
|
"grad_norm": 1.3206653594970703, |
|
"learning_rate": 1.9632370413116535e-05, |
|
"loss": 1.1538, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 0.611018703961438, |
|
"grad_norm": 1.1084457635879517, |
|
"learning_rate": 1.946264299534947e-05, |
|
"loss": 1.0151, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.6144132523167792, |
|
"grad_norm": 1.6946609020233154, |
|
"learning_rate": 1.9292915577582406e-05, |
|
"loss": 1.173, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 0.6178078006721206, |
|
"grad_norm": 1.5061676502227783, |
|
"learning_rate": 1.9123188159815334e-05, |
|
"loss": 1.1463, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 0.6212023490274619, |
|
"grad_norm": 1.400976300239563, |
|
"learning_rate": 1.895346074204827e-05, |
|
"loss": 1.147, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 0.6245968973828032, |
|
"grad_norm": 1.398390769958496, |
|
"learning_rate": 1.8783733324281205e-05, |
|
"loss": 1.0824, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 0.6279914457381446, |
|
"grad_norm": 1.0492353439331055, |
|
"learning_rate": 1.861570318069181e-05, |
|
"loss": 1.1912, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.6313859940934858, |
|
"grad_norm": 1.5323091745376587, |
|
"learning_rate": 1.8445975762924745e-05, |
|
"loss": 1.1324, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 0.6347805424488272, |
|
"grad_norm": 1.2045379877090454, |
|
"learning_rate": 1.827624834515768e-05, |
|
"loss": 1.1011, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 0.6381750908041685, |
|
"grad_norm": 1.4627662897109985, |
|
"learning_rate": 1.8106520927390612e-05, |
|
"loss": 1.0789, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 0.6415696391595098, |
|
"grad_norm": 1.492099642753601, |
|
"learning_rate": 1.7936793509623544e-05, |
|
"loss": 1.1794, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 0.6449641875148512, |
|
"grad_norm": 1.8160879611968994, |
|
"learning_rate": 1.776706609185648e-05, |
|
"loss": 1.087, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.6483587358701924, |
|
"grad_norm": 1.135730504989624, |
|
"learning_rate": 1.7597338674089415e-05, |
|
"loss": 1.1466, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 0.6517532842255338, |
|
"grad_norm": 1.2633298635482788, |
|
"learning_rate": 1.7427611256322347e-05, |
|
"loss": 1.1453, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 0.6551478325808752, |
|
"grad_norm": 1.3639088869094849, |
|
"learning_rate": 1.725788383855528e-05, |
|
"loss": 1.1772, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 0.6585423809362164, |
|
"grad_norm": 1.5371415615081787, |
|
"learning_rate": 1.7088156420788214e-05, |
|
"loss": 1.1676, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 0.6619369292915578, |
|
"grad_norm": 1.1205295324325562, |
|
"learning_rate": 1.691842900302115e-05, |
|
"loss": 1.1312, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.665331477646899, |
|
"grad_norm": 1.3705852031707764, |
|
"learning_rate": 1.6748701585254085e-05, |
|
"loss": 1.2032, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 0.6687260260022404, |
|
"grad_norm": 1.6704633235931396, |
|
"learning_rate": 1.6578974167487017e-05, |
|
"loss": 1.1594, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 0.6721205743575818, |
|
"grad_norm": 1.3317358493804932, |
|
"learning_rate": 1.640924674971995e-05, |
|
"loss": 1.1118, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 0.675515122712923, |
|
"grad_norm": 1.666467547416687, |
|
"learning_rate": 1.6239519331952884e-05, |
|
"loss": 1.1402, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 0.6789096710682644, |
|
"grad_norm": 1.5140140056610107, |
|
"learning_rate": 1.606979191418582e-05, |
|
"loss": 1.0712, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.6823042194236056, |
|
"grad_norm": 1.5290478467941284, |
|
"learning_rate": 1.590006449641875e-05, |
|
"loss": 1.1054, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 0.685698767778947, |
|
"grad_norm": 1.408411979675293, |
|
"learning_rate": 1.5730337078651687e-05, |
|
"loss": 1.1755, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 0.6890933161342884, |
|
"grad_norm": 1.8979178667068481, |
|
"learning_rate": 1.556060966088462e-05, |
|
"loss": 1.0911, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 0.6924878644896296, |
|
"grad_norm": 1.3804025650024414, |
|
"learning_rate": 1.5390882243117554e-05, |
|
"loss": 1.1299, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 0.695882412844971, |
|
"grad_norm": 1.1603401899337769, |
|
"learning_rate": 1.5221154825350486e-05, |
|
"loss": 1.081, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.6992769612003124, |
|
"grad_norm": 1.4648966789245605, |
|
"learning_rate": 1.5051427407583421e-05, |
|
"loss": 1.1228, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 0.7026715095556536, |
|
"grad_norm": 1.589272379875183, |
|
"learning_rate": 1.4881699989816355e-05, |
|
"loss": 1.1068, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 0.706066057910995, |
|
"grad_norm": 1.337220311164856, |
|
"learning_rate": 1.471197257204929e-05, |
|
"loss": 1.1538, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 0.7094606062663362, |
|
"grad_norm": 1.5323350429534912, |
|
"learning_rate": 1.4542245154282224e-05, |
|
"loss": 1.0592, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 0.7128551546216776, |
|
"grad_norm": 1.6231937408447266, |
|
"learning_rate": 1.4372517736515156e-05, |
|
"loss": 1.1526, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.716249702977019, |
|
"grad_norm": 1.8754550218582153, |
|
"learning_rate": 1.4202790318748091e-05, |
|
"loss": 1.0773, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 0.7196442513323602, |
|
"grad_norm": 1.1128793954849243, |
|
"learning_rate": 1.4033062900981025e-05, |
|
"loss": 1.0372, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 0.7230387996877016, |
|
"grad_norm": 1.5695431232452393, |
|
"learning_rate": 1.386333548321396e-05, |
|
"loss": 1.1535, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 0.7264333480430428, |
|
"grad_norm": 1.327945351600647, |
|
"learning_rate": 1.3693608065446892e-05, |
|
"loss": 1.1107, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 0.7298278963983842, |
|
"grad_norm": 1.3291347026824951, |
|
"learning_rate": 1.3523880647679826e-05, |
|
"loss": 1.1303, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.7332224447537256, |
|
"grad_norm": 1.3102412223815918, |
|
"learning_rate": 1.3354153229912761e-05, |
|
"loss": 1.1271, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 0.7366169931090668, |
|
"grad_norm": 1.532332181930542, |
|
"learning_rate": 1.3184425812145695e-05, |
|
"loss": 1.1259, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 0.7400115414644082, |
|
"grad_norm": 1.8076393604278564, |
|
"learning_rate": 1.301469839437863e-05, |
|
"loss": 1.0277, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 0.7434060898197494, |
|
"grad_norm": 1.560998558998108, |
|
"learning_rate": 1.2844970976611562e-05, |
|
"loss": 1.0944, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 0.7468006381750908, |
|
"grad_norm": 1.4512039422988892, |
|
"learning_rate": 1.2675243558844496e-05, |
|
"loss": 1.1439, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.7501951865304322, |
|
"grad_norm": 1.1790564060211182, |
|
"learning_rate": 1.2505516141077431e-05, |
|
"loss": 1.1109, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 0.7535897348857734, |
|
"grad_norm": 0.8725073337554932, |
|
"learning_rate": 1.2337485997488036e-05, |
|
"loss": 1.1064, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 0.7569842832411148, |
|
"grad_norm": 1.7705230712890625, |
|
"learning_rate": 1.2167758579720968e-05, |
|
"loss": 1.1094, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 0.7603788315964561, |
|
"grad_norm": 1.72670578956604, |
|
"learning_rate": 1.1998031161953903e-05, |
|
"loss": 1.0784, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 0.7637733799517974, |
|
"grad_norm": 1.0623925924301147, |
|
"learning_rate": 1.1828303744186837e-05, |
|
"loss": 1.1441, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.7671679283071388, |
|
"grad_norm": 1.4572324752807617, |
|
"learning_rate": 1.165857632641977e-05, |
|
"loss": 1.0754, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 0.77056247666248, |
|
"grad_norm": 1.4778876304626465, |
|
"learning_rate": 1.1488848908652704e-05, |
|
"loss": 1.0816, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 0.7739570250178214, |
|
"grad_norm": 1.5544917583465576, |
|
"learning_rate": 1.1319121490885638e-05, |
|
"loss": 1.1449, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 0.7773515733731627, |
|
"grad_norm": 1.4993566274642944, |
|
"learning_rate": 1.1149394073118571e-05, |
|
"loss": 1.0315, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 0.780746121728504, |
|
"grad_norm": 1.5602749586105347, |
|
"learning_rate": 1.0979666655351507e-05, |
|
"loss": 1.151, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.7841406700838454, |
|
"grad_norm": 1.2788993120193481, |
|
"learning_rate": 1.080993923758444e-05, |
|
"loss": 1.1134, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 0.7875352184391866, |
|
"grad_norm": 1.4655214548110962, |
|
"learning_rate": 1.0640211819817374e-05, |
|
"loss": 1.1033, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 0.790929766794528, |
|
"grad_norm": 1.4986985921859741, |
|
"learning_rate": 1.0470484402050308e-05, |
|
"loss": 1.1025, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 0.7943243151498693, |
|
"grad_norm": 1.651713490486145, |
|
"learning_rate": 1.0300756984283241e-05, |
|
"loss": 1.173, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 0.7977188635052106, |
|
"grad_norm": 1.4241468906402588, |
|
"learning_rate": 1.0131029566516175e-05, |
|
"loss": 1.1499, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.801113411860552, |
|
"grad_norm": 1.4987541437149048, |
|
"learning_rate": 9.961302148749109e-06, |
|
"loss": 1.0355, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 0.8045079602158933, |
|
"grad_norm": 1.6847175359725952, |
|
"learning_rate": 9.791574730982044e-06, |
|
"loss": 1.0974, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 0.8079025085712346, |
|
"grad_norm": 1.319767713546753, |
|
"learning_rate": 9.621847313214976e-06, |
|
"loss": 1.0246, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 0.8112970569265759, |
|
"grad_norm": 0.8837277293205261, |
|
"learning_rate": 9.452119895447911e-06, |
|
"loss": 1.0828, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 0.8146916052819172, |
|
"grad_norm": 1.3911470174789429, |
|
"learning_rate": 9.282392477680845e-06, |
|
"loss": 1.1347, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.8180861536372586, |
|
"grad_norm": 1.6268776655197144, |
|
"learning_rate": 9.112665059913779e-06, |
|
"loss": 1.1426, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 0.8214807019925999, |
|
"grad_norm": 1.229019284248352, |
|
"learning_rate": 8.942937642146714e-06, |
|
"loss": 1.147, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 0.8248752503479412, |
|
"grad_norm": 1.4097239971160889, |
|
"learning_rate": 8.773210224379646e-06, |
|
"loss": 1.1377, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 0.8282697987032825, |
|
"grad_norm": 1.1406160593032837, |
|
"learning_rate": 8.603482806612581e-06, |
|
"loss": 1.1035, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 0.8316643470586238, |
|
"grad_norm": 1.0381433963775635, |
|
"learning_rate": 8.433755388845515e-06, |
|
"loss": 1.091, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.8350588954139652, |
|
"grad_norm": 1.3789398670196533, |
|
"learning_rate": 8.264027971078449e-06, |
|
"loss": 1.0108, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 0.8384534437693065, |
|
"grad_norm": 1.2343610525131226, |
|
"learning_rate": 8.094300553311382e-06, |
|
"loss": 1.1196, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 0.8418479921246478, |
|
"grad_norm": 1.3978173732757568, |
|
"learning_rate": 7.924573135544316e-06, |
|
"loss": 1.0261, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 0.8452425404799891, |
|
"grad_norm": 1.2678471803665161, |
|
"learning_rate": 7.75484571777725e-06, |
|
"loss": 1.0813, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 0.8486370888353305, |
|
"grad_norm": 1.5607575178146362, |
|
"learning_rate": 7.585118300010184e-06, |
|
"loss": 1.167, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.8520316371906718, |
|
"grad_norm": 1.1577645540237427, |
|
"learning_rate": 7.415390882243117e-06, |
|
"loss": 1.1326, |
|
"step": 25100 |
|
}, |
|
{ |
|
"epoch": 0.8554261855460131, |
|
"grad_norm": 1.0699902772903442, |
|
"learning_rate": 7.2456634644760515e-06, |
|
"loss": 1.0751, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 0.8588207339013544, |
|
"grad_norm": 1.3334201574325562, |
|
"learning_rate": 7.075936046708986e-06, |
|
"loss": 1.083, |
|
"step": 25300 |
|
}, |
|
{ |
|
"epoch": 0.8622152822566957, |
|
"grad_norm": 1.493215799331665, |
|
"learning_rate": 6.90620862894192e-06, |
|
"loss": 1.152, |
|
"step": 25400 |
|
}, |
|
{ |
|
"epoch": 0.8656098306120371, |
|
"grad_norm": 1.5618408918380737, |
|
"learning_rate": 6.736481211174854e-06, |
|
"loss": 1.0474, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.8690043789673784, |
|
"grad_norm": 1.4898067712783813, |
|
"learning_rate": 6.566753793407787e-06, |
|
"loss": 1.1261, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 0.8723989273227197, |
|
"grad_norm": 1.1436446905136108, |
|
"learning_rate": 6.3970263756407215e-06, |
|
"loss": 1.0708, |
|
"step": 25700 |
|
}, |
|
{ |
|
"epoch": 0.875793475678061, |
|
"grad_norm": 1.4544737339019775, |
|
"learning_rate": 6.227298957873655e-06, |
|
"loss": 1.0788, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 0.8791880240334023, |
|
"grad_norm": 1.0729115009307861, |
|
"learning_rate": 6.05757154010659e-06, |
|
"loss": 1.0525, |
|
"step": 25900 |
|
}, |
|
{ |
|
"epoch": 0.8825825723887437, |
|
"grad_norm": 1.5516784191131592, |
|
"learning_rate": 5.887844122339523e-06, |
|
"loss": 1.0874, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.885977120744085, |
|
"grad_norm": 1.4837692975997925, |
|
"learning_rate": 5.718116704572457e-06, |
|
"loss": 1.1142, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 0.8893716690994263, |
|
"grad_norm": 1.1491631269454956, |
|
"learning_rate": 5.548389286805391e-06, |
|
"loss": 1.0618, |
|
"step": 26200 |
|
}, |
|
{ |
|
"epoch": 0.8927662174547677, |
|
"grad_norm": 1.5417340993881226, |
|
"learning_rate": 5.378661869038324e-06, |
|
"loss": 0.993, |
|
"step": 26300 |
|
}, |
|
{ |
|
"epoch": 0.896160765810109, |
|
"grad_norm": 0.9728216528892517, |
|
"learning_rate": 5.208934451271259e-06, |
|
"loss": 1.1583, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 0.8995553141654503, |
|
"grad_norm": 1.0447022914886475, |
|
"learning_rate": 5.0392070335041925e-06, |
|
"loss": 1.0472, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.9029498625207916, |
|
"grad_norm": 1.2869070768356323, |
|
"learning_rate": 4.869479615737126e-06, |
|
"loss": 1.0823, |
|
"step": 26600 |
|
}, |
|
{ |
|
"epoch": 0.9063444108761329, |
|
"grad_norm": 1.6531902551651, |
|
"learning_rate": 4.69975219797006e-06, |
|
"loss": 1.0836, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 0.9097389592314743, |
|
"grad_norm": 1.559571385383606, |
|
"learning_rate": 4.530024780202994e-06, |
|
"loss": 1.0009, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 0.9131335075868156, |
|
"grad_norm": 1.3163347244262695, |
|
"learning_rate": 4.360297362435928e-06, |
|
"loss": 1.1214, |
|
"step": 26900 |
|
}, |
|
{ |
|
"epoch": 0.9165280559421569, |
|
"grad_norm": 1.1032936573028564, |
|
"learning_rate": 4.1905699446688625e-06, |
|
"loss": 1.113, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.9199226042974982, |
|
"grad_norm": 1.4257267713546753, |
|
"learning_rate": 4.020842526901796e-06, |
|
"loss": 1.0477, |
|
"step": 27100 |
|
}, |
|
{ |
|
"epoch": 0.9233171526528395, |
|
"grad_norm": 2.0018675327301025, |
|
"learning_rate": 3.85111510913473e-06, |
|
"loss": 1.1487, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 0.9267117010081809, |
|
"grad_norm": 1.38235342502594, |
|
"learning_rate": 3.681387691367664e-06, |
|
"loss": 1.0816, |
|
"step": 27300 |
|
}, |
|
{ |
|
"epoch": 0.9301062493635222, |
|
"grad_norm": 1.4731274843215942, |
|
"learning_rate": 3.5116602736005976e-06, |
|
"loss": 1.0882, |
|
"step": 27400 |
|
}, |
|
{ |
|
"epoch": 0.9335007977188635, |
|
"grad_norm": 1.225797414779663, |
|
"learning_rate": 3.3419328558335317e-06, |
|
"loss": 1.0551, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.9368953460742049, |
|
"grad_norm": 1.597345232963562, |
|
"learning_rate": 3.1722054380664653e-06, |
|
"loss": 1.0565, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 0.9402898944295461, |
|
"grad_norm": 1.092685341835022, |
|
"learning_rate": 3.0024780202993994e-06, |
|
"loss": 1.0821, |
|
"step": 27700 |
|
}, |
|
{ |
|
"epoch": 0.9436844427848875, |
|
"grad_norm": 1.3143861293792725, |
|
"learning_rate": 2.832750602532333e-06, |
|
"loss": 1.0049, |
|
"step": 27800 |
|
}, |
|
{ |
|
"epoch": 0.9470789911402288, |
|
"grad_norm": 1.1611847877502441, |
|
"learning_rate": 2.663023184765267e-06, |
|
"loss": 1.031, |
|
"step": 27900 |
|
}, |
|
{ |
|
"epoch": 0.9504735394955701, |
|
"grad_norm": 1.3087385892868042, |
|
"learning_rate": 2.4932957669982012e-06, |
|
"loss": 0.9714, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.9538680878509115, |
|
"grad_norm": 1.118117094039917, |
|
"learning_rate": 2.323568349231135e-06, |
|
"loss": 1.0057, |
|
"step": 28100 |
|
}, |
|
{ |
|
"epoch": 0.9572626362062527, |
|
"grad_norm": 1.5035566091537476, |
|
"learning_rate": 2.1538409314640686e-06, |
|
"loss": 1.1181, |
|
"step": 28200 |
|
}, |
|
{ |
|
"epoch": 0.9606571845615941, |
|
"grad_norm": 1.372116208076477, |
|
"learning_rate": 1.9841135136970026e-06, |
|
"loss": 1.042, |
|
"step": 28300 |
|
}, |
|
{ |
|
"epoch": 0.9640517329169354, |
|
"grad_norm": 1.189626693725586, |
|
"learning_rate": 1.8160833701076074e-06, |
|
"loss": 1.0346, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 0.9674462812722767, |
|
"grad_norm": 1.6013319492340088, |
|
"learning_rate": 1.6463559523405412e-06, |
|
"loss": 1.06, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.9708408296276181, |
|
"grad_norm": 1.7186366319656372, |
|
"learning_rate": 1.4766285345734751e-06, |
|
"loss": 1.1076, |
|
"step": 28600 |
|
}, |
|
{ |
|
"epoch": 0.9742353779829593, |
|
"grad_norm": 1.3533858060836792, |
|
"learning_rate": 1.306901116806409e-06, |
|
"loss": 1.1391, |
|
"step": 28700 |
|
}, |
|
{ |
|
"epoch": 0.9776299263383007, |
|
"grad_norm": 1.34947669506073, |
|
"learning_rate": 1.1371736990393429e-06, |
|
"loss": 1.0501, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 0.981024474693642, |
|
"grad_norm": 1.5835421085357666, |
|
"learning_rate": 9.674462812722767e-07, |
|
"loss": 1.1168, |
|
"step": 28900 |
|
}, |
|
{ |
|
"epoch": 0.9844190230489833, |
|
"grad_norm": 1.2659107446670532, |
|
"learning_rate": 7.977188635052106e-07, |
|
"loss": 1.0592, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.9878135714043247, |
|
"grad_norm": 1.212120532989502, |
|
"learning_rate": 6.279914457381446e-07, |
|
"loss": 1.0867, |
|
"step": 29100 |
|
}, |
|
{ |
|
"epoch": 0.9912081197596659, |
|
"grad_norm": 1.5085951089859009, |
|
"learning_rate": 4.582640279710785e-07, |
|
"loss": 1.0909, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 0.9946026681150073, |
|
"grad_norm": 1.6492177248001099, |
|
"learning_rate": 2.885366102040124e-07, |
|
"loss": 1.0747, |
|
"step": 29300 |
|
}, |
|
{ |
|
"epoch": 0.9979972164703487, |
|
"grad_norm": 1.368004322052002, |
|
"learning_rate": 1.1880919243694626e-07, |
|
"loss": 0.9943, |
|
"step": 29400 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 29459, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 29459, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.93185924572119e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|