{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.896817068905212,
  "eval_steps": 500,
  "global_step": 21000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.023318176518596245,
      "grad_norm": 0.3708130121231079,
      "learning_rate": 0.0002,
      "loss": 1.1701,
      "step": 100
    },
    {
      "epoch": 0.04663635303719249,
      "grad_norm": 0.7055436968803406,
      "learning_rate": 0.0002,
      "loss": 0.9527,
      "step": 200
    },
    {
      "epoch": 0.06995452955578874,
      "grad_norm": 0.310996949672699,
      "learning_rate": 0.0002,
      "loss": 0.871,
      "step": 300
    },
    {
      "epoch": 0.09327270607438498,
      "grad_norm": 0.34611570835113525,
      "learning_rate": 0.0002,
      "loss": 0.8128,
      "step": 400
    },
    {
      "epoch": 0.11659088259298123,
      "grad_norm": 0.2793200612068176,
      "learning_rate": 0.0002,
      "loss": 0.8008,
      "step": 500
    },
    {
      "epoch": 0.13990905911157747,
      "grad_norm": 0.2440558820962906,
      "learning_rate": 0.0002,
      "loss": 0.7364,
      "step": 600
    },
    {
      "epoch": 0.16322723563017372,
      "grad_norm": 0.20660006999969482,
      "learning_rate": 0.0002,
      "loss": 0.7016,
      "step": 700
    },
    {
      "epoch": 0.18654541214876996,
      "grad_norm": 0.3151717782020569,
      "learning_rate": 0.0002,
      "loss": 0.6986,
      "step": 800
    },
    {
      "epoch": 0.2098635886673662,
      "grad_norm": 0.4207448363304138,
      "learning_rate": 0.0002,
      "loss": 0.649,
      "step": 900
    },
    {
      "epoch": 0.23318176518596245,
      "grad_norm": 0.43152570724487305,
      "learning_rate": 0.0002,
      "loss": 0.6725,
      "step": 1000
    },
    {
      "epoch": 0.2564999417045587,
      "grad_norm": 0.31539487838745117,
      "learning_rate": 0.0002,
      "loss": 0.6395,
      "step": 1100
    },
    {
      "epoch": 0.27981811822315494,
      "grad_norm": 0.3349384665489197,
      "learning_rate": 0.0002,
      "loss": 0.6033,
      "step": 1200
    },
    {
      "epoch": 0.3031362947417512,
      "grad_norm": 0.2724147140979767,
      "learning_rate": 0.0002,
      "loss": 0.6076,
      "step": 1300
    },
    {
      "epoch": 0.32645447126034743,
      "grad_norm": 0.2925530970096588,
      "learning_rate": 0.0002,
      "loss": 0.585,
      "step": 1400
    },
    {
      "epoch": 0.3497726477789437,
      "grad_norm": 0.4674293100833893,
      "learning_rate": 0.0002,
      "loss": 0.5657,
      "step": 1500
    },
    {
      "epoch": 0.3730908242975399,
      "grad_norm": 0.3915441930294037,
      "learning_rate": 0.0002,
      "loss": 0.5453,
      "step": 1600
    },
    {
      "epoch": 0.39640900081613617,
      "grad_norm": 0.24304556846618652,
      "learning_rate": 0.0002,
      "loss": 0.5198,
      "step": 1700
    },
    {
      "epoch": 0.4197271773347324,
      "grad_norm": 0.5447902679443359,
      "learning_rate": 0.0002,
      "loss": 0.5427,
      "step": 1800
    },
    {
      "epoch": 0.44304535385332866,
      "grad_norm": 0.4133426547050476,
      "learning_rate": 0.0002,
      "loss": 0.5204,
      "step": 1900
    },
    {
      "epoch": 0.4663635303719249,
      "grad_norm": 0.41733473539352417,
      "learning_rate": 0.0002,
      "loss": 0.5204,
      "step": 2000
    },
    {
      "epoch": 0.48968170689052115,
      "grad_norm": 0.3181161880493164,
      "learning_rate": 0.0002,
      "loss": 0.4698,
      "step": 2100
    },
    {
      "epoch": 0.5129998834091174,
      "grad_norm": 0.34142622351646423,
      "learning_rate": 0.0002,
      "loss": 0.4871,
      "step": 2200
    },
    {
      "epoch": 0.5363180599277136,
      "grad_norm": 0.1926470398902893,
      "learning_rate": 0.0002,
      "loss": 0.4649,
      "step": 2300
    },
    {
      "epoch": 0.5596362364463099,
      "grad_norm": 0.30340591073036194,
      "learning_rate": 0.0002,
      "loss": 0.4665,
      "step": 2400
    },
    {
      "epoch": 0.5829544129649061,
      "grad_norm": 0.3195839524269104,
      "learning_rate": 0.0002,
      "loss": 0.4667,
      "step": 2500
    },
    {
      "epoch": 0.6062725894835024,
      "grad_norm": 0.2145429104566574,
      "learning_rate": 0.0002,
      "loss": 0.4463,
      "step": 2600
    },
    {
      "epoch": 0.6295907660020986,
      "grad_norm": 0.15962275862693787,
      "learning_rate": 0.0002,
      "loss": 0.429,
      "step": 2700
    },
    {
      "epoch": 0.6529089425206949,
      "grad_norm": 0.3597501516342163,
      "learning_rate": 0.0002,
      "loss": 0.4277,
      "step": 2800
    },
    {
      "epoch": 0.6762271190392911,
      "grad_norm": 0.44612497091293335,
      "learning_rate": 0.0002,
      "loss": 0.4123,
      "step": 2900
    },
    {
      "epoch": 0.6995452955578874,
      "grad_norm": 0.21562007069587708,
      "learning_rate": 0.0002,
      "loss": 0.4074,
      "step": 3000
    },
    {
      "epoch": 0.7228634720764836,
      "grad_norm": 0.23217037320137024,
      "learning_rate": 0.0002,
      "loss": 0.4037,
      "step": 3100
    },
    {
      "epoch": 0.7461816485950798,
      "grad_norm": 0.3096787631511688,
      "learning_rate": 0.0002,
      "loss": 0.401,
      "step": 3200
    },
    {
      "epoch": 0.7694998251136761,
      "grad_norm": 0.18558426201343536,
      "learning_rate": 0.0002,
      "loss": 0.3983,
      "step": 3300
    },
    {
      "epoch": 0.7928180016322723,
      "grad_norm": 0.2520066797733307,
      "learning_rate": 0.0002,
      "loss": 0.4056,
      "step": 3400
    },
    {
      "epoch": 0.8161361781508686,
      "grad_norm": 0.41013041138648987,
      "learning_rate": 0.0002,
      "loss": 0.3706,
      "step": 3500
    },
    {
      "epoch": 0.8394543546694648,
      "grad_norm": 0.14811871945858002,
      "learning_rate": 0.0002,
      "loss": 0.3829,
      "step": 3600
    },
    {
      "epoch": 0.8627725311880611,
      "grad_norm": 0.36381468176841736,
      "learning_rate": 0.0002,
      "loss": 0.3744,
      "step": 3700
    },
    {
      "epoch": 0.8860907077066573,
      "grad_norm": 0.28783467411994934,
      "learning_rate": 0.0002,
      "loss": 0.3538,
      "step": 3800
    },
    {
      "epoch": 0.9094088842252536,
      "grad_norm": 0.23508860170841217,
      "learning_rate": 0.0002,
      "loss": 0.3277,
      "step": 3900
    },
    {
      "epoch": 0.9327270607438498,
      "grad_norm": 0.3819214403629303,
      "learning_rate": 0.0002,
      "loss": 0.3317,
      "step": 4000
    },
    {
      "epoch": 0.9560452372624461,
      "grad_norm": 0.298714816570282,
      "learning_rate": 0.0002,
      "loss": 0.3329,
      "step": 4100
    },
    {
      "epoch": 0.9793634137810423,
      "grad_norm": 0.17287446558475494,
      "learning_rate": 0.0002,
      "loss": 0.3418,
      "step": 4200
    },
    {
      "epoch": 1.0026815902996387,
      "grad_norm": 0.3725602328777313,
      "learning_rate": 0.0002,
      "loss": 0.3224,
      "step": 4300
    },
    {
      "epoch": 1.0259997668182348,
      "grad_norm": 0.6124657988548279,
      "learning_rate": 0.0002,
      "loss": 0.2589,
      "step": 4400
    },
    {
      "epoch": 1.0493179433368311,
      "grad_norm": 0.5308946371078491,
      "learning_rate": 0.0002,
      "loss": 0.2718,
      "step": 4500
    },
    {
      "epoch": 1.0726361198554273,
      "grad_norm": 0.3070002496242523,
      "learning_rate": 0.0002,
      "loss": 0.2662,
      "step": 4600
    },
    {
      "epoch": 1.0959542963740236,
      "grad_norm": 0.44111424684524536,
      "learning_rate": 0.0002,
      "loss": 0.2516,
      "step": 4700
    },
    {
      "epoch": 1.1192724728926198,
      "grad_norm": 0.32735341787338257,
      "learning_rate": 0.0002,
      "loss": 0.2652,
      "step": 4800
    },
    {
      "epoch": 1.1425906494112161,
      "grad_norm": 0.3475642800331116,
      "learning_rate": 0.0002,
      "loss": 0.2498,
      "step": 4900
    },
    {
      "epoch": 1.1659088259298123,
      "grad_norm": 0.41938111186027527,
      "learning_rate": 0.0002,
      "loss": 0.2577,
      "step": 5000
    },
    {
      "epoch": 1.1892270024484086,
      "grad_norm": 0.47618812322616577,
      "learning_rate": 0.0002,
      "loss": 0.251,
      "step": 5100
    },
    {
      "epoch": 1.2125451789670048,
      "grad_norm": 0.27327144145965576,
      "learning_rate": 0.0002,
      "loss": 0.2511,
      "step": 5200
    },
    {
      "epoch": 1.2358633554856011,
      "grad_norm": 0.3251878321170807,
      "learning_rate": 0.0002,
      "loss": 0.2264,
      "step": 5300
    },
    {
      "epoch": 1.2591815320041972,
      "grad_norm": 0.5156410336494446,
      "learning_rate": 0.0002,
      "loss": 0.2617,
      "step": 5400
    },
    {
      "epoch": 1.2824997085227934,
      "grad_norm": 0.30861613154411316,
      "learning_rate": 0.0002,
      "loss": 0.2441,
      "step": 5500
    },
    {
      "epoch": 1.3058178850413897,
      "grad_norm": 0.43310919404029846,
      "learning_rate": 0.0002,
      "loss": 0.2331,
      "step": 5600
    },
    {
      "epoch": 1.329136061559986,
      "grad_norm": 0.36176246404647827,
      "learning_rate": 0.0002,
      "loss": 0.2431,
      "step": 5700
    },
    {
      "epoch": 1.3524542380785822,
      "grad_norm": 0.3790377974510193,
      "learning_rate": 0.0002,
      "loss": 0.2458,
      "step": 5800
    },
    {
      "epoch": 1.3757724145971786,
      "grad_norm": 0.4052121341228485,
      "learning_rate": 0.0002,
      "loss": 0.2446,
      "step": 5900
    },
    {
      "epoch": 1.3990905911157747,
      "grad_norm": 0.35783982276916504,
      "learning_rate": 0.0002,
      "loss": 0.2465,
      "step": 6000
    },
    {
      "epoch": 1.422408767634371,
      "grad_norm": 0.35436511039733887,
      "learning_rate": 0.0002,
      "loss": 0.2569,
      "step": 6100
    },
    {
      "epoch": 1.4457269441529672,
      "grad_norm": 0.2950509488582611,
      "learning_rate": 0.0002,
      "loss": 0.22,
      "step": 6200
    },
    {
      "epoch": 1.4690451206715636,
      "grad_norm": 0.36950767040252686,
      "learning_rate": 0.0002,
      "loss": 0.2433,
      "step": 6300
    },
    {
      "epoch": 1.4923632971901597,
      "grad_norm": 0.35253265500068665,
      "learning_rate": 0.0002,
      "loss": 0.2269,
      "step": 6400
    },
    {
      "epoch": 1.515681473708756,
      "grad_norm": 0.3378414213657379,
      "learning_rate": 0.0002,
      "loss": 0.2329,
      "step": 6500
    },
    {
      "epoch": 1.5389996502273522,
      "grad_norm": 0.4102073311805725,
      "learning_rate": 0.0002,
      "loss": 0.2404,
      "step": 6600
    },
    {
      "epoch": 1.5623178267459483,
      "grad_norm": 0.4430312216281891,
      "learning_rate": 0.0002,
      "loss": 0.235,
      "step": 6700
    },
    {
      "epoch": 1.5856360032645447,
      "grad_norm": 0.3363936245441437,
      "learning_rate": 0.0002,
      "loss": 0.2288,
      "step": 6800
    },
    {
      "epoch": 1.608954179783141,
      "grad_norm": 0.3177776634693146,
      "learning_rate": 0.0002,
      "loss": 0.2443,
      "step": 6900
    },
    {
      "epoch": 1.6322723563017372,
      "grad_norm": 0.33283111453056335,
      "learning_rate": 0.0002,
      "loss": 0.2267,
      "step": 7000
    },
    {
      "epoch": 1.6555905328203333,
      "grad_norm": 0.4799099564552307,
      "learning_rate": 0.0002,
      "loss": 0.2355,
      "step": 7100
    },
    {
      "epoch": 1.6789087093389297,
      "grad_norm": 0.38987642526626587,
      "learning_rate": 0.0002,
      "loss": 0.2268,
      "step": 7200
    },
    {
      "epoch": 1.702226885857526,
      "grad_norm": 0.32820141315460205,
      "learning_rate": 0.0002,
      "loss": 0.2098,
      "step": 7300
    },
    {
      "epoch": 1.7255450623761222,
      "grad_norm": 0.4211929142475128,
      "learning_rate": 0.0002,
      "loss": 0.2291,
      "step": 7400
    },
    {
      "epoch": 1.7488632388947183,
      "grad_norm": 0.42743125557899475,
      "learning_rate": 0.0002,
      "loss": 0.2192,
      "step": 7500
    },
    {
      "epoch": 1.7721814154133146,
      "grad_norm": 0.33759135007858276,
      "learning_rate": 0.0002,
      "loss": 0.2301,
      "step": 7600
    },
    {
      "epoch": 1.795499591931911,
      "grad_norm": 0.24578171968460083,
      "learning_rate": 0.0002,
      "loss": 0.2233,
      "step": 7700
    },
    {
      "epoch": 1.8188177684505071,
      "grad_norm": 0.3331544101238251,
      "learning_rate": 0.0002,
      "loss": 0.2308,
      "step": 7800
    },
    {
      "epoch": 1.8421359449691033,
      "grad_norm": 0.4028831720352173,
      "learning_rate": 0.0002,
      "loss": 0.2112,
      "step": 7900
    },
    {
      "epoch": 1.8654541214876996,
      "grad_norm": 0.3874329924583435,
      "learning_rate": 0.0002,
      "loss": 0.1998,
      "step": 8000
    },
    {
      "epoch": 1.888772298006296,
      "grad_norm": 0.30130070447921753,
      "learning_rate": 0.0002,
      "loss": 0.203,
      "step": 8100
    },
    {
      "epoch": 1.9120904745248921,
      "grad_norm": 0.41124048829078674,
      "learning_rate": 0.0002,
      "loss": 0.2184,
      "step": 8200
    },
    {
      "epoch": 1.9354086510434882,
      "grad_norm": 0.3104913532733917,
      "learning_rate": 0.0002,
      "loss": 0.2211,
      "step": 8300
    },
    {
      "epoch": 1.9587268275620846,
      "grad_norm": 0.30567994713783264,
      "learning_rate": 0.0002,
      "loss": 0.2039,
      "step": 8400
    },
    {
      "epoch": 1.982045004080681,
      "grad_norm": 0.3126045763492584,
      "learning_rate": 0.0002,
      "loss": 0.2107,
      "step": 8500
    },
    {
      "epoch": 2.0053631805992773,
      "grad_norm": 0.29460686445236206,
      "learning_rate": 0.0002,
      "loss": 0.1901,
      "step": 8600
    },
    {
      "epoch": 2.0286813571178732,
      "grad_norm": 0.4113939106464386,
      "learning_rate": 0.0002,
      "loss": 0.1621,
      "step": 8700
    },
    {
      "epoch": 2.0519995336364696,
      "grad_norm": 0.33105671405792236,
      "learning_rate": 0.0002,
      "loss": 0.1657,
      "step": 8800
    },
    {
      "epoch": 2.075317710155066,
      "grad_norm": 0.33191269636154175,
      "learning_rate": 0.0002,
      "loss": 0.1773,
      "step": 8900
    },
    {
      "epoch": 2.0986358866736623,
      "grad_norm": 0.3344513475894928,
      "learning_rate": 0.0002,
      "loss": 0.1654,
      "step": 9000
    },
    {
      "epoch": 2.121954063192258,
      "grad_norm": 0.31760096549987793,
      "learning_rate": 0.0002,
      "loss": 0.1677,
      "step": 9100
    },
    {
      "epoch": 2.1452722397108546,
      "grad_norm": 0.32853373885154724,
      "learning_rate": 0.0002,
      "loss": 0.1775,
      "step": 9200
    },
    {
      "epoch": 2.168590416229451,
      "grad_norm": 0.38260915875434875,
      "learning_rate": 0.0002,
      "loss": 0.1644,
      "step": 9300
    },
    {
      "epoch": 2.1919085927480473,
      "grad_norm": 0.3272022604942322,
      "learning_rate": 0.0002,
      "loss": 0.1632,
      "step": 9400
    },
    {
      "epoch": 2.215226769266643,
      "grad_norm": 0.40181514620780945,
      "learning_rate": 0.0002,
      "loss": 0.1672,
      "step": 9500
    },
    {
      "epoch": 2.2385449457852395,
      "grad_norm": 0.285182923078537,
      "learning_rate": 0.0002,
      "loss": 0.1695,
      "step": 9600
    },
    {
      "epoch": 2.261863122303836,
      "grad_norm": 0.3401045799255371,
      "learning_rate": 0.0002,
      "loss": 0.1658,
      "step": 9700
    },
    {
      "epoch": 2.2851812988224323,
      "grad_norm": 0.45088696479797363,
      "learning_rate": 0.0002,
      "loss": 0.173,
      "step": 9800
    },
    {
      "epoch": 2.308499475341028,
      "grad_norm": 0.09891465306282043,
      "learning_rate": 0.0002,
      "loss": 0.1725,
      "step": 9900
    },
    {
      "epoch": 2.3318176518596245,
      "grad_norm": 0.3077000081539154,
      "learning_rate": 0.0002,
      "loss": 0.1777,
      "step": 10000
    },
    {
      "epoch": 2.355135828378221,
      "grad_norm": 0.2650957703590393,
      "learning_rate": 0.0002,
      "loss": 0.1606,
      "step": 10100
    },
    {
      "epoch": 2.3784540048968172,
      "grad_norm": 0.2967466413974762,
      "learning_rate": 0.0002,
      "loss": 0.1626,
      "step": 10200
    },
    {
      "epoch": 2.401772181415413,
      "grad_norm": 0.21177765727043152,
      "learning_rate": 0.0002,
      "loss": 0.1762,
      "step": 10300
    },
    {
      "epoch": 2.4250903579340095,
      "grad_norm": 0.34562838077545166,
      "learning_rate": 0.0002,
      "loss": 0.1653,
      "step": 10400
    },
    {
      "epoch": 2.448408534452606,
      "grad_norm": 0.2537182569503784,
      "learning_rate": 0.0002,
      "loss": 0.1722,
      "step": 10500
    },
    {
      "epoch": 2.4717267109712022,
      "grad_norm": 0.22955211997032166,
      "learning_rate": 0.0002,
      "loss": 0.1713,
      "step": 10600
    },
    {
      "epoch": 2.495044887489798,
      "grad_norm": 0.3709162175655365,
      "learning_rate": 0.0002,
      "loss": 0.1679,
      "step": 10700
    },
    {
      "epoch": 2.5183630640083945,
      "grad_norm": 0.24581150710582733,
      "learning_rate": 0.0002,
      "loss": 0.1604,
      "step": 10800
    },
    {
      "epoch": 2.541681240526991,
      "grad_norm": 0.20854513347148895,
      "learning_rate": 0.0002,
      "loss": 0.1687,
      "step": 10900
    },
    {
      "epoch": 2.5649994170455868,
      "grad_norm": 0.2496633380651474,
      "learning_rate": 0.0002,
      "loss": 0.163,
      "step": 11000
    },
    {
      "epoch": 2.588317593564183,
      "grad_norm": 0.23603980243206024,
      "learning_rate": 0.0002,
      "loss": 0.1748,
      "step": 11100
    },
    {
      "epoch": 2.6116357700827795,
      "grad_norm": 0.36322489380836487,
      "learning_rate": 0.0002,
      "loss": 0.1798,
      "step": 11200
    },
    {
      "epoch": 2.634953946601376,
      "grad_norm": 0.32981303334236145,
      "learning_rate": 0.0002,
      "loss": 0.1588,
      "step": 11300
    },
    {
      "epoch": 2.658272123119972,
      "grad_norm": 0.4760492742061615,
      "learning_rate": 0.0002,
      "loss": 0.1723,
      "step": 11400
    },
    {
      "epoch": 2.681590299638568,
      "grad_norm": 0.22435927391052246,
      "learning_rate": 0.0002,
      "loss": 0.1742,
      "step": 11500
    },
    {
      "epoch": 2.7049084761571645,
      "grad_norm": 0.2695131003856659,
      "learning_rate": 0.0002,
      "loss": 0.1602,
      "step": 11600
    },
    {
      "epoch": 2.728226652675761,
      "grad_norm": 0.16897708177566528,
      "learning_rate": 0.0002,
      "loss": 0.1698,
      "step": 11700
    },
    {
      "epoch": 2.751544829194357,
      "grad_norm": 0.2540949881076813,
      "learning_rate": 0.0002,
      "loss": 0.1641,
      "step": 11800
    },
    {
      "epoch": 2.7748630057129535,
      "grad_norm": 0.40854746103286743,
      "learning_rate": 0.0002,
      "loss": 0.1747,
      "step": 11900
    },
    {
      "epoch": 2.7981811822315494,
      "grad_norm": 0.3012579679489136,
      "learning_rate": 0.0002,
      "loss": 0.1619,
      "step": 12000
    },
    {
      "epoch": 2.821499358750146,
      "grad_norm": 0.18468593060970306,
      "learning_rate": 0.0002,
      "loss": 0.1686,
      "step": 12100
    },
    {
      "epoch": 2.844817535268742,
      "grad_norm": 0.3668818175792694,
      "learning_rate": 0.0002,
      "loss": 0.1588,
      "step": 12200
    },
    {
      "epoch": 2.868135711787338,
      "grad_norm": 0.5856422185897827,
      "learning_rate": 0.0002,
      "loss": 0.1784,
      "step": 12300
    },
    {
      "epoch": 2.8914538883059344,
      "grad_norm": 0.37487712502479553,
      "learning_rate": 0.0002,
      "loss": 0.1701,
      "step": 12400
    },
    {
      "epoch": 2.9147720648245308,
      "grad_norm": 0.29282090067863464,
      "learning_rate": 0.0002,
      "loss": 0.1613,
      "step": 12500
    },
    {
      "epoch": 2.938090241343127,
      "grad_norm": 0.306607186794281,
      "learning_rate": 0.0002,
      "loss": 0.1655,
      "step": 12600
    },
    {
      "epoch": 2.9614084178617235,
      "grad_norm": 0.1990358531475067,
      "learning_rate": 0.0002,
      "loss": 0.17,
      "step": 12700
    },
    {
      "epoch": 2.9847265943803194,
      "grad_norm": 0.4855429232120514,
      "learning_rate": 0.0002,
      "loss": 0.1722,
      "step": 12800
    },
    {
      "epoch": 3.0080447708989158,
      "grad_norm": 0.39795544743537903,
      "learning_rate": 0.0002,
      "loss": 0.1548,
      "step": 12900
    },
    {
      "epoch": 3.031362947417512,
      "grad_norm": 0.3113553524017334,
      "learning_rate": 0.0002,
      "loss": 0.1396,
      "step": 13000
    },
    {
      "epoch": 3.054681123936108,
      "grad_norm": 0.3086554706096649,
      "learning_rate": 0.0002,
      "loss": 0.1364,
      "step": 13100
    },
    {
      "epoch": 3.0779993004547044,
      "grad_norm": 0.24818335473537445,
      "learning_rate": 0.0002,
      "loss": 0.1414,
      "step": 13200
    },
    {
      "epoch": 3.1013174769733007,
      "grad_norm": 0.37954941391944885,
      "learning_rate": 0.0002,
      "loss": 0.1388,
      "step": 13300
    },
    {
      "epoch": 3.124635653491897,
      "grad_norm": 0.2943727672100067,
      "learning_rate": 0.0002,
      "loss": 0.1408,
      "step": 13400
    },
    {
      "epoch": 3.147953830010493,
      "grad_norm": 0.35590696334838867,
      "learning_rate": 0.0002,
      "loss": 0.1363,
      "step": 13500
    },
    {
      "epoch": 3.1712720065290894,
      "grad_norm": 0.19578373432159424,
      "learning_rate": 0.0002,
      "loss": 0.137,
      "step": 13600
    },
    {
      "epoch": 3.1945901830476857,
      "grad_norm": 0.25028303265571594,
      "learning_rate": 0.0002,
      "loss": 0.1348,
      "step": 13700
    },
    {
      "epoch": 3.217908359566282,
      "grad_norm": 0.18405300378799438,
      "learning_rate": 0.0002,
      "loss": 0.1372,
      "step": 13800
    },
    {
      "epoch": 3.241226536084878,
      "grad_norm": 0.31417056918144226,
      "learning_rate": 0.0002,
      "loss": 0.1428,
      "step": 13900
    },
    {
      "epoch": 3.2645447126034743,
      "grad_norm": 0.22496923804283142,
      "learning_rate": 0.0002,
      "loss": 0.1378,
      "step": 14000
    },
    {
      "epoch": 3.2878628891220707,
      "grad_norm": 0.23862232267856598,
      "learning_rate": 0.0002,
      "loss": 0.1362,
      "step": 14100
    },
    {
      "epoch": 3.311181065640667,
      "grad_norm": 0.2142096310853958,
      "learning_rate": 0.0002,
      "loss": 0.139,
      "step": 14200
    },
    {
      "epoch": 3.334499242159263,
      "grad_norm": 0.2794269025325775,
      "learning_rate": 0.0002,
      "loss": 0.1376,
      "step": 14300
    },
    {
      "epoch": 3.3578174186778593,
      "grad_norm": 0.14498618245124817,
      "learning_rate": 0.0002,
      "loss": 0.1416,
      "step": 14400
    },
    {
      "epoch": 3.3811355951964557,
      "grad_norm": 0.2895399332046509,
      "learning_rate": 0.0002,
      "loss": 0.1379,
      "step": 14500
    },
    {
      "epoch": 3.404453771715052,
      "grad_norm": 0.2537992000579834,
      "learning_rate": 0.0002,
      "loss": 0.1356,
      "step": 14600
    },
    {
      "epoch": 3.427771948233648,
      "grad_norm": 0.20395183563232422,
      "learning_rate": 0.0002,
      "loss": 0.1424,
      "step": 14700
    },
    {
      "epoch": 3.4510901247522443,
      "grad_norm": 0.15283405780792236,
      "learning_rate": 0.0002,
      "loss": 0.1395,
      "step": 14800
    },
    {
      "epoch": 3.4744083012708407,
      "grad_norm": 0.4268224537372589,
      "learning_rate": 0.0002,
      "loss": 0.1359,
      "step": 14900
    },
    {
      "epoch": 3.497726477789437,
      "grad_norm": 0.22292669117450714,
      "learning_rate": 0.0002,
      "loss": 0.1386,
      "step": 15000
    },
    {
      "epoch": 3.5210446543080334,
      "grad_norm": 0.11900927871465683,
      "learning_rate": 0.0002,
      "loss": 0.1442,
      "step": 15100
    },
    {
      "epoch": 3.5443628308266293,
      "grad_norm": 0.45133286714553833,
      "learning_rate": 0.0002,
      "loss": 0.1365,
      "step": 15200
    },
    {
      "epoch": 3.5676810073452256,
      "grad_norm": 0.30186957120895386,
      "learning_rate": 0.0002,
      "loss": 0.1416,
      "step": 15300
    },
    {
      "epoch": 3.590999183863822,
      "grad_norm": 0.31408384442329407,
      "learning_rate": 0.0002,
      "loss": 0.1387,
      "step": 15400
    },
    {
      "epoch": 3.614317360382418,
      "grad_norm": 0.36072710156440735,
      "learning_rate": 0.0002,
      "loss": 0.1428,
      "step": 15500
    },
    {
      "epoch": 3.6376355369010143,
      "grad_norm": 0.28984448313713074,
      "learning_rate": 0.0002,
      "loss": 0.1393,
      "step": 15600
    },
    {
      "epoch": 3.6609537134196106,
      "grad_norm": 0.2014656662940979,
      "learning_rate": 0.0002,
      "loss": 0.1435,
      "step": 15700
    },
    {
      "epoch": 3.684271889938207,
      "grad_norm": 0.41273656487464905,
      "learning_rate": 0.0002,
      "loss": 0.1369,
      "step": 15800
    },
    {
      "epoch": 3.7075900664568033,
      "grad_norm": 0.48672163486480713,
      "learning_rate": 0.0002,
      "loss": 0.1433,
      "step": 15900
    },
    {
      "epoch": 3.7309082429753992,
      "grad_norm": 0.19120950996875763,
      "learning_rate": 0.0002,
      "loss": 0.1405,
      "step": 16000
    },
    {
      "epoch": 3.7542264194939956,
      "grad_norm": 0.19792740046977997,
      "learning_rate": 0.0002,
      "loss": 0.1451,
      "step": 16100
    },
    {
      "epoch": 3.777544596012592,
      "grad_norm": 0.14919213950634003,
      "learning_rate": 0.0002,
      "loss": 0.1382,
      "step": 16200
    },
    {
      "epoch": 3.800862772531188,
      "grad_norm": 0.4650104343891144,
      "learning_rate": 0.0002,
      "loss": 0.1339,
      "step": 16300
    },
    {
      "epoch": 3.8241809490497842,
      "grad_norm": 0.3627985417842865,
      "learning_rate": 0.0002,
      "loss": 0.1422,
      "step": 16400
    },
    {
      "epoch": 3.8474991255683806,
      "grad_norm": 0.7782896161079407,
      "learning_rate": 0.0002,
      "loss": 0.1432,
      "step": 16500
    },
    {
      "epoch": 3.870817302086977,
      "grad_norm": 0.2858645021915436,
      "learning_rate": 0.0002,
      "loss": 0.1413,
      "step": 16600
    },
    {
      "epoch": 3.8941354786055733,
      "grad_norm": 0.22150644659996033,
      "learning_rate": 0.0002,
      "loss": 0.1437,
      "step": 16700
    },
    {
      "epoch": 3.917453655124169,
      "grad_norm": 0.3596114218235016,
      "learning_rate": 0.0002,
      "loss": 0.1463,
      "step": 16800
    },
    {
      "epoch": 3.9407718316427656,
      "grad_norm": 0.14949366450309753,
      "learning_rate": 0.0002,
      "loss": 0.1449,
      "step": 16900
    },
    {
      "epoch": 3.964090008161362,
      "grad_norm": 0.32889851927757263,
      "learning_rate": 0.0002,
      "loss": 0.1396,
      "step": 17000
    },
    {
      "epoch": 3.987408184679958,
      "grad_norm": 0.1940721869468689,
      "learning_rate": 0.0002,
      "loss": 0.14,
      "step": 17100
    },
    {
      "epoch": 4.010726361198555,
      "grad_norm": 0.1328798085451126,
      "learning_rate": 0.0002,
      "loss": 0.1316,
      "step": 17200
    },
    {
      "epoch": 4.0340445377171505,
      "grad_norm": 0.09979192912578583,
      "learning_rate": 0.0002,
      "loss": 0.1224,
      "step": 17300
    },
    {
      "epoch": 4.0573627142357465,
      "grad_norm": 0.22828274965286255,
      "learning_rate": 0.0002,
      "loss": 0.1184,
      "step": 17400
    },
    {
      "epoch": 4.080680890754343,
      "grad_norm": 0.1396108716726303,
      "learning_rate": 0.0002,
      "loss": 0.1189,
      "step": 17500
    },
    {
      "epoch": 4.103999067272939,
      "grad_norm": 0.1849929839372635,
      "learning_rate": 0.0002,
      "loss": 0.1231,
      "step": 17600
    },
    {
      "epoch": 4.127317243791535,
      "grad_norm": 0.14947502315044403,
      "learning_rate": 0.0002,
      "loss": 0.1158,
      "step": 17700
    },
    {
      "epoch": 4.150635420310132,
      "grad_norm": 0.3471536934375763,
      "learning_rate": 0.0002,
      "loss": 0.1204,
      "step": 17800
    },
    {
      "epoch": 4.173953596828728,
      "grad_norm": 0.23290419578552246,
      "learning_rate": 0.0002,
      "loss": 0.1175,
      "step": 17900
    },
    {
      "epoch": 4.197271773347325,
      "grad_norm": 0.17477743327617645,
      "learning_rate": 0.0002,
      "loss": 0.1205,
      "step": 18000
    },
    {
      "epoch": 4.2205899498659205,
      "grad_norm": 0.1214243695139885,
      "learning_rate": 0.0002,
      "loss": 0.1188,
      "step": 18100
    },
    {
      "epoch": 4.243908126384516,
      "grad_norm": 0.12706777453422546,
      "learning_rate": 0.0002,
      "loss": 0.1196,
      "step": 18200
    },
    {
      "epoch": 4.267226302903113,
      "grad_norm": 0.18115375936031342,
      "learning_rate": 0.0002,
      "loss": 0.1179,
      "step": 18300
    },
    {
      "epoch": 4.290544479421709,
      "grad_norm": 0.05149231478571892,
      "learning_rate": 0.0002,
      "loss": 0.1224,
      "step": 18400
    },
    {
      "epoch": 4.313862655940305,
      "grad_norm": 0.47274354100227356,
      "learning_rate": 0.0002,
      "loss": 0.1192,
      "step": 18500
    },
    {
      "epoch": 4.337180832458902,
      "grad_norm": 0.218338742852211,
      "learning_rate": 0.0002,
      "loss": 0.1244,
      "step": 18600
    },
    {
      "epoch": 4.360499008977498,
      "grad_norm": 0.1247347891330719,
      "learning_rate": 0.0002,
      "loss": 0.1267,
      "step": 18700
    },
    {
      "epoch": 4.383817185496095,
      "grad_norm": 0.2586764991283417,
      "learning_rate": 0.0002,
      "loss": 0.1236,
      "step": 18800
    },
    {
      "epoch": 4.4071353620146905,
      "grad_norm": 0.11474807560443878,
      "learning_rate": 0.0002,
      "loss": 0.1252,
      "step": 18900
    },
    {
      "epoch": 4.430453538533286,
      "grad_norm": 0.34646329283714294,
      "learning_rate": 0.0002,
      "loss": 0.1237,
      "step": 19000
    },
    {
      "epoch": 4.453771715051883,
      "grad_norm": 0.17445826530456543,
      "learning_rate": 0.0002,
      "loss": 0.1183,
      "step": 19100
    },
    {
      "epoch": 4.477089891570479,
      "grad_norm": 0.3867531716823578,
      "learning_rate": 0.0002,
      "loss": 0.1248,
      "step": 19200
    },
    {
      "epoch": 4.500408068089076,
      "grad_norm": 0.15927106142044067,
      "learning_rate": 0.0002,
      "loss": 0.1258,
      "step": 19300
    },
    {
      "epoch": 4.523726244607672,
      "grad_norm": 0.2284346967935562,
      "learning_rate": 0.0002,
      "loss": 0.1244,
      "step": 19400
    },
    {
      "epoch": 4.547044421126268,
      "grad_norm": 0.3231777250766754,
      "learning_rate": 0.0002,
      "loss": 0.1257,
      "step": 19500
    },
    {
      "epoch": 4.5703625976448645,
      "grad_norm": 0.10116703063249588,
      "learning_rate": 0.0002,
      "loss": 0.1293,
      "step": 19600
    },
    {
      "epoch": 4.59368077416346,
      "grad_norm": 0.2922173738479614,
      "learning_rate": 0.0002,
      "loss": 0.1262,
      "step": 19700
    },
    {
      "epoch": 4.616998950682056,
      "grad_norm": 0.1958065629005432,
      "learning_rate": 0.0002,
      "loss": 0.1258,
      "step": 19800
    },
    {
      "epoch": 4.640317127200653,
      "grad_norm": 0.08755222707986832,
      "learning_rate": 0.0002,
      "loss": 0.1293,
      "step": 19900
    },
    {
      "epoch": 4.663635303719249,
      "grad_norm": 0.1416950523853302,
      "learning_rate": 0.0002,
      "loss": 0.1227,
      "step": 20000
    },
    {
      "epoch": 4.686953480237845,
      "grad_norm": 0.21383579075336456,
      "learning_rate": 0.0002,
      "loss": 0.1272,
      "step": 20100
    },
    {
      "epoch": 4.710271656756442,
      "grad_norm": 0.27910149097442627,
      "learning_rate": 0.0002,
      "loss": 0.1298,
      "step": 20200
    },
    {
      "epoch": 4.733589833275038,
      "grad_norm": 0.07715137302875519,
      "learning_rate": 0.0002,
      "loss": 0.1266,
      "step": 20300
    },
    {
      "epoch": 4.7569080097936345,
      "grad_norm": 0.08127077668905258,
      "learning_rate": 0.0002,
      "loss": 0.1269,
      "step": 20400
    },
    {
      "epoch": 4.78022618631223,
      "grad_norm": 0.3075973391532898,
      "learning_rate": 0.0002,
      "loss": 0.1308,
      "step": 20500
    },
    {
      "epoch": 4.803544362830826,
      "grad_norm": 0.23989351093769073,
      "learning_rate": 0.0002,
      "loss": 0.1217,
      "step": 20600
    },
    {
      "epoch": 4.826862539349423,
      "grad_norm": 0.1361120343208313,
      "learning_rate": 0.0002,
      "loss": 0.1237,
      "step": 20700
    },
    {
      "epoch": 4.850180715868019,
      "grad_norm": 0.3711351156234741,
      "learning_rate": 0.0002,
      "loss": 0.1248,
      "step": 20800
    },
    {
      "epoch": 4.873498892386616,
      "grad_norm": 0.3196912109851837,
      "learning_rate": 0.0002,
      "loss": 0.1236,
      "step": 20900
    },
    {
      "epoch": 4.896817068905212,
      "grad_norm": 0.10089880973100662,
      "learning_rate": 0.0002,
      "loss": 0.1248,
      "step": 21000
    }
  ],
  "logging_steps": 100,
  "max_steps": 21440,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 3000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.9056254817400013e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}