Vinallama_patch_file_v2 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.896817068905212,
"eval_steps": 500,
"global_step": 21000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.023318176518596245,
"grad_norm": 0.3708130121231079,
"learning_rate": 0.0002,
"loss": 1.1701,
"step": 100
},
{
"epoch": 0.04663635303719249,
"grad_norm": 0.7055436968803406,
"learning_rate": 0.0002,
"loss": 0.9527,
"step": 200
},
{
"epoch": 0.06995452955578874,
"grad_norm": 0.310996949672699,
"learning_rate": 0.0002,
"loss": 0.871,
"step": 300
},
{
"epoch": 0.09327270607438498,
"grad_norm": 0.34611570835113525,
"learning_rate": 0.0002,
"loss": 0.8128,
"step": 400
},
{
"epoch": 0.11659088259298123,
"grad_norm": 0.2793200612068176,
"learning_rate": 0.0002,
"loss": 0.8008,
"step": 500
},
{
"epoch": 0.13990905911157747,
"grad_norm": 0.2440558820962906,
"learning_rate": 0.0002,
"loss": 0.7364,
"step": 600
},
{
"epoch": 0.16322723563017372,
"grad_norm": 0.20660006999969482,
"learning_rate": 0.0002,
"loss": 0.7016,
"step": 700
},
{
"epoch": 0.18654541214876996,
"grad_norm": 0.3151717782020569,
"learning_rate": 0.0002,
"loss": 0.6986,
"step": 800
},
{
"epoch": 0.2098635886673662,
"grad_norm": 0.4207448363304138,
"learning_rate": 0.0002,
"loss": 0.649,
"step": 900
},
{
"epoch": 0.23318176518596245,
"grad_norm": 0.43152570724487305,
"learning_rate": 0.0002,
"loss": 0.6725,
"step": 1000
},
{
"epoch": 0.2564999417045587,
"grad_norm": 0.31539487838745117,
"learning_rate": 0.0002,
"loss": 0.6395,
"step": 1100
},
{
"epoch": 0.27981811822315494,
"grad_norm": 0.3349384665489197,
"learning_rate": 0.0002,
"loss": 0.6033,
"step": 1200
},
{
"epoch": 0.3031362947417512,
"grad_norm": 0.2724147140979767,
"learning_rate": 0.0002,
"loss": 0.6076,
"step": 1300
},
{
"epoch": 0.32645447126034743,
"grad_norm": 0.2925530970096588,
"learning_rate": 0.0002,
"loss": 0.585,
"step": 1400
},
{
"epoch": 0.3497726477789437,
"grad_norm": 0.4674293100833893,
"learning_rate": 0.0002,
"loss": 0.5657,
"step": 1500
},
{
"epoch": 0.3730908242975399,
"grad_norm": 0.3915441930294037,
"learning_rate": 0.0002,
"loss": 0.5453,
"step": 1600
},
{
"epoch": 0.39640900081613617,
"grad_norm": 0.24304556846618652,
"learning_rate": 0.0002,
"loss": 0.5198,
"step": 1700
},
{
"epoch": 0.4197271773347324,
"grad_norm": 0.5447902679443359,
"learning_rate": 0.0002,
"loss": 0.5427,
"step": 1800
},
{
"epoch": 0.44304535385332866,
"grad_norm": 0.4133426547050476,
"learning_rate": 0.0002,
"loss": 0.5204,
"step": 1900
},
{
"epoch": 0.4663635303719249,
"grad_norm": 0.41733473539352417,
"learning_rate": 0.0002,
"loss": 0.5204,
"step": 2000
},
{
"epoch": 0.48968170689052115,
"grad_norm": 0.3181161880493164,
"learning_rate": 0.0002,
"loss": 0.4698,
"step": 2100
},
{
"epoch": 0.5129998834091174,
"grad_norm": 0.34142622351646423,
"learning_rate": 0.0002,
"loss": 0.4871,
"step": 2200
},
{
"epoch": 0.5363180599277136,
"grad_norm": 0.1926470398902893,
"learning_rate": 0.0002,
"loss": 0.4649,
"step": 2300
},
{
"epoch": 0.5596362364463099,
"grad_norm": 0.30340591073036194,
"learning_rate": 0.0002,
"loss": 0.4665,
"step": 2400
},
{
"epoch": 0.5829544129649061,
"grad_norm": 0.3195839524269104,
"learning_rate": 0.0002,
"loss": 0.4667,
"step": 2500
},
{
"epoch": 0.6062725894835024,
"grad_norm": 0.2145429104566574,
"learning_rate": 0.0002,
"loss": 0.4463,
"step": 2600
},
{
"epoch": 0.6295907660020986,
"grad_norm": 0.15962275862693787,
"learning_rate": 0.0002,
"loss": 0.429,
"step": 2700
},
{
"epoch": 0.6529089425206949,
"grad_norm": 0.3597501516342163,
"learning_rate": 0.0002,
"loss": 0.4277,
"step": 2800
},
{
"epoch": 0.6762271190392911,
"grad_norm": 0.44612497091293335,
"learning_rate": 0.0002,
"loss": 0.4123,
"step": 2900
},
{
"epoch": 0.6995452955578874,
"grad_norm": 0.21562007069587708,
"learning_rate": 0.0002,
"loss": 0.4074,
"step": 3000
},
{
"epoch": 0.7228634720764836,
"grad_norm": 0.23217037320137024,
"learning_rate": 0.0002,
"loss": 0.4037,
"step": 3100
},
{
"epoch": 0.7461816485950798,
"grad_norm": 0.3096787631511688,
"learning_rate": 0.0002,
"loss": 0.401,
"step": 3200
},
{
"epoch": 0.7694998251136761,
"grad_norm": 0.18558426201343536,
"learning_rate": 0.0002,
"loss": 0.3983,
"step": 3300
},
{
"epoch": 0.7928180016322723,
"grad_norm": 0.2520066797733307,
"learning_rate": 0.0002,
"loss": 0.4056,
"step": 3400
},
{
"epoch": 0.8161361781508686,
"grad_norm": 0.41013041138648987,
"learning_rate": 0.0002,
"loss": 0.3706,
"step": 3500
},
{
"epoch": 0.8394543546694648,
"grad_norm": 0.14811871945858002,
"learning_rate": 0.0002,
"loss": 0.3829,
"step": 3600
},
{
"epoch": 0.8627725311880611,
"grad_norm": 0.36381468176841736,
"learning_rate": 0.0002,
"loss": 0.3744,
"step": 3700
},
{
"epoch": 0.8860907077066573,
"grad_norm": 0.28783467411994934,
"learning_rate": 0.0002,
"loss": 0.3538,
"step": 3800
},
{
"epoch": 0.9094088842252536,
"grad_norm": 0.23508860170841217,
"learning_rate": 0.0002,
"loss": 0.3277,
"step": 3900
},
{
"epoch": 0.9327270607438498,
"grad_norm": 0.3819214403629303,
"learning_rate": 0.0002,
"loss": 0.3317,
"step": 4000
},
{
"epoch": 0.9560452372624461,
"grad_norm": 0.298714816570282,
"learning_rate": 0.0002,
"loss": 0.3329,
"step": 4100
},
{
"epoch": 0.9793634137810423,
"grad_norm": 0.17287446558475494,
"learning_rate": 0.0002,
"loss": 0.3418,
"step": 4200
},
{
"epoch": 1.0026815902996387,
"grad_norm": 0.3725602328777313,
"learning_rate": 0.0002,
"loss": 0.3224,
"step": 4300
},
{
"epoch": 1.0259997668182348,
"grad_norm": 0.6124657988548279,
"learning_rate": 0.0002,
"loss": 0.2589,
"step": 4400
},
{
"epoch": 1.0493179433368311,
"grad_norm": 0.5308946371078491,
"learning_rate": 0.0002,
"loss": 0.2718,
"step": 4500
},
{
"epoch": 1.0726361198554273,
"grad_norm": 0.3070002496242523,
"learning_rate": 0.0002,
"loss": 0.2662,
"step": 4600
},
{
"epoch": 1.0959542963740236,
"grad_norm": 0.44111424684524536,
"learning_rate": 0.0002,
"loss": 0.2516,
"step": 4700
},
{
"epoch": 1.1192724728926198,
"grad_norm": 0.32735341787338257,
"learning_rate": 0.0002,
"loss": 0.2652,
"step": 4800
},
{
"epoch": 1.1425906494112161,
"grad_norm": 0.3475642800331116,
"learning_rate": 0.0002,
"loss": 0.2498,
"step": 4900
},
{
"epoch": 1.1659088259298123,
"grad_norm": 0.41938111186027527,
"learning_rate": 0.0002,
"loss": 0.2577,
"step": 5000
},
{
"epoch": 1.1892270024484086,
"grad_norm": 0.47618812322616577,
"learning_rate": 0.0002,
"loss": 0.251,
"step": 5100
},
{
"epoch": 1.2125451789670048,
"grad_norm": 0.27327144145965576,
"learning_rate": 0.0002,
"loss": 0.2511,
"step": 5200
},
{
"epoch": 1.2358633554856011,
"grad_norm": 0.3251878321170807,
"learning_rate": 0.0002,
"loss": 0.2264,
"step": 5300
},
{
"epoch": 1.2591815320041972,
"grad_norm": 0.5156410336494446,
"learning_rate": 0.0002,
"loss": 0.2617,
"step": 5400
},
{
"epoch": 1.2824997085227934,
"grad_norm": 0.30861613154411316,
"learning_rate": 0.0002,
"loss": 0.2441,
"step": 5500
},
{
"epoch": 1.3058178850413897,
"grad_norm": 0.43310919404029846,
"learning_rate": 0.0002,
"loss": 0.2331,
"step": 5600
},
{
"epoch": 1.329136061559986,
"grad_norm": 0.36176246404647827,
"learning_rate": 0.0002,
"loss": 0.2431,
"step": 5700
},
{
"epoch": 1.3524542380785822,
"grad_norm": 0.3790377974510193,
"learning_rate": 0.0002,
"loss": 0.2458,
"step": 5800
},
{
"epoch": 1.3757724145971786,
"grad_norm": 0.4052121341228485,
"learning_rate": 0.0002,
"loss": 0.2446,
"step": 5900
},
{
"epoch": 1.3990905911157747,
"grad_norm": 0.35783982276916504,
"learning_rate": 0.0002,
"loss": 0.2465,
"step": 6000
},
{
"epoch": 1.422408767634371,
"grad_norm": 0.35436511039733887,
"learning_rate": 0.0002,
"loss": 0.2569,
"step": 6100
},
{
"epoch": 1.4457269441529672,
"grad_norm": 0.2950509488582611,
"learning_rate": 0.0002,
"loss": 0.22,
"step": 6200
},
{
"epoch": 1.4690451206715636,
"grad_norm": 0.36950767040252686,
"learning_rate": 0.0002,
"loss": 0.2433,
"step": 6300
},
{
"epoch": 1.4923632971901597,
"grad_norm": 0.35253265500068665,
"learning_rate": 0.0002,
"loss": 0.2269,
"step": 6400
},
{
"epoch": 1.515681473708756,
"grad_norm": 0.3378414213657379,
"learning_rate": 0.0002,
"loss": 0.2329,
"step": 6500
},
{
"epoch": 1.5389996502273522,
"grad_norm": 0.4102073311805725,
"learning_rate": 0.0002,
"loss": 0.2404,
"step": 6600
},
{
"epoch": 1.5623178267459483,
"grad_norm": 0.4430312216281891,
"learning_rate": 0.0002,
"loss": 0.235,
"step": 6700
},
{
"epoch": 1.5856360032645447,
"grad_norm": 0.3363936245441437,
"learning_rate": 0.0002,
"loss": 0.2288,
"step": 6800
},
{
"epoch": 1.608954179783141,
"grad_norm": 0.3177776634693146,
"learning_rate": 0.0002,
"loss": 0.2443,
"step": 6900
},
{
"epoch": 1.6322723563017372,
"grad_norm": 0.33283111453056335,
"learning_rate": 0.0002,
"loss": 0.2267,
"step": 7000
},
{
"epoch": 1.6555905328203333,
"grad_norm": 0.4799099564552307,
"learning_rate": 0.0002,
"loss": 0.2355,
"step": 7100
},
{
"epoch": 1.6789087093389297,
"grad_norm": 0.38987642526626587,
"learning_rate": 0.0002,
"loss": 0.2268,
"step": 7200
},
{
"epoch": 1.702226885857526,
"grad_norm": 0.32820141315460205,
"learning_rate": 0.0002,
"loss": 0.2098,
"step": 7300
},
{
"epoch": 1.7255450623761222,
"grad_norm": 0.4211929142475128,
"learning_rate": 0.0002,
"loss": 0.2291,
"step": 7400
},
{
"epoch": 1.7488632388947183,
"grad_norm": 0.42743125557899475,
"learning_rate": 0.0002,
"loss": 0.2192,
"step": 7500
},
{
"epoch": 1.7721814154133146,
"grad_norm": 0.33759135007858276,
"learning_rate": 0.0002,
"loss": 0.2301,
"step": 7600
},
{
"epoch": 1.795499591931911,
"grad_norm": 0.24578171968460083,
"learning_rate": 0.0002,
"loss": 0.2233,
"step": 7700
},
{
"epoch": 1.8188177684505071,
"grad_norm": 0.3331544101238251,
"learning_rate": 0.0002,
"loss": 0.2308,
"step": 7800
},
{
"epoch": 1.8421359449691033,
"grad_norm": 0.4028831720352173,
"learning_rate": 0.0002,
"loss": 0.2112,
"step": 7900
},
{
"epoch": 1.8654541214876996,
"grad_norm": 0.3874329924583435,
"learning_rate": 0.0002,
"loss": 0.1998,
"step": 8000
},
{
"epoch": 1.888772298006296,
"grad_norm": 0.30130070447921753,
"learning_rate": 0.0002,
"loss": 0.203,
"step": 8100
},
{
"epoch": 1.9120904745248921,
"grad_norm": 0.41124048829078674,
"learning_rate": 0.0002,
"loss": 0.2184,
"step": 8200
},
{
"epoch": 1.9354086510434882,
"grad_norm": 0.3104913532733917,
"learning_rate": 0.0002,
"loss": 0.2211,
"step": 8300
},
{
"epoch": 1.9587268275620846,
"grad_norm": 0.30567994713783264,
"learning_rate": 0.0002,
"loss": 0.2039,
"step": 8400
},
{
"epoch": 1.982045004080681,
"grad_norm": 0.3126045763492584,
"learning_rate": 0.0002,
"loss": 0.2107,
"step": 8500
},
{
"epoch": 2.0053631805992773,
"grad_norm": 0.29460686445236206,
"learning_rate": 0.0002,
"loss": 0.1901,
"step": 8600
},
{
"epoch": 2.0286813571178732,
"grad_norm": 0.4113939106464386,
"learning_rate": 0.0002,
"loss": 0.1621,
"step": 8700
},
{
"epoch": 2.0519995336364696,
"grad_norm": 0.33105671405792236,
"learning_rate": 0.0002,
"loss": 0.1657,
"step": 8800
},
{
"epoch": 2.075317710155066,
"grad_norm": 0.33191269636154175,
"learning_rate": 0.0002,
"loss": 0.1773,
"step": 8900
},
{
"epoch": 2.0986358866736623,
"grad_norm": 0.3344513475894928,
"learning_rate": 0.0002,
"loss": 0.1654,
"step": 9000
},
{
"epoch": 2.121954063192258,
"grad_norm": 0.31760096549987793,
"learning_rate": 0.0002,
"loss": 0.1677,
"step": 9100
},
{
"epoch": 2.1452722397108546,
"grad_norm": 0.32853373885154724,
"learning_rate": 0.0002,
"loss": 0.1775,
"step": 9200
},
{
"epoch": 2.168590416229451,
"grad_norm": 0.38260915875434875,
"learning_rate": 0.0002,
"loss": 0.1644,
"step": 9300
},
{
"epoch": 2.1919085927480473,
"grad_norm": 0.3272022604942322,
"learning_rate": 0.0002,
"loss": 0.1632,
"step": 9400
},
{
"epoch": 2.215226769266643,
"grad_norm": 0.40181514620780945,
"learning_rate": 0.0002,
"loss": 0.1672,
"step": 9500
},
{
"epoch": 2.2385449457852395,
"grad_norm": 0.285182923078537,
"learning_rate": 0.0002,
"loss": 0.1695,
"step": 9600
},
{
"epoch": 2.261863122303836,
"grad_norm": 0.3401045799255371,
"learning_rate": 0.0002,
"loss": 0.1658,
"step": 9700
},
{
"epoch": 2.2851812988224323,
"grad_norm": 0.45088696479797363,
"learning_rate": 0.0002,
"loss": 0.173,
"step": 9800
},
{
"epoch": 2.308499475341028,
"grad_norm": 0.09891465306282043,
"learning_rate": 0.0002,
"loss": 0.1725,
"step": 9900
},
{
"epoch": 2.3318176518596245,
"grad_norm": 0.3077000081539154,
"learning_rate": 0.0002,
"loss": 0.1777,
"step": 10000
},
{
"epoch": 2.355135828378221,
"grad_norm": 0.2650957703590393,
"learning_rate": 0.0002,
"loss": 0.1606,
"step": 10100
},
{
"epoch": 2.3784540048968172,
"grad_norm": 0.2967466413974762,
"learning_rate": 0.0002,
"loss": 0.1626,
"step": 10200
},
{
"epoch": 2.401772181415413,
"grad_norm": 0.21177765727043152,
"learning_rate": 0.0002,
"loss": 0.1762,
"step": 10300
},
{
"epoch": 2.4250903579340095,
"grad_norm": 0.34562838077545166,
"learning_rate": 0.0002,
"loss": 0.1653,
"step": 10400
},
{
"epoch": 2.448408534452606,
"grad_norm": 0.2537182569503784,
"learning_rate": 0.0002,
"loss": 0.1722,
"step": 10500
},
{
"epoch": 2.4717267109712022,
"grad_norm": 0.22955211997032166,
"learning_rate": 0.0002,
"loss": 0.1713,
"step": 10600
},
{
"epoch": 2.495044887489798,
"grad_norm": 0.3709162175655365,
"learning_rate": 0.0002,
"loss": 0.1679,
"step": 10700
},
{
"epoch": 2.5183630640083945,
"grad_norm": 0.24581150710582733,
"learning_rate": 0.0002,
"loss": 0.1604,
"step": 10800
},
{
"epoch": 2.541681240526991,
"grad_norm": 0.20854513347148895,
"learning_rate": 0.0002,
"loss": 0.1687,
"step": 10900
},
{
"epoch": 2.5649994170455868,
"grad_norm": 0.2496633380651474,
"learning_rate": 0.0002,
"loss": 0.163,
"step": 11000
},
{
"epoch": 2.588317593564183,
"grad_norm": 0.23603980243206024,
"learning_rate": 0.0002,
"loss": 0.1748,
"step": 11100
},
{
"epoch": 2.6116357700827795,
"grad_norm": 0.36322489380836487,
"learning_rate": 0.0002,
"loss": 0.1798,
"step": 11200
},
{
"epoch": 2.634953946601376,
"grad_norm": 0.32981303334236145,
"learning_rate": 0.0002,
"loss": 0.1588,
"step": 11300
},
{
"epoch": 2.658272123119972,
"grad_norm": 0.4760492742061615,
"learning_rate": 0.0002,
"loss": 0.1723,
"step": 11400
},
{
"epoch": 2.681590299638568,
"grad_norm": 0.22435927391052246,
"learning_rate": 0.0002,
"loss": 0.1742,
"step": 11500
},
{
"epoch": 2.7049084761571645,
"grad_norm": 0.2695131003856659,
"learning_rate": 0.0002,
"loss": 0.1602,
"step": 11600
},
{
"epoch": 2.728226652675761,
"grad_norm": 0.16897708177566528,
"learning_rate": 0.0002,
"loss": 0.1698,
"step": 11700
},
{
"epoch": 2.751544829194357,
"grad_norm": 0.2540949881076813,
"learning_rate": 0.0002,
"loss": 0.1641,
"step": 11800
},
{
"epoch": 2.7748630057129535,
"grad_norm": 0.40854746103286743,
"learning_rate": 0.0002,
"loss": 0.1747,
"step": 11900
},
{
"epoch": 2.7981811822315494,
"grad_norm": 0.3012579679489136,
"learning_rate": 0.0002,
"loss": 0.1619,
"step": 12000
},
{
"epoch": 2.821499358750146,
"grad_norm": 0.18468593060970306,
"learning_rate": 0.0002,
"loss": 0.1686,
"step": 12100
},
{
"epoch": 2.844817535268742,
"grad_norm": 0.3668818175792694,
"learning_rate": 0.0002,
"loss": 0.1588,
"step": 12200
},
{
"epoch": 2.868135711787338,
"grad_norm": 0.5856422185897827,
"learning_rate": 0.0002,
"loss": 0.1784,
"step": 12300
},
{
"epoch": 2.8914538883059344,
"grad_norm": 0.37487712502479553,
"learning_rate": 0.0002,
"loss": 0.1701,
"step": 12400
},
{
"epoch": 2.9147720648245308,
"grad_norm": 0.29282090067863464,
"learning_rate": 0.0002,
"loss": 0.1613,
"step": 12500
},
{
"epoch": 2.938090241343127,
"grad_norm": 0.306607186794281,
"learning_rate": 0.0002,
"loss": 0.1655,
"step": 12600
},
{
"epoch": 2.9614084178617235,
"grad_norm": 0.1990358531475067,
"learning_rate": 0.0002,
"loss": 0.17,
"step": 12700
},
{
"epoch": 2.9847265943803194,
"grad_norm": 0.4855429232120514,
"learning_rate": 0.0002,
"loss": 0.1722,
"step": 12800
},
{
"epoch": 3.0080447708989158,
"grad_norm": 0.39795544743537903,
"learning_rate": 0.0002,
"loss": 0.1548,
"step": 12900
},
{
"epoch": 3.031362947417512,
"grad_norm": 0.3113553524017334,
"learning_rate": 0.0002,
"loss": 0.1396,
"step": 13000
},
{
"epoch": 3.054681123936108,
"grad_norm": 0.3086554706096649,
"learning_rate": 0.0002,
"loss": 0.1364,
"step": 13100
},
{
"epoch": 3.0779993004547044,
"grad_norm": 0.24818335473537445,
"learning_rate": 0.0002,
"loss": 0.1414,
"step": 13200
},
{
"epoch": 3.1013174769733007,
"grad_norm": 0.37954941391944885,
"learning_rate": 0.0002,
"loss": 0.1388,
"step": 13300
},
{
"epoch": 3.124635653491897,
"grad_norm": 0.2943727672100067,
"learning_rate": 0.0002,
"loss": 0.1408,
"step": 13400
},
{
"epoch": 3.147953830010493,
"grad_norm": 0.35590696334838867,
"learning_rate": 0.0002,
"loss": 0.1363,
"step": 13500
},
{
"epoch": 3.1712720065290894,
"grad_norm": 0.19578373432159424,
"learning_rate": 0.0002,
"loss": 0.137,
"step": 13600
},
{
"epoch": 3.1945901830476857,
"grad_norm": 0.25028303265571594,
"learning_rate": 0.0002,
"loss": 0.1348,
"step": 13700
},
{
"epoch": 3.217908359566282,
"grad_norm": 0.18405300378799438,
"learning_rate": 0.0002,
"loss": 0.1372,
"step": 13800
},
{
"epoch": 3.241226536084878,
"grad_norm": 0.31417056918144226,
"learning_rate": 0.0002,
"loss": 0.1428,
"step": 13900
},
{
"epoch": 3.2645447126034743,
"grad_norm": 0.22496923804283142,
"learning_rate": 0.0002,
"loss": 0.1378,
"step": 14000
},
{
"epoch": 3.2878628891220707,
"grad_norm": 0.23862232267856598,
"learning_rate": 0.0002,
"loss": 0.1362,
"step": 14100
},
{
"epoch": 3.311181065640667,
"grad_norm": 0.2142096310853958,
"learning_rate": 0.0002,
"loss": 0.139,
"step": 14200
},
{
"epoch": 3.334499242159263,
"grad_norm": 0.2794269025325775,
"learning_rate": 0.0002,
"loss": 0.1376,
"step": 14300
},
{
"epoch": 3.3578174186778593,
"grad_norm": 0.14498618245124817,
"learning_rate": 0.0002,
"loss": 0.1416,
"step": 14400
},
{
"epoch": 3.3811355951964557,
"grad_norm": 0.2895399332046509,
"learning_rate": 0.0002,
"loss": 0.1379,
"step": 14500
},
{
"epoch": 3.404453771715052,
"grad_norm": 0.2537992000579834,
"learning_rate": 0.0002,
"loss": 0.1356,
"step": 14600
},
{
"epoch": 3.427771948233648,
"grad_norm": 0.20395183563232422,
"learning_rate": 0.0002,
"loss": 0.1424,
"step": 14700
},
{
"epoch": 3.4510901247522443,
"grad_norm": 0.15283405780792236,
"learning_rate": 0.0002,
"loss": 0.1395,
"step": 14800
},
{
"epoch": 3.4744083012708407,
"grad_norm": 0.4268224537372589,
"learning_rate": 0.0002,
"loss": 0.1359,
"step": 14900
},
{
"epoch": 3.497726477789437,
"grad_norm": 0.22292669117450714,
"learning_rate": 0.0002,
"loss": 0.1386,
"step": 15000
},
{
"epoch": 3.5210446543080334,
"grad_norm": 0.11900927871465683,
"learning_rate": 0.0002,
"loss": 0.1442,
"step": 15100
},
{
"epoch": 3.5443628308266293,
"grad_norm": 0.45133286714553833,
"learning_rate": 0.0002,
"loss": 0.1365,
"step": 15200
},
{
"epoch": 3.5676810073452256,
"grad_norm": 0.30186957120895386,
"learning_rate": 0.0002,
"loss": 0.1416,
"step": 15300
},
{
"epoch": 3.590999183863822,
"grad_norm": 0.31408384442329407,
"learning_rate": 0.0002,
"loss": 0.1387,
"step": 15400
},
{
"epoch": 3.614317360382418,
"grad_norm": 0.36072710156440735,
"learning_rate": 0.0002,
"loss": 0.1428,
"step": 15500
},
{
"epoch": 3.6376355369010143,
"grad_norm": 0.28984448313713074,
"learning_rate": 0.0002,
"loss": 0.1393,
"step": 15600
},
{
"epoch": 3.6609537134196106,
"grad_norm": 0.2014656662940979,
"learning_rate": 0.0002,
"loss": 0.1435,
"step": 15700
},
{
"epoch": 3.684271889938207,
"grad_norm": 0.41273656487464905,
"learning_rate": 0.0002,
"loss": 0.1369,
"step": 15800
},
{
"epoch": 3.7075900664568033,
"grad_norm": 0.48672163486480713,
"learning_rate": 0.0002,
"loss": 0.1433,
"step": 15900
},
{
"epoch": 3.7309082429753992,
"grad_norm": 0.19120950996875763,
"learning_rate": 0.0002,
"loss": 0.1405,
"step": 16000
},
{
"epoch": 3.7542264194939956,
"grad_norm": 0.19792740046977997,
"learning_rate": 0.0002,
"loss": 0.1451,
"step": 16100
},
{
"epoch": 3.777544596012592,
"grad_norm": 0.14919213950634003,
"learning_rate": 0.0002,
"loss": 0.1382,
"step": 16200
},
{
"epoch": 3.800862772531188,
"grad_norm": 0.4650104343891144,
"learning_rate": 0.0002,
"loss": 0.1339,
"step": 16300
},
{
"epoch": 3.8241809490497842,
"grad_norm": 0.3627985417842865,
"learning_rate": 0.0002,
"loss": 0.1422,
"step": 16400
},
{
"epoch": 3.8474991255683806,
"grad_norm": 0.7782896161079407,
"learning_rate": 0.0002,
"loss": 0.1432,
"step": 16500
},
{
"epoch": 3.870817302086977,
"grad_norm": 0.2858645021915436,
"learning_rate": 0.0002,
"loss": 0.1413,
"step": 16600
},
{
"epoch": 3.8941354786055733,
"grad_norm": 0.22150644659996033,
"learning_rate": 0.0002,
"loss": 0.1437,
"step": 16700
},
{
"epoch": 3.917453655124169,
"grad_norm": 0.3596114218235016,
"learning_rate": 0.0002,
"loss": 0.1463,
"step": 16800
},
{
"epoch": 3.9407718316427656,
"grad_norm": 0.14949366450309753,
"learning_rate": 0.0002,
"loss": 0.1449,
"step": 16900
},
{
"epoch": 3.964090008161362,
"grad_norm": 0.32889851927757263,
"learning_rate": 0.0002,
"loss": 0.1396,
"step": 17000
},
{
"epoch": 3.987408184679958,
"grad_norm": 0.1940721869468689,
"learning_rate": 0.0002,
"loss": 0.14,
"step": 17100
},
{
"epoch": 4.010726361198555,
"grad_norm": 0.1328798085451126,
"learning_rate": 0.0002,
"loss": 0.1316,
"step": 17200
},
{
"epoch": 4.0340445377171505,
"grad_norm": 0.09979192912578583,
"learning_rate": 0.0002,
"loss": 0.1224,
"step": 17300
},
{
"epoch": 4.0573627142357465,
"grad_norm": 0.22828274965286255,
"learning_rate": 0.0002,
"loss": 0.1184,
"step": 17400
},
{
"epoch": 4.080680890754343,
"grad_norm": 0.1396108716726303,
"learning_rate": 0.0002,
"loss": 0.1189,
"step": 17500
},
{
"epoch": 4.103999067272939,
"grad_norm": 0.1849929839372635,
"learning_rate": 0.0002,
"loss": 0.1231,
"step": 17600
},
{
"epoch": 4.127317243791535,
"grad_norm": 0.14947502315044403,
"learning_rate": 0.0002,
"loss": 0.1158,
"step": 17700
},
{
"epoch": 4.150635420310132,
"grad_norm": 0.3471536934375763,
"learning_rate": 0.0002,
"loss": 0.1204,
"step": 17800
},
{
"epoch": 4.173953596828728,
"grad_norm": 0.23290419578552246,
"learning_rate": 0.0002,
"loss": 0.1175,
"step": 17900
},
{
"epoch": 4.197271773347325,
"grad_norm": 0.17477743327617645,
"learning_rate": 0.0002,
"loss": 0.1205,
"step": 18000
},
{
"epoch": 4.2205899498659205,
"grad_norm": 0.1214243695139885,
"learning_rate": 0.0002,
"loss": 0.1188,
"step": 18100
},
{
"epoch": 4.243908126384516,
"grad_norm": 0.12706777453422546,
"learning_rate": 0.0002,
"loss": 0.1196,
"step": 18200
},
{
"epoch": 4.267226302903113,
"grad_norm": 0.18115375936031342,
"learning_rate": 0.0002,
"loss": 0.1179,
"step": 18300
},
{
"epoch": 4.290544479421709,
"grad_norm": 0.05149231478571892,
"learning_rate": 0.0002,
"loss": 0.1224,
"step": 18400
},
{
"epoch": 4.313862655940305,
"grad_norm": 0.47274354100227356,
"learning_rate": 0.0002,
"loss": 0.1192,
"step": 18500
},
{
"epoch": 4.337180832458902,
"grad_norm": 0.218338742852211,
"learning_rate": 0.0002,
"loss": 0.1244,
"step": 18600
},
{
"epoch": 4.360499008977498,
"grad_norm": 0.1247347891330719,
"learning_rate": 0.0002,
"loss": 0.1267,
"step": 18700
},
{
"epoch": 4.383817185496095,
"grad_norm": 0.2586764991283417,
"learning_rate": 0.0002,
"loss": 0.1236,
"step": 18800
},
{
"epoch": 4.4071353620146905,
"grad_norm": 0.11474807560443878,
"learning_rate": 0.0002,
"loss": 0.1252,
"step": 18900
},
{
"epoch": 4.430453538533286,
"grad_norm": 0.34646329283714294,
"learning_rate": 0.0002,
"loss": 0.1237,
"step": 19000
},
{
"epoch": 4.453771715051883,
"grad_norm": 0.17445826530456543,
"learning_rate": 0.0002,
"loss": 0.1183,
"step": 19100
},
{
"epoch": 4.477089891570479,
"grad_norm": 0.3867531716823578,
"learning_rate": 0.0002,
"loss": 0.1248,
"step": 19200
},
{
"epoch": 4.500408068089076,
"grad_norm": 0.15927106142044067,
"learning_rate": 0.0002,
"loss": 0.1258,
"step": 19300
},
{
"epoch": 4.523726244607672,
"grad_norm": 0.2284346967935562,
"learning_rate": 0.0002,
"loss": 0.1244,
"step": 19400
},
{
"epoch": 4.547044421126268,
"grad_norm": 0.3231777250766754,
"learning_rate": 0.0002,
"loss": 0.1257,
"step": 19500
},
{
"epoch": 4.5703625976448645,
"grad_norm": 0.10116703063249588,
"learning_rate": 0.0002,
"loss": 0.1293,
"step": 19600
},
{
"epoch": 4.59368077416346,
"grad_norm": 0.2922173738479614,
"learning_rate": 0.0002,
"loss": 0.1262,
"step": 19700
},
{
"epoch": 4.616998950682056,
"grad_norm": 0.1958065629005432,
"learning_rate": 0.0002,
"loss": 0.1258,
"step": 19800
},
{
"epoch": 4.640317127200653,
"grad_norm": 0.08755222707986832,
"learning_rate": 0.0002,
"loss": 0.1293,
"step": 19900
},
{
"epoch": 4.663635303719249,
"grad_norm": 0.1416950523853302,
"learning_rate": 0.0002,
"loss": 0.1227,
"step": 20000
},
{
"epoch": 4.686953480237845,
"grad_norm": 0.21383579075336456,
"learning_rate": 0.0002,
"loss": 0.1272,
"step": 20100
},
{
"epoch": 4.710271656756442,
"grad_norm": 0.27910149097442627,
"learning_rate": 0.0002,
"loss": 0.1298,
"step": 20200
},
{
"epoch": 4.733589833275038,
"grad_norm": 0.07715137302875519,
"learning_rate": 0.0002,
"loss": 0.1266,
"step": 20300
},
{
"epoch": 4.7569080097936345,
"grad_norm": 0.08127077668905258,
"learning_rate": 0.0002,
"loss": 0.1269,
"step": 20400
},
{
"epoch": 4.78022618631223,
"grad_norm": 0.3075973391532898,
"learning_rate": 0.0002,
"loss": 0.1308,
"step": 20500
},
{
"epoch": 4.803544362830826,
"grad_norm": 0.23989351093769073,
"learning_rate": 0.0002,
"loss": 0.1217,
"step": 20600
},
{
"epoch": 4.826862539349423,
"grad_norm": 0.1361120343208313,
"learning_rate": 0.0002,
"loss": 0.1237,
"step": 20700
},
{
"epoch": 4.850180715868019,
"grad_norm": 0.3711351156234741,
"learning_rate": 0.0002,
"loss": 0.1248,
"step": 20800
},
{
"epoch": 4.873498892386616,
"grad_norm": 0.3196912109851837,
"learning_rate": 0.0002,
"loss": 0.1236,
"step": 20900
},
{
"epoch": 4.896817068905212,
"grad_norm": 0.10089880973100662,
"learning_rate": 0.0002,
"loss": 0.1248,
"step": 21000
}
],
"logging_steps": 100,
"max_steps": 21440,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 3000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.9056254817400013e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
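
Note (not part of the original upload): the JSON above appears to be the state file written by the Hugging Face transformers Trainer at checkpoint time. log_history holds one entry every logging_steps (100) optimizer steps with the running training loss, gradient norm, and a constant 2e-4 learning rate, up to global_step 21000 of max_steps 21440 (5 epochs). Below is a minimal sketch of how such a file could be inspected offline; it assumes the file is saved locally as trainer_state.json, and the path and summary choices are illustrative only.

import json
from collections import defaultdict

# Load the trainer state saved alongside a checkpoint (assumed local path).
with open("trainer_state.json", encoding="utf-8") as f:
    state = json.load(f)

print(f"global_step={state['global_step']}  "
      f"epoch={state['epoch']:.3f}  max_steps={state['max_steps']}")

# Each log_history entry carries epoch, grad_norm, learning_rate, loss, step.
per_epoch = defaultdict(list)
for entry in state["log_history"]:
    per_epoch[int(entry["epoch"])].append(entry["loss"])

# Average the logged training loss within each whole epoch to see the trend.
for epoch, losses in sorted(per_epoch.items()):
    print(f"epoch {epoch}: mean logged loss {sum(losses) / len(losses):.4f} "
          f"over {len(losses)} logging points")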