mixtral-dna-v0.2 / trainer_state.json
{
"best_metric": 6.209214687347412,
"best_model_checkpoint": "./results/models/checkpoint-104305",
"epoch": 24.0,
"eval_steps": 500,
"global_step": 108840,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.11,
"learning_rate": 0.0019955898566703417,
"loss": 6.7688,
"step": 500
},
{
"epoch": 0.22,
"learning_rate": 0.001991179713340684,
"loss": 6.6804,
"step": 1000
},
{
"epoch": 0.33,
"learning_rate": 0.001986769570011025,
"loss": 6.6645,
"step": 1500
},
{
"epoch": 0.44,
"learning_rate": 0.0019823594266813673,
"loss": 6.6495,
"step": 2000
},
{
"epoch": 0.55,
"learning_rate": 0.001977949283351709,
"loss": 6.6246,
"step": 2500
},
{
"epoch": 0.66,
"learning_rate": 0.0019735391400220507,
"loss": 6.5823,
"step": 3000
},
{
"epoch": 0.77,
"learning_rate": 0.001969128996692393,
"loss": 6.5645,
"step": 3500
},
{
"epoch": 0.88,
"learning_rate": 0.001964718853362734,
"loss": 6.5409,
"step": 4000
},
{
"epoch": 0.99,
"learning_rate": 0.001960308710033076,
"loss": 6.5185,
"step": 4500
},
{
"epoch": 1.0,
"eval_loss": 6.5362162590026855,
"eval_runtime": 3.2086,
"eval_samples_per_second": 90.695,
"eval_steps_per_second": 1.558,
"step": 4535
},
{
"epoch": 1.1,
"learning_rate": 0.001955898566703418,
"loss": 6.501,
"step": 5000
},
{
"epoch": 1.21,
"learning_rate": 0.0019514884233737598,
"loss": 6.5008,
"step": 5500
},
{
"epoch": 1.32,
"learning_rate": 0.0019470782800441013,
"loss": 6.5114,
"step": 6000
},
{
"epoch": 1.43,
"learning_rate": 0.0019426681367144432,
"loss": 6.4978,
"step": 6500
},
{
"epoch": 1.54,
"learning_rate": 0.0019382579933847851,
"loss": 6.5002,
"step": 7000
},
{
"epoch": 1.65,
"learning_rate": 0.0019338478500551268,
"loss": 6.4791,
"step": 7500
},
{
"epoch": 1.76,
"learning_rate": 0.0019294377067254687,
"loss": 6.4779,
"step": 8000
},
{
"epoch": 1.87,
"learning_rate": 0.0019250275633958104,
"loss": 6.4694,
"step": 8500
},
{
"epoch": 1.98,
"learning_rate": 0.0019206174200661521,
"loss": 6.466,
"step": 9000
},
{
"epoch": 2.0,
"eval_loss": 6.482348918914795,
"eval_runtime": 3.1798,
"eval_samples_per_second": 91.514,
"eval_steps_per_second": 1.572,
"step": 9070
},
{
"epoch": 2.09,
"learning_rate": 0.001916207276736494,
"loss": 6.4515,
"step": 9500
},
{
"epoch": 2.21,
"learning_rate": 0.0019117971334068357,
"loss": 6.4551,
"step": 10000
},
{
"epoch": 2.32,
"learning_rate": 0.0019073869900771776,
"loss": 6.444,
"step": 10500
},
{
"epoch": 2.43,
"learning_rate": 0.0019029768467475193,
"loss": 6.4236,
"step": 11000
},
{
"epoch": 2.54,
"learning_rate": 0.001898566703417861,
"loss": 6.4277,
"step": 11500
},
{
"epoch": 2.65,
"learning_rate": 0.001894156560088203,
"loss": 6.4185,
"step": 12000
},
{
"epoch": 2.76,
"learning_rate": 0.0018897464167585446,
"loss": 6.4164,
"step": 12500
},
{
"epoch": 2.87,
"learning_rate": 0.0018853362734288866,
"loss": 6.4177,
"step": 13000
},
{
"epoch": 2.98,
"learning_rate": 0.0018809261300992283,
"loss": 6.4135,
"step": 13500
},
{
"epoch": 3.0,
"eval_loss": 6.432834625244141,
"eval_runtime": 3.1903,
"eval_samples_per_second": 91.214,
"eval_steps_per_second": 1.567,
"step": 13605
},
{
"epoch": 3.09,
"learning_rate": 0.00187651598676957,
"loss": 6.4072,
"step": 14000
},
{
"epoch": 3.2,
"learning_rate": 0.0018721058434399119,
"loss": 6.3986,
"step": 14500
},
{
"epoch": 3.31,
"learning_rate": 0.0018676957001102538,
"loss": 6.3877,
"step": 15000
},
{
"epoch": 3.42,
"learning_rate": 0.0018632855567805953,
"loss": 6.3881,
"step": 15500
},
{
"epoch": 3.53,
"learning_rate": 0.0018588754134509372,
"loss": 6.3853,
"step": 16000
},
{
"epoch": 3.64,
"learning_rate": 0.0018544652701212789,
"loss": 6.3969,
"step": 16500
},
{
"epoch": 3.75,
"learning_rate": 0.0018500551267916208,
"loss": 6.3868,
"step": 17000
},
{
"epoch": 3.86,
"learning_rate": 0.0018456449834619627,
"loss": 6.3914,
"step": 17500
},
{
"epoch": 3.97,
"learning_rate": 0.0018412348401323042,
"loss": 6.3761,
"step": 18000
},
{
"epoch": 4.0,
"eval_loss": 6.406026840209961,
"eval_runtime": 3.1898,
"eval_samples_per_second": 91.228,
"eval_steps_per_second": 1.567,
"step": 18140
},
{
"epoch": 4.08,
"learning_rate": 0.001836824696802646,
"loss": 6.3767,
"step": 18500
},
{
"epoch": 4.19,
"learning_rate": 0.001832414553472988,
"loss": 6.3771,
"step": 19000
},
{
"epoch": 4.3,
"learning_rate": 0.0018280044101433297,
"loss": 6.3629,
"step": 19500
},
{
"epoch": 4.41,
"learning_rate": 0.0018235942668136716,
"loss": 6.3658,
"step": 20000
},
{
"epoch": 4.52,
"learning_rate": 0.0018191841234840131,
"loss": 6.3683,
"step": 20500
},
{
"epoch": 4.63,
"learning_rate": 0.001814773980154355,
"loss": 6.3679,
"step": 21000
},
{
"epoch": 4.74,
"learning_rate": 0.001810363836824697,
"loss": 6.3651,
"step": 21500
},
{
"epoch": 4.85,
"learning_rate": 0.0018059536934950386,
"loss": 6.3492,
"step": 22000
},
{
"epoch": 4.96,
"learning_rate": 0.0018015435501653806,
"loss": 6.3456,
"step": 22500
},
{
"epoch": 5.0,
"eval_loss": 6.364647388458252,
"eval_runtime": 3.181,
"eval_samples_per_second": 91.481,
"eval_steps_per_second": 1.572,
"step": 22675
},
{
"epoch": 5.07,
"learning_rate": 0.001797133406835722,
"loss": 6.3347,
"step": 23000
},
{
"epoch": 5.18,
"learning_rate": 0.001792723263506064,
"loss": 6.3421,
"step": 23500
},
{
"epoch": 5.29,
"learning_rate": 0.0017883131201764059,
"loss": 6.3458,
"step": 24000
},
{
"epoch": 5.4,
"learning_rate": 0.0017839029768467476,
"loss": 6.34,
"step": 24500
},
{
"epoch": 5.51,
"learning_rate": 0.0017794928335170893,
"loss": 6.3494,
"step": 25000
},
{
"epoch": 5.62,
"learning_rate": 0.0017750826901874312,
"loss": 6.3306,
"step": 25500
},
{
"epoch": 5.73,
"learning_rate": 0.0017706725468577729,
"loss": 6.3336,
"step": 26000
},
{
"epoch": 5.84,
"learning_rate": 0.0017662624035281148,
"loss": 6.3347,
"step": 26500
},
{
"epoch": 5.95,
"learning_rate": 0.0017618522601984565,
"loss": 6.3298,
"step": 27000
},
{
"epoch": 6.0,
"eval_loss": 6.354611396789551,
"eval_runtime": 3.181,
"eval_samples_per_second": 91.481,
"eval_steps_per_second": 1.572,
"step": 27210
},
{
"epoch": 6.06,
"learning_rate": 0.0017574421168687982,
"loss": 6.3297,
"step": 27500
},
{
"epoch": 6.17,
"learning_rate": 0.00175303197353914,
"loss": 6.3288,
"step": 28000
},
{
"epoch": 6.28,
"learning_rate": 0.0017486218302094818,
"loss": 6.3304,
"step": 28500
},
{
"epoch": 6.39,
"learning_rate": 0.0017442116868798237,
"loss": 6.3151,
"step": 29000
},
{
"epoch": 6.5,
"learning_rate": 0.0017398015435501656,
"loss": 6.3186,
"step": 29500
},
{
"epoch": 6.62,
"learning_rate": 0.001735391400220507,
"loss": 6.3194,
"step": 30000
},
{
"epoch": 6.73,
"learning_rate": 0.001730981256890849,
"loss": 6.3115,
"step": 30500
},
{
"epoch": 6.84,
"learning_rate": 0.0017265711135611907,
"loss": 6.3191,
"step": 31000
},
{
"epoch": 6.95,
"learning_rate": 0.0017221609702315326,
"loss": 6.3108,
"step": 31500
},
{
"epoch": 7.0,
"eval_loss": 6.336333274841309,
"eval_runtime": 3.1987,
"eval_samples_per_second": 90.975,
"eval_steps_per_second": 1.563,
"step": 31745
},
{
"epoch": 7.06,
"learning_rate": 0.0017177508269018745,
"loss": 6.3078,
"step": 32000
},
{
"epoch": 7.17,
"learning_rate": 0.001713340683572216,
"loss": 6.3046,
"step": 32500
},
{
"epoch": 7.28,
"learning_rate": 0.001708930540242558,
"loss": 6.3013,
"step": 33000
},
{
"epoch": 7.39,
"learning_rate": 0.0017045203969128996,
"loss": 6.3082,
"step": 33500
},
{
"epoch": 7.5,
"learning_rate": 0.0017001102535832415,
"loss": 6.3015,
"step": 34000
},
{
"epoch": 7.61,
"learning_rate": 0.0016957001102535832,
"loss": 6.2922,
"step": 34500
},
{
"epoch": 7.72,
"learning_rate": 0.001691289966923925,
"loss": 6.2995,
"step": 35000
},
{
"epoch": 7.83,
"learning_rate": 0.0016868798235942669,
"loss": 6.302,
"step": 35500
},
{
"epoch": 7.94,
"learning_rate": 0.0016824696802646088,
"loss": 6.3015,
"step": 36000
},
{
"epoch": 8.0,
"eval_loss": 6.320508003234863,
"eval_runtime": 3.1701,
"eval_samples_per_second": 91.795,
"eval_steps_per_second": 1.577,
"step": 36280
},
{
"epoch": 8.05,
"learning_rate": 0.0016780595369349505,
"loss": 6.2848,
"step": 36500
},
{
"epoch": 8.16,
"learning_rate": 0.0016736493936052922,
"loss": 6.297,
"step": 37000
},
{
"epoch": 8.27,
"learning_rate": 0.0016692392502756339,
"loss": 6.2874,
"step": 37500
},
{
"epoch": 8.38,
"learning_rate": 0.0016648291069459758,
"loss": 6.2883,
"step": 38000
},
{
"epoch": 8.49,
"learning_rate": 0.0016604189636163177,
"loss": 6.2775,
"step": 38500
},
{
"epoch": 8.6,
"learning_rate": 0.0016560088202866594,
"loss": 6.2791,
"step": 39000
},
{
"epoch": 8.71,
"learning_rate": 0.001651598676957001,
"loss": 6.2816,
"step": 39500
},
{
"epoch": 8.82,
"learning_rate": 0.0016471885336273428,
"loss": 6.2744,
"step": 40000
},
{
"epoch": 8.93,
"learning_rate": 0.0016427783902976847,
"loss": 6.2721,
"step": 40500
},
{
"epoch": 9.0,
"eval_loss": 6.303074836730957,
"eval_runtime": 3.1711,
"eval_samples_per_second": 91.767,
"eval_steps_per_second": 1.577,
"step": 40815
},
{
"epoch": 9.04,
"learning_rate": 0.0016383682469680266,
"loss": 6.276,
"step": 41000
},
{
"epoch": 9.15,
"learning_rate": 0.0016339581036383683,
"loss": 6.2713,
"step": 41500
},
{
"epoch": 9.26,
"learning_rate": 0.00162954796030871,
"loss": 6.2678,
"step": 42000
},
{
"epoch": 9.37,
"learning_rate": 0.001625137816979052,
"loss": 6.2672,
"step": 42500
},
{
"epoch": 9.48,
"learning_rate": 0.0016207276736493936,
"loss": 6.2596,
"step": 43000
},
{
"epoch": 9.59,
"learning_rate": 0.0016163175303197355,
"loss": 6.2705,
"step": 43500
},
{
"epoch": 9.7,
"learning_rate": 0.001611907386990077,
"loss": 6.2771,
"step": 44000
},
{
"epoch": 9.81,
"learning_rate": 0.001607497243660419,
"loss": 6.2601,
"step": 44500
},
{
"epoch": 9.92,
"learning_rate": 0.0016030871003307608,
"loss": 6.2657,
"step": 45000
},
{
"epoch": 10.0,
"eval_loss": 6.291815280914307,
"eval_runtime": 3.1834,
"eval_samples_per_second": 91.412,
"eval_steps_per_second": 1.571,
"step": 45350
},
{
"epoch": 10.03,
"learning_rate": 0.0015986769570011025,
"loss": 6.264,
"step": 45500
},
{
"epoch": 10.14,
"learning_rate": 0.0015942668136714445,
"loss": 6.2611,
"step": 46000
},
{
"epoch": 10.25,
"learning_rate": 0.0015898566703417862,
"loss": 6.2654,
"step": 46500
},
{
"epoch": 10.36,
"learning_rate": 0.0015854465270121279,
"loss": 6.258,
"step": 47000
},
{
"epoch": 10.47,
"learning_rate": 0.0015810363836824698,
"loss": 6.2543,
"step": 47500
},
{
"epoch": 10.58,
"learning_rate": 0.0015766262403528115,
"loss": 6.2513,
"step": 48000
},
{
"epoch": 10.69,
"learning_rate": 0.0015722160970231534,
"loss": 6.2556,
"step": 48500
},
{
"epoch": 10.8,
"learning_rate": 0.001567805953693495,
"loss": 6.2581,
"step": 49000
},
{
"epoch": 10.92,
"learning_rate": 0.0015633958103638368,
"loss": 6.2431,
"step": 49500
},
{
"epoch": 11.0,
"eval_loss": 6.28529691696167,
"eval_runtime": 3.1715,
"eval_samples_per_second": 91.755,
"eval_steps_per_second": 1.577,
"step": 49885
},
{
"epoch": 11.03,
"learning_rate": 0.0015589856670341787,
"loss": 6.2599,
"step": 50000
},
{
"epoch": 11.14,
"learning_rate": 0.0015545755237045204,
"loss": 6.2521,
"step": 50500
},
{
"epoch": 11.25,
"learning_rate": 0.0015501653803748623,
"loss": 6.2548,
"step": 51000
},
{
"epoch": 11.36,
"learning_rate": 0.001545755237045204,
"loss": 6.2448,
"step": 51500
},
{
"epoch": 11.47,
"learning_rate": 0.0015413450937155457,
"loss": 6.2482,
"step": 52000
},
{
"epoch": 11.58,
"learning_rate": 0.0015369349503858876,
"loss": 6.2521,
"step": 52500
},
{
"epoch": 11.69,
"learning_rate": 0.0015325248070562295,
"loss": 6.2511,
"step": 53000
},
{
"epoch": 11.8,
"learning_rate": 0.001528114663726571,
"loss": 6.2509,
"step": 53500
},
{
"epoch": 11.91,
"learning_rate": 0.001523704520396913,
"loss": 6.2467,
"step": 54000
},
{
"epoch": 12.0,
"eval_loss": 6.272589206695557,
"eval_runtime": 3.2247,
"eval_samples_per_second": 90.241,
"eval_steps_per_second": 1.551,
"step": 54420
},
{
"epoch": 12.02,
"learning_rate": 0.0015192943770672546,
"loss": 6.2525,
"step": 54500
},
{
"epoch": 12.13,
"learning_rate": 0.0015148842337375965,
"loss": 6.236,
"step": 55000
},
{
"epoch": 12.24,
"learning_rate": 0.0015104740904079384,
"loss": 6.2487,
"step": 55500
},
{
"epoch": 12.35,
"learning_rate": 0.00150606394707828,
"loss": 6.2375,
"step": 56000
},
{
"epoch": 12.46,
"learning_rate": 0.0015016538037486218,
"loss": 6.2375,
"step": 56500
},
{
"epoch": 12.57,
"learning_rate": 0.0014972436604189638,
"loss": 6.2414,
"step": 57000
},
{
"epoch": 12.68,
"learning_rate": 0.0014928335170893055,
"loss": 6.2343,
"step": 57500
},
{
"epoch": 12.79,
"learning_rate": 0.0014884233737596474,
"loss": 6.225,
"step": 58000
},
{
"epoch": 12.9,
"learning_rate": 0.0014840132304299888,
"loss": 6.2336,
"step": 58500
},
{
"epoch": 13.0,
"eval_loss": 6.260488033294678,
"eval_runtime": 3.1929,
"eval_samples_per_second": 91.14,
"eval_steps_per_second": 1.566,
"step": 58955
},
{
"epoch": 13.01,
"learning_rate": 0.0014796030871003308,
"loss": 6.2281,
"step": 59000
},
{
"epoch": 13.12,
"learning_rate": 0.0014751929437706727,
"loss": 6.2288,
"step": 59500
},
{
"epoch": 13.23,
"learning_rate": 0.0014707828004410144,
"loss": 6.2271,
"step": 60000
},
{
"epoch": 13.34,
"learning_rate": 0.0014663726571113563,
"loss": 6.232,
"step": 60500
},
{
"epoch": 13.45,
"learning_rate": 0.0014619625137816978,
"loss": 6.2307,
"step": 61000
},
{
"epoch": 13.56,
"learning_rate": 0.0014575523704520397,
"loss": 6.2294,
"step": 61500
},
{
"epoch": 13.67,
"learning_rate": 0.0014531422271223816,
"loss": 6.2215,
"step": 62000
},
{
"epoch": 13.78,
"learning_rate": 0.0014487320837927233,
"loss": 6.2218,
"step": 62500
},
{
"epoch": 13.89,
"learning_rate": 0.001444321940463065,
"loss": 6.2279,
"step": 63000
},
{
"epoch": 14.0,
"eval_loss": 6.25548791885376,
"eval_runtime": 3.2087,
"eval_samples_per_second": 90.691,
"eval_steps_per_second": 1.558,
"step": 63490
},
{
"epoch": 14.0,
"learning_rate": 0.001439911797133407,
"loss": 6.2282,
"step": 63500
},
{
"epoch": 14.11,
"learning_rate": 0.0014355016538037486,
"loss": 6.2224,
"step": 64000
},
{
"epoch": 14.22,
"learning_rate": 0.0014310915104740905,
"loss": 6.2242,
"step": 64500
},
{
"epoch": 14.33,
"learning_rate": 0.0014266813671444322,
"loss": 6.2338,
"step": 65000
},
{
"epoch": 14.44,
"learning_rate": 0.001422271223814774,
"loss": 6.2136,
"step": 65500
},
{
"epoch": 14.55,
"learning_rate": 0.0014178610804851158,
"loss": 6.2176,
"step": 66000
},
{
"epoch": 14.66,
"learning_rate": 0.0014134509371554575,
"loss": 6.2128,
"step": 66500
},
{
"epoch": 14.77,
"learning_rate": 0.0014090407938257994,
"loss": 6.2227,
"step": 67000
},
{
"epoch": 14.88,
"learning_rate": 0.0014046306504961414,
"loss": 6.2145,
"step": 67500
},
{
"epoch": 14.99,
"learning_rate": 0.0014002205071664828,
"loss": 6.2222,
"step": 68000
},
{
"epoch": 15.0,
"eval_loss": 6.248010158538818,
"eval_runtime": 3.1977,
"eval_samples_per_second": 91.003,
"eval_steps_per_second": 1.564,
"step": 68025
},
{
"epoch": 15.1,
"learning_rate": 0.0013958103638368248,
"loss": 6.2192,
"step": 68500
},
{
"epoch": 15.21,
"learning_rate": 0.0013914002205071664,
"loss": 6.2112,
"step": 69000
},
{
"epoch": 15.33,
"learning_rate": 0.0013869900771775084,
"loss": 6.2227,
"step": 69500
},
{
"epoch": 15.44,
"learning_rate": 0.0013825799338478503,
"loss": 6.2186,
"step": 70000
},
{
"epoch": 15.55,
"learning_rate": 0.0013781697905181918,
"loss": 6.2133,
"step": 70500
},
{
"epoch": 15.66,
"learning_rate": 0.0013737596471885337,
"loss": 6.2115,
"step": 71000
},
{
"epoch": 15.77,
"learning_rate": 0.0013693495038588754,
"loss": 6.2185,
"step": 71500
},
{
"epoch": 15.88,
"learning_rate": 0.0013649393605292173,
"loss": 6.212,
"step": 72000
},
{
"epoch": 15.99,
"learning_rate": 0.001360529217199559,
"loss": 6.2089,
"step": 72500
},
{
"epoch": 16.0,
"eval_loss": 6.242463111877441,
"eval_runtime": 3.2691,
"eval_samples_per_second": 89.016,
"eval_steps_per_second": 1.529,
"step": 72560
},
{
"epoch": 16.1,
"learning_rate": 0.0013561190738699007,
"loss": 6.2139,
"step": 73000
},
{
"epoch": 16.21,
"learning_rate": 0.0013517089305402426,
"loss": 6.2089,
"step": 73500
},
{
"epoch": 16.32,
"learning_rate": 0.0013472987872105845,
"loss": 6.2019,
"step": 74000
},
{
"epoch": 16.43,
"learning_rate": 0.0013428886438809262,
"loss": 6.1961,
"step": 74500
},
{
"epoch": 16.54,
"learning_rate": 0.001338478500551268,
"loss": 6.2156,
"step": 75000
},
{
"epoch": 16.65,
"learning_rate": 0.0013340683572216096,
"loss": 6.2056,
"step": 75500
},
{
"epoch": 16.76,
"learning_rate": 0.0013296582138919515,
"loss": 6.2078,
"step": 76000
},
{
"epoch": 16.87,
"learning_rate": 0.0013252480705622934,
"loss": 6.2113,
"step": 76500
},
{
"epoch": 16.98,
"learning_rate": 0.0013208379272326351,
"loss": 6.2133,
"step": 77000
},
{
"epoch": 17.0,
"eval_loss": 6.236937046051025,
"eval_runtime": 3.2084,
"eval_samples_per_second": 90.7,
"eval_steps_per_second": 1.558,
"step": 77095
},
{
"epoch": 17.09,
"learning_rate": 0.0013164277839029768,
"loss": 6.2033,
"step": 77500
},
{
"epoch": 17.2,
"learning_rate": 0.0013120176405733185,
"loss": 6.2032,
"step": 78000
},
{
"epoch": 17.31,
"learning_rate": 0.0013076074972436604,
"loss": 6.2016,
"step": 78500
},
{
"epoch": 17.42,
"learning_rate": 0.0013031973539140024,
"loss": 6.2063,
"step": 79000
},
{
"epoch": 17.53,
"learning_rate": 0.001298787210584344,
"loss": 6.2016,
"step": 79500
},
{
"epoch": 17.64,
"learning_rate": 0.0012943770672546857,
"loss": 6.1975,
"step": 80000
},
{
"epoch": 17.75,
"learning_rate": 0.0012899669239250277,
"loss": 6.1994,
"step": 80500
},
{
"epoch": 17.86,
"learning_rate": 0.0012855567805953694,
"loss": 6.1992,
"step": 81000
},
{
"epoch": 17.97,
"learning_rate": 0.0012811466372657113,
"loss": 6.1978,
"step": 81500
},
{
"epoch": 18.0,
"eval_loss": 6.230894088745117,
"eval_runtime": 3.1986,
"eval_samples_per_second": 90.977,
"eval_steps_per_second": 1.563,
"step": 81630
},
{
"epoch": 18.08,
"learning_rate": 0.0012767364939360528,
"loss": 6.1995,
"step": 82000
},
{
"epoch": 18.19,
"learning_rate": 0.0012723263506063947,
"loss": 6.2002,
"step": 82500
},
{
"epoch": 18.3,
"learning_rate": 0.0012679162072767366,
"loss": 6.1985,
"step": 83000
},
{
"epoch": 18.41,
"learning_rate": 0.0012635060639470783,
"loss": 6.1986,
"step": 83500
},
{
"epoch": 18.52,
"learning_rate": 0.0012590959206174202,
"loss": 6.1846,
"step": 84000
},
{
"epoch": 18.63,
"learning_rate": 0.0012546857772877619,
"loss": 6.1968,
"step": 84500
},
{
"epoch": 18.74,
"learning_rate": 0.0012502756339581036,
"loss": 6.1935,
"step": 85000
},
{
"epoch": 18.85,
"learning_rate": 0.0012458654906284455,
"loss": 6.1969,
"step": 85500
},
{
"epoch": 18.96,
"learning_rate": 0.0012414553472987872,
"loss": 6.1936,
"step": 86000
},
{
"epoch": 19.0,
"eval_loss": 6.223233699798584,
"eval_runtime": 3.2991,
"eval_samples_per_second": 88.205,
"eval_steps_per_second": 1.516,
"step": 86165
},
{
"epoch": 19.07,
"learning_rate": 0.0012370452039691291,
"loss": 6.1966,
"step": 86500
},
{
"epoch": 19.18,
"learning_rate": 0.0012326350606394708,
"loss": 6.1873,
"step": 87000
},
{
"epoch": 19.29,
"learning_rate": 0.0012282249173098125,
"loss": 6.1899,
"step": 87500
},
{
"epoch": 19.4,
"learning_rate": 0.0012238147739801544,
"loss": 6.1917,
"step": 88000
},
{
"epoch": 19.51,
"learning_rate": 0.0012194046306504961,
"loss": 6.1894,
"step": 88500
},
{
"epoch": 19.63,
"learning_rate": 0.001214994487320838,
"loss": 6.196,
"step": 89000
},
{
"epoch": 19.74,
"learning_rate": 0.0012105843439911797,
"loss": 6.186,
"step": 89500
},
{
"epoch": 19.85,
"learning_rate": 0.0012061742006615214,
"loss": 6.1871,
"step": 90000
},
{
"epoch": 19.96,
"learning_rate": 0.0012017640573318633,
"loss": 6.1913,
"step": 90500
},
{
"epoch": 20.0,
"eval_loss": 6.2211713790893555,
"eval_runtime": 3.2092,
"eval_samples_per_second": 90.677,
"eval_steps_per_second": 1.558,
"step": 90700
},
{
"epoch": 20.07,
"learning_rate": 0.0011973539140022053,
"loss": 6.1844,
"step": 91000
},
{
"epoch": 20.18,
"learning_rate": 0.0011929437706725467,
"loss": 6.1876,
"step": 91500
},
{
"epoch": 20.29,
"learning_rate": 0.0011885336273428887,
"loss": 6.1959,
"step": 92000
},
{
"epoch": 20.4,
"learning_rate": 0.0011841234840132304,
"loss": 6.191,
"step": 92500
},
{
"epoch": 20.51,
"learning_rate": 0.0011797133406835723,
"loss": 6.1884,
"step": 93000
},
{
"epoch": 20.62,
"learning_rate": 0.0011753031973539142,
"loss": 6.1791,
"step": 93500
},
{
"epoch": 20.73,
"learning_rate": 0.0011708930540242557,
"loss": 6.188,
"step": 94000
},
{
"epoch": 20.84,
"learning_rate": 0.0011664829106945976,
"loss": 6.1839,
"step": 94500
},
{
"epoch": 20.95,
"learning_rate": 0.0011620727673649395,
"loss": 6.18,
"step": 95000
},
{
"epoch": 21.0,
"eval_loss": 6.215102195739746,
"eval_runtime": 3.1993,
"eval_samples_per_second": 90.959,
"eval_steps_per_second": 1.563,
"step": 95235
},
{
"epoch": 21.06,
"learning_rate": 0.0011576626240352812,
"loss": 6.1761,
"step": 95500
},
{
"epoch": 21.17,
"learning_rate": 0.001153252480705623,
"loss": 6.1808,
"step": 96000
},
{
"epoch": 21.28,
"learning_rate": 0.0011488423373759646,
"loss": 6.1794,
"step": 96500
},
{
"epoch": 21.39,
"learning_rate": 0.0011444321940463065,
"loss": 6.1841,
"step": 97000
},
{
"epoch": 21.5,
"learning_rate": 0.0011400220507166484,
"loss": 6.183,
"step": 97500
},
{
"epoch": 21.61,
"learning_rate": 0.0011356119073869901,
"loss": 6.1844,
"step": 98000
},
{
"epoch": 21.72,
"learning_rate": 0.001131201764057332,
"loss": 6.1853,
"step": 98500
},
{
"epoch": 21.83,
"learning_rate": 0.0011267916207276735,
"loss": 6.1764,
"step": 99000
},
{
"epoch": 21.94,
"learning_rate": 0.0011223814773980154,
"loss": 6.1855,
"step": 99500
},
{
"epoch": 22.0,
"eval_loss": 6.212313175201416,
"eval_runtime": 3.1975,
"eval_samples_per_second": 91.009,
"eval_steps_per_second": 1.564,
"step": 99770
},
{
"epoch": 22.05,
"learning_rate": 0.0011179713340683573,
"loss": 6.1747,
"step": 100000
},
{
"epoch": 22.16,
"learning_rate": 0.001113561190738699,
"loss": 6.1786,
"step": 100500
},
{
"epoch": 22.27,
"learning_rate": 0.0011091510474090407,
"loss": 6.1784,
"step": 101000
},
{
"epoch": 22.38,
"learning_rate": 0.0011047409040793826,
"loss": 6.1798,
"step": 101500
},
{
"epoch": 22.49,
"learning_rate": 0.0011003307607497243,
"loss": 6.1793,
"step": 102000
},
{
"epoch": 22.6,
"learning_rate": 0.0010959206174200663,
"loss": 6.1775,
"step": 102500
},
{
"epoch": 22.71,
"learning_rate": 0.001091510474090408,
"loss": 6.1778,
"step": 103000
},
{
"epoch": 22.82,
"learning_rate": 0.0010871003307607497,
"loss": 6.1798,
"step": 103500
},
{
"epoch": 22.93,
"learning_rate": 0.0010826901874310916,
"loss": 6.1758,
"step": 104000
},
{
"epoch": 23.0,
"eval_loss": 6.209214687347412,
"eval_runtime": 3.2078,
"eval_samples_per_second": 90.718,
"eval_steps_per_second": 1.559,
"step": 104305
},
{
"epoch": 23.04,
"learning_rate": 0.0010782800441014333,
"loss": 6.1856,
"step": 104500
},
{
"epoch": 23.15,
"learning_rate": 0.0010738699007717752,
"loss": 6.1849,
"step": 105000
},
{
"epoch": 23.26,
"learning_rate": 0.001069459757442117,
"loss": 6.179,
"step": 105500
},
{
"epoch": 23.37,
"learning_rate": 0.0010650496141124586,
"loss": 6.1804,
"step": 106000
},
{
"epoch": 23.48,
"learning_rate": 0.0010606394707828005,
"loss": 6.1782,
"step": 106500
},
{
"epoch": 23.59,
"learning_rate": 0.0010562293274531422,
"loss": 6.1745,
"step": 107000
},
{
"epoch": 23.7,
"learning_rate": 0.001051819184123484,
"loss": 6.183,
"step": 107500
},
{
"epoch": 23.81,
"learning_rate": 0.001047409040793826,
"loss": 6.1828,
"step": 108000
},
{
"epoch": 23.93,
"learning_rate": 0.0010429988974641675,
"loss": 6.18,
"step": 108500
},
{
"epoch": 24.0,
"eval_loss": 6.213593006134033,
"eval_runtime": 3.2056,
"eval_samples_per_second": 90.78,
"eval_steps_per_second": 1.56,
"step": 108840
}
],
"logging_steps": 500,
"max_steps": 226750,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 500,
"total_flos": 1.23831193938535e+19,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}
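
A minimal sketch (not part of the original file) of how one might read a trainer_state.json like the one above and plot its loss curves. It assumes the file sits in the current working directory and that matplotlib is available; the filename and output path are illustrative, not taken from the repository.

# Load a trainer_state.json written by the Hugging Face Trainer and plot
# the training vs. evaluation loss recorded in "log_history".
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; the per-epoch evaluation entries carry "eval_loss".
history = state["log_history"]
train_steps = [e["step"] for e in history if "loss" in e]
train_loss = [e["loss"] for e in history if "loss" in e]
eval_steps = [e["step"] for e in history if "eval_loss" in e]
eval_loss = [e["eval_loss"] for e in history if "eval_loss" in e]

# The header fields record the best evaluation metric and its checkpoint.
print(f"best eval_loss: {state['best_metric']} at {state['best_model_checkpoint']}")

plt.plot(train_steps, train_loss, label="train loss")
plt.plot(eval_steps, eval_loss, marker="o", label="eval loss")
plt.xlabel("global step")
plt.ylabel("loss")
plt.legend()
plt.tight_layout()
plt.savefig("loss_curves.png")  # hypothetical output path

Run as a plain script next to the JSON file; it prints the best checkpoint (6.209214687347412 at ./results/models/checkpoint-104305) and writes a single figure comparing the logged training loss (every 500 steps) against the end-of-epoch eval loss.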