{
  "best_metric": 5.67610502243042,
  "best_model_checkpoint": "./results/models/mistral-dna/checkpoint-326520",
  "epoch": 19.0,
  "eval_steps": 500,
  "global_step": 344660,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.027563395810363836, "grad_norm": 0.2275390625, "learning_rate": 0.0009994487320837927, "loss": 6.911, "step": 500 },
    { "epoch": 0.05512679162072767, "grad_norm": 0.5078125, "learning_rate": 0.0009988974641675853, "loss": 6.4675, "step": 1000 },
    { "epoch": 0.08269018743109151, "grad_norm": 1.53125, "learning_rate": 0.0009983461962513782, "loss": 6.3671, "step": 1500 },
    { "epoch": 0.11025358324145534, "grad_norm": 0.98046875, "learning_rate": 0.0009977949283351709, "loss": 6.3713, "step": 2000 },
    { "epoch": 0.1378169790518192, "grad_norm": 8.625, "learning_rate": 0.0009972436604189637, "loss": 6.3203, "step": 2500 },
    { "epoch": 0.16538037486218302, "grad_norm": 0.94140625, "learning_rate": 0.0009966923925027564, "loss": 6.2909, "step": 3000 },
    { "epoch": 0.19294377067254687, "grad_norm": 1.5078125, "learning_rate": 0.000996141124586549, "loss": 6.258, "step": 3500 },
    { "epoch": 0.2205071664829107, "grad_norm": 1.78125, "learning_rate": 0.000995589856670342, "loss": 6.229, "step": 4000 },
    { "epoch": 0.24807056229327454, "grad_norm": 1.015625, "learning_rate": 0.0009950385887541346, "loss": 6.1963, "step": 4500 },
    { "epoch": 0.2756339581036384, "grad_norm": 0.85546875, "learning_rate": 0.0009944873208379273, "loss": 6.164, "step": 5000 },
    { "epoch": 0.3031973539140022, "grad_norm": 1.4921875, "learning_rate": 0.00099393605292172, "loss": 6.182, "step": 5500 },
    { "epoch": 0.33076074972436603, "grad_norm": 2.265625, "learning_rate": 0.0009933847850055126, "loss": 6.1642, "step": 6000 },
    { "epoch": 0.35832414553472985, "grad_norm": 1.28125, "learning_rate": 0.0009928335170893054, "loss": 6.1384, "step": 6500 },
    { "epoch": 0.38588754134509373, "grad_norm": 2.453125, "learning_rate": 0.000992282249173098, "loss": 6.1192, "step": 7000 },
    { "epoch": 0.41345093715545755, "grad_norm": 2.53125, "learning_rate": 0.000991730981256891, "loss": 6.118, "step": 7500 },
    { "epoch": 0.4410143329658214, "grad_norm": 2.484375, "learning_rate": 0.0009911797133406836, "loss": 6.1237, "step": 8000 },
    { "epoch": 0.4685777287761852, "grad_norm": 0.93359375, "learning_rate": 0.0009906284454244763, "loss": 6.1096, "step": 8500 },
    { "epoch": 0.4961411245865491, "grad_norm": 0.9765625, "learning_rate": 0.0009900771775082692, "loss": 6.0873, "step": 9000 },
    { "epoch": 0.523704520396913, "grad_norm": 1.0078125, "learning_rate": 0.0009895259095920618, "loss": 6.0941, "step": 9500 },
    { "epoch": 0.5512679162072768, "grad_norm": 1.1015625, "learning_rate": 0.0009889746416758545, "loss": 6.0842, "step": 10000 },
    { "epoch": 0.5788313120176406, "grad_norm": 1.390625, "learning_rate": 0.0009884233737596471, "loss": 6.0843, "step": 10500 },
    { "epoch": 0.6063947078280044, "grad_norm": 1.1953125, "learning_rate": 0.0009878721058434398, "loss": 6.0626, "step": 11000 },
    { "epoch": 0.6339581036383682, "grad_norm": 1.984375, "learning_rate": 0.0009873208379272327, "loss": 6.0823, "step": 11500 },
    { "epoch": 0.6615214994487321, "grad_norm": 1.078125, "learning_rate": 0.0009867695700110253, "loss": 6.0745, "step": 12000 },
    { "epoch": 0.6890848952590959, "grad_norm": 1.0859375, "learning_rate": 0.0009862183020948182, "loss": 6.0307, "step": 12500 },
    { "epoch": 0.7166482910694597, "grad_norm": 1.265625, "learning_rate": 0.0009856670341786109, "loss": 6.0327, "step": 13000 },
    { "epoch": 0.7442116868798236, "grad_norm": 1.03125, "learning_rate": 0.0009851157662624035, "loss": 6.0547, "step": 13500 },
    { "epoch": 0.7717750826901875, "grad_norm": 0.87890625, "learning_rate": 0.0009845644983461964, "loss": 6.0374, "step": 14000 },
    { "epoch": 0.7993384785005513, "grad_norm": 1.390625, "learning_rate": 0.000984013230429989, "loss": 6.029, "step": 14500 },
    { "epoch": 0.8269018743109151, "grad_norm": 1.1796875, "learning_rate": 0.0009834619625137817, "loss": 6.0243, "step": 15000 },
    { "epoch": 0.8544652701212789, "grad_norm": 1.5703125, "learning_rate": 0.0009829106945975744, "loss": 5.9914, "step": 15500 },
    { "epoch": 0.8820286659316428, "grad_norm": 1.0390625, "learning_rate": 0.000982359426681367, "loss": 6.0179, "step": 16000 },
    { "epoch": 0.9095920617420066, "grad_norm": 1.5, "learning_rate": 0.00098180815876516, "loss": 6.0157, "step": 16500 },
    { "epoch": 0.9371554575523704, "grad_norm": 1.0625, "learning_rate": 0.0009812568908489526, "loss": 6.0221, "step": 17000 },
    { "epoch": 0.9647188533627343, "grad_norm": 1.03125, "learning_rate": 0.0009807056229327454, "loss": 6.0173, "step": 17500 },
    { "epoch": 0.9922822491730982, "grad_norm": 0.89453125, "learning_rate": 0.000980154355016538, "loss": 5.9947, "step": 18000 },
    { "epoch": 1.0, "eval_loss": 5.929446220397949, "eval_runtime": 3.5789, "eval_samples_per_second": 81.309, "eval_steps_per_second": 5.309, "step": 18140 },
    { "epoch": 1.0198456449834619, "grad_norm": 1.171875, "learning_rate": 0.0009796030871003308, "loss": 5.998, "step": 18500 },
    { "epoch": 1.0474090407938257, "grad_norm": 10.0, "learning_rate": 0.0009790518191841234, "loss": 6.0043, "step": 19000 },
    { "epoch": 1.0749724366041897, "grad_norm": 1.078125, "learning_rate": 0.0009785005512679163, "loss": 5.9952, "step": 19500 },
    { "epoch": 1.1025358324145536, "grad_norm": 1.8984375, "learning_rate": 0.000977949283351709, "loss": 5.9804, "step": 20000 },
    { "epoch": 1.1300992282249174, "grad_norm": 2.234375, "learning_rate": 0.0009773980154355016, "loss": 5.9935, "step": 20500 },
    { "epoch": 1.1576626240352812, "grad_norm": 1.234375, "learning_rate": 0.0009768467475192943, "loss": 5.9866, "step": 21000 },
    { "epoch": 1.185226019845645, "grad_norm": 1.109375, "learning_rate": 0.000976295479603087, "loss": 5.9887, "step": 21500 },
    { "epoch": 1.2127894156560088, "grad_norm": 2.046875, "learning_rate": 0.0009757442116868799, "loss": 5.9695, "step": 22000 },
    { "epoch": 1.2403528114663727, "grad_norm": 0.91796875, "learning_rate": 0.0009751929437706726, "loss": 5.9573, "step": 22500 },
    { "epoch": 1.2679162072767365, "grad_norm": 12.75, "learning_rate": 0.0009746416758544653, "loss": 5.9796, "step": 23000 },
    { "epoch": 1.2954796030871003, "grad_norm": 1.28125, "learning_rate": 0.000974090407938258, "loss": 5.9909, "step": 23500 },
    { "epoch": 1.3230429988974641, "grad_norm": 1.53125, "learning_rate": 0.0009735391400220506, "loss": 5.9719, "step": 24000 },
    { "epoch": 1.350606394707828, "grad_norm": 0.89453125, "learning_rate": 0.0009729878721058435, "loss": 5.9726, "step": 24500 },
    { "epoch": 1.3781697905181918, "grad_norm": 2.578125, "learning_rate": 0.0009724366041896362, "loss": 5.9742, "step": 25000 },
    { "epoch": 1.4057331863285556, "grad_norm": 1.15625, "learning_rate": 0.0009718853362734289, "loss": 5.9722, "step": 25500 },
    { "epoch": 1.4332965821389196, "grad_norm": 1.3125, "learning_rate": 0.0009713340683572216, "loss": 5.9578, "step": 26000 },
    { "epoch": 1.4608599779492835, "grad_norm": 1.453125, "learning_rate": 0.0009707828004410143, "loss": 5.9473, "step": 26500 },
    { "epoch": 1.4884233737596473, "grad_norm": 1.390625, "learning_rate": 0.0009702315325248071, "loss": 5.955, "step": 27000 },
    { "epoch": 1.515986769570011, "grad_norm": 1.3359375, "learning_rate": 0.0009696802646085998, "loss": 5.9419, "step": 27500 },
    { "epoch": 1.543550165380375, "grad_norm": 1.640625, "learning_rate": 0.0009691289966923926, "loss": 5.9387, "step": 28000 },
    { "epoch": 1.5711135611907387, "grad_norm": 1.6015625, "learning_rate": 0.0009685777287761852, "loss": 5.95, "step": 28500 },
    { "epoch": 1.5986769570011026, "grad_norm": 1.3203125, "learning_rate": 0.0009680264608599779, "loss": 5.9469, "step": 29000 },
    { "epoch": 1.6262403528114664, "grad_norm": 4.90625, "learning_rate": 0.0009674751929437707, "loss": 5.9324, "step": 29500 },
    { "epoch": 1.6538037486218302, "grad_norm": 1.4765625, "learning_rate": 0.0009669239250275634, "loss": 5.9238, "step": 30000 },
    { "epoch": 1.681367144432194, "grad_norm": 1.265625, "learning_rate": 0.0009663726571113562, "loss": 5.9536, "step": 30500 },
    { "epoch": 1.7089305402425579, "grad_norm": 1.1875, "learning_rate": 0.0009658213891951488, "loss": 5.9408, "step": 31000 },
    { "epoch": 1.7364939360529217, "grad_norm": 1.734375, "learning_rate": 0.0009652701212789415, "loss": 5.9553, "step": 31500 },
    { "epoch": 1.7640573318632855, "grad_norm": 0.84765625, "learning_rate": 0.0009647188533627344, "loss": 5.9385, "step": 32000 },
    { "epoch": 1.7916207276736493, "grad_norm": 1.3671875, "learning_rate": 0.000964167585446527, "loss": 5.9371, "step": 32500 },
    { "epoch": 1.8191841234840131, "grad_norm": 1.421875, "learning_rate": 0.0009636163175303198, "loss": 5.9171, "step": 33000 },
    { "epoch": 1.846747519294377, "grad_norm": 1.875, "learning_rate": 0.0009630650496141124, "loss": 5.9314, "step": 33500 },
    { "epoch": 1.8743109151047408, "grad_norm": 0.8359375, "learning_rate": 0.0009625137816979052, "loss": 5.951, "step": 34000 },
    { "epoch": 1.9018743109151046, "grad_norm": 1.390625, "learning_rate": 0.000961962513781698, "loss": 5.9276, "step": 34500 },
    { "epoch": 1.9294377067254684, "grad_norm": 1.1796875, "learning_rate": 0.0009614112458654906, "loss": 5.9392, "step": 35000 },
    { "epoch": 1.9570011025358323, "grad_norm": 0.52734375, "learning_rate": 0.0009608599779492834, "loss": 5.9271, "step": 35500 },
    { "epoch": 1.9845644983461963, "grad_norm": 4.8125, "learning_rate": 0.0009603087100330761, "loss": 5.9204, "step": 36000 },
    { "epoch": 2.0, "eval_loss": 5.8477325439453125, "eval_runtime": 3.5523, "eval_samples_per_second": 81.919, "eval_steps_per_second": 5.349, "step": 36280 },
    { "epoch": 2.01212789415656, "grad_norm": 1.0625, "learning_rate": 0.0009597574421168688, "loss": 5.9157, "step": 36500 },
    { "epoch": 2.0396912899669237, "grad_norm": 1.109375, "learning_rate": 0.0009592061742006616, "loss": 5.931, "step": 37000 },
    { "epoch": 2.0672546857772875, "grad_norm": 1.7265625, "learning_rate": 0.0009586549062844542, "loss": 5.9114, "step": 37500 },
    { "epoch": 2.0948180815876514, "grad_norm": 1.40625, "learning_rate": 0.000958103638368247, "loss": 5.9024, "step": 38000 },
    { "epoch": 2.1223814773980156, "grad_norm": 1.078125, "learning_rate": 0.0009575523704520397, "loss": 5.9296, "step": 38500 },
    { "epoch": 2.1499448732083795, "grad_norm": 0.640625, "learning_rate": 0.0009570011025358324, "loss": 5.9374, "step": 39000 },
    { "epoch": 2.1775082690187433, "grad_norm": 0.98046875, "learning_rate": 0.0009564498346196252, "loss": 5.9184, "step": 39500 },
    { "epoch": 2.205071664829107, "grad_norm": 1.3828125, "learning_rate": 0.0009558985667034179, "loss": 5.896, "step": 40000 },
    { "epoch": 2.232635060639471, "grad_norm": 0.8984375, "learning_rate": 0.0009553472987872106, "loss": 5.9052, "step": 40500 },
    { "epoch": 2.2601984564498347, "grad_norm": 1.7734375, "learning_rate": 0.0009547960308710033, "loss": 5.9184, "step": 41000 },
    { "epoch": 2.2877618522601986, "grad_norm": 1.3125, "learning_rate": 0.0009542447629547961, "loss": 5.8971, "step": 41500 },
    { "epoch": 2.3153252480705624, "grad_norm": 1.265625, "learning_rate": 0.0009536934950385888, "loss": 5.9281, "step": 42000 },
    { "epoch": 2.342888643880926, "grad_norm": 1.203125, "learning_rate": 0.0009531422271223815, "loss": 5.8906, "step": 42500 },
    { "epoch": 2.37045203969129, "grad_norm": 0.9296875, "learning_rate": 0.0009525909592061742, "loss": 5.8993, "step": 43000 },
    { "epoch": 2.398015435501654, "grad_norm": 2.421875, "learning_rate": 0.0009520396912899669, "loss": 5.9032, "step": 43500 },
    { "epoch": 2.4255788313120177, "grad_norm": 1.0, "learning_rate": 0.0009514884233737597, "loss": 5.8909, "step": 44000 },
    { "epoch": 2.4531422271223815, "grad_norm": 1.1875, "learning_rate": 0.0009509371554575524, "loss": 5.9002, "step": 44500 },
    { "epoch": 2.4807056229327453, "grad_norm": 0.79296875, "learning_rate": 0.0009503858875413451, "loss": 5.882, "step": 45000 },
    { "epoch": 2.508269018743109, "grad_norm": 2.3125, "learning_rate": 0.0009498346196251379, "loss": 5.8949, "step": 45500 },
    { "epoch": 2.535832414553473, "grad_norm": 1.0859375, "learning_rate": 0.0009492833517089305, "loss": 5.8871, "step": 46000 },
    { "epoch": 2.563395810363837, "grad_norm": 0.70703125, "learning_rate": 0.0009487320837927233, "loss": 5.9061, "step": 46500 },
    { "epoch": 2.5909592061742006, "grad_norm": 1.1328125, "learning_rate": 0.000948180815876516, "loss": 5.8935, "step": 47000 },
    { "epoch": 2.6185226019845644, "grad_norm": 0.65234375, "learning_rate": 0.0009476295479603087, "loss": 5.8811, "step": 47500 },
    { "epoch": 2.6460859977949283, "grad_norm": 0.79296875, "learning_rate": 0.0009470782800441015, "loss": 5.8933, "step": 48000 },
    { "epoch": 2.673649393605292, "grad_norm": 0.91015625, "learning_rate": 0.0009465270121278941, "loss": 5.8882, "step": 48500 },
    { "epoch": 2.701212789415656, "grad_norm": 0.97265625, "learning_rate": 0.0009459757442116869, "loss": 5.8635, "step": 49000 },
    { "epoch": 2.7287761852260197, "grad_norm": 1.015625, "learning_rate": 0.0009454244762954797, "loss": 5.888, "step": 49500 },
    { "epoch": 2.7563395810363835, "grad_norm": 0.8984375, "learning_rate": 0.0009448732083792723, "loss": 5.8789, "step": 50000 },
    { "epoch": 2.7839029768467474, "grad_norm": 0.66796875, "learning_rate": 0.0009443219404630651, "loss": 5.8909, "step": 50500 },
    { "epoch": 2.811466372657111, "grad_norm": 1.6171875, "learning_rate": 0.0009437706725468577, "loss": 5.8678, "step": 51000 },
    { "epoch": 2.8390297684674755, "grad_norm": 3.53125, "learning_rate": 0.0009432194046306505, "loss": 5.8754, "step": 51500 },
    { "epoch": 2.8665931642778393, "grad_norm": 0.71875, "learning_rate": 0.0009426681367144433, "loss": 5.8766, "step": 52000 },
    { "epoch": 2.894156560088203, "grad_norm": 0.6953125, "learning_rate": 0.0009421168687982359, "loss": 5.886, "step": 52500 },
    { "epoch": 2.921719955898567, "grad_norm": 0.87890625, "learning_rate": 0.0009415656008820287, "loss": 5.8551, "step": 53000 },
    { "epoch": 2.9492833517089307, "grad_norm": 0.875, "learning_rate": 0.0009410143329658214, "loss": 5.8986, "step": 53500 },
    { "epoch": 2.9768467475192946, "grad_norm": 1.609375, "learning_rate": 0.0009404630650496141, "loss": 5.891, "step": 54000 },
    { "epoch": 3.0, "eval_loss": 5.803347587585449, "eval_runtime": 3.5792, "eval_samples_per_second": 81.304, "eval_steps_per_second": 5.308, "step": 54420 },
    { "epoch": 3.0044101433296584, "grad_norm": 0.6484375, "learning_rate": 0.0009399117971334069, "loss": 5.8926, "step": 54500 },
    { "epoch": 3.031973539140022, "grad_norm": 14.875, "learning_rate": 0.0009393605292171997, "loss": 5.8568, "step": 55000 },
    { "epoch": 3.059536934950386, "grad_norm": 0.58203125, "learning_rate": 0.0009388092613009923, "loss": 5.8754, "step": 55500 },
    { "epoch": 3.08710033076075, "grad_norm": 0.79296875, "learning_rate": 0.000938257993384785, "loss": 5.8574, "step": 56000 },
    { "epoch": 3.1146637265711137, "grad_norm": 1.2265625, "learning_rate": 0.0009377067254685777, "loss": 5.8662, "step": 56500 },
    { "epoch": 3.1422271223814775, "grad_norm": 0.640625, "learning_rate": 0.0009371554575523704, "loss": 5.8519, "step": 57000 },
    { "epoch": 3.1697905181918413, "grad_norm": 0.6171875, "learning_rate": 0.0009366041896361633, "loss": 5.8509, "step": 57500 },
    { "epoch": 3.197353914002205, "grad_norm": 0.74609375, "learning_rate": 0.0009360529217199559, "loss": 5.8836, "step": 58000 },
    { "epoch": 3.224917309812569, "grad_norm": 0.7109375, "learning_rate": 0.0009355016538037486, "loss": 5.8643, "step": 58500 },
    { "epoch": 3.252480705622933, "grad_norm": 1.1640625, "learning_rate": 0.0009349503858875414, "loss": 5.8799, "step": 59000 },
    { "epoch": 3.2800441014332966, "grad_norm": 0.84375, "learning_rate": 0.000934399117971334, "loss": 5.866, "step": 59500 },
    { "epoch": 3.3076074972436604, "grad_norm": 0.93359375, "learning_rate": 0.0009338478500551269, "loss": 5.8548, "step": 60000 },
    { "epoch": 3.3351708930540243, "grad_norm": 0.609375, "learning_rate": 0.0009332965821389196, "loss": 5.8507, "step": 60500 },
    { "epoch": 3.362734288864388, "grad_norm": 1.578125, "learning_rate": 0.0009327453142227122, "loss": 5.8459, "step": 61000 },
    { "epoch": 3.390297684674752, "grad_norm": 0.93359375, "learning_rate": 0.000932194046306505, "loss": 5.8494, "step": 61500 },
    { "epoch": 3.4178610804851157, "grad_norm": 0.78515625, "learning_rate": 0.0009316427783902976, "loss": 5.873, "step": 62000 },
    { "epoch": 3.4454244762954795, "grad_norm": 1.3515625, "learning_rate": 0.0009310915104740905, "loss": 5.8589, "step": 62500 },
    { "epoch": 3.4729878721058434, "grad_norm": 0.8671875, "learning_rate": 0.0009305402425578832, "loss": 5.8487, "step": 63000 },
    { "epoch": 3.500551267916207, "grad_norm": 1.5546875, "learning_rate": 0.0009299889746416758, "loss": 5.8644, "step": 63500 },
    { "epoch": 3.528114663726571, "grad_norm": 0.765625, "learning_rate": 0.0009294377067254686, "loss": 5.8487, "step": 64000 },
    { "epoch": 3.555678059536935, "grad_norm": 0.6328125, "learning_rate": 0.0009288864388092612, "loss": 5.8694, "step": 64500 },
    { "epoch": 3.5832414553472987, "grad_norm": 1.03125, "learning_rate": 0.0009283351708930541, "loss": 5.8628, "step": 65000 },
    { "epoch": 3.6108048511576625, "grad_norm": 0.68359375, "learning_rate": 0.0009277839029768468, "loss": 5.8573, "step": 65500 },
    { "epoch": 3.6383682469680263, "grad_norm": 0.90625, "learning_rate": 0.0009272326350606394, "loss": 5.8417, "step": 66000 },
    { "epoch": 3.66593164277839, "grad_norm": 1.6015625, "learning_rate": 0.0009266813671444322, "loss": 5.8485, "step": 66500 },
    { "epoch": 3.693495038588754, "grad_norm": 0.75, "learning_rate": 0.0009261300992282249, "loss": 5.8443, "step": 67000 },
    { "epoch": 3.7210584343991178, "grad_norm": 0.859375, "learning_rate": 0.0009255788313120177, "loss": 5.841, "step": 67500 },
    { "epoch": 3.7486218302094816, "grad_norm": 0.6875, "learning_rate": 0.0009250275633958104, "loss": 5.835, "step": 68000 },
    { "epoch": 3.7761852260198454, "grad_norm": 0.96484375, "learning_rate": 0.0009244762954796031, "loss": 5.8575, "step": 68500 },
    { "epoch": 3.8037486218302092, "grad_norm": 1.1796875, "learning_rate": 0.0009239250275633958, "loss": 5.8631, "step": 69000 },
    { "epoch": 3.831312017640573, "grad_norm": 0.7578125, "learning_rate": 0.0009233737596471885, "loss": 5.8509, "step": 69500 },
    { "epoch": 3.8588754134509373, "grad_norm": 0.5703125, "learning_rate": 0.0009228224917309814, "loss": 5.847, "step": 70000 },
    { "epoch": 3.886438809261301, "grad_norm": 0.76171875, "learning_rate": 0.000922271223814774, "loss": 5.8587, "step": 70500 },
    { "epoch": 3.914002205071665, "grad_norm": 0.51171875, "learning_rate": 0.0009217199558985667, "loss": 5.8451, "step": 71000 },
    { "epoch": 3.941565600882029, "grad_norm": 0.41015625, "learning_rate": 0.0009211686879823594, "loss": 5.8521, "step": 71500 },
    { "epoch": 3.9691289966923926, "grad_norm": 0.8671875, "learning_rate": 0.0009206174200661521, "loss": 5.8407, "step": 72000 },
    { "epoch": 3.9966923925027564, "grad_norm": 0.75390625, "learning_rate": 0.000920066152149945, "loss": 5.8205, "step": 72500 },
    { "epoch": 4.0, "eval_loss": 5.774289608001709, "eval_runtime": 3.5097, "eval_samples_per_second": 82.912, "eval_steps_per_second": 5.414, "step": 72560 },
    { "epoch": 4.02425578831312, "grad_norm": 0.79296875, "learning_rate": 0.0009195148842337376, "loss": 5.8396, "step": 73000 },
    { "epoch": 4.051819184123484, "grad_norm": 1.5, "learning_rate": 0.0009189636163175303, "loss": 5.8453, "step": 73500 },
    { "epoch": 4.0793825799338475, "grad_norm": 0.58203125, "learning_rate": 0.000918412348401323, "loss": 5.8365, "step": 74000 },
    { "epoch": 4.106945975744211, "grad_norm": 0.55078125, "learning_rate": 0.0009178610804851157, "loss": 5.8304, "step": 74500 },
    { "epoch": 4.134509371554575, "grad_norm": 0.77734375, "learning_rate": 0.0009173098125689086, "loss": 5.8235, "step": 75000 },
    { "epoch": 4.16207276736494, "grad_norm": 0.78515625, "learning_rate": 0.0009167585446527012, "loss": 5.8591, "step": 75500 },
    { "epoch": 4.189636163175303, "grad_norm": 2.515625, "learning_rate": 0.000916207276736494, "loss": 5.8466, "step": 76000 },
    { "epoch": 4.2171995589856675, "grad_norm": 0.5703125, "learning_rate": 0.0009156560088202867, "loss": 5.8329, "step": 76500 },
    { "epoch": 4.244762954796031, "grad_norm": 0.55078125, "learning_rate": 0.0009151047409040793, "loss": 5.8334, "step": 77000 },
    { "epoch": 4.272326350606395, "grad_norm": 1.0859375, "learning_rate": 0.0009145534729878722, "loss": 5.8243, "step": 77500 },
    { "epoch": 4.299889746416759, "grad_norm": 0.75, "learning_rate": 0.0009140022050716649, "loss": 5.8238, "step": 78000 },
    { "epoch": 4.327453142227123, "grad_norm": 1.90625, "learning_rate": 0.0009134509371554576, "loss": 5.8285, "step": 78500 },
    { "epoch": 4.355016538037487, "grad_norm": 0.58984375, "learning_rate": 0.0009128996692392503, "loss": 5.838, "step": 79000 },
    { "epoch": 4.38257993384785, "grad_norm": 0.6015625, "learning_rate": 0.0009123484013230429, "loss": 5.8347, "step": 79500 },
    { "epoch": 4.410143329658214, "grad_norm": 0.75, "learning_rate": 0.0009117971334068358, "loss": 5.832, "step": 80000 },
    { "epoch": 4.437706725468578, "grad_norm": 0.46484375, "learning_rate": 0.0009112458654906285, "loss": 5.841, "step": 80500 },
    { "epoch": 4.465270121278942, "grad_norm": 0.87890625, "learning_rate": 0.0009106945975744212, "loss": 5.8299, "step": 81000 },
    { "epoch": 4.492833517089306, "grad_norm": 1.265625, "learning_rate": 0.0009101433296582139, "loss": 5.8212, "step": 81500 },
    { "epoch": 4.5203969128996695, "grad_norm": 0.6328125, "learning_rate": 0.0009095920617420066, "loss": 5.8406, "step": 82000 },
    { "epoch": 4.547960308710033, "grad_norm": 0.69140625, "learning_rate": 0.0009090407938257994, "loss": 5.8292, "step": 82500 },
    { "epoch": 4.575523704520397, "grad_norm": 0.66015625, "learning_rate": 0.0009084895259095921, "loss": 5.8295, "step": 83000 },
    { "epoch": 4.603087100330761, "grad_norm": 0.7109375, "learning_rate": 0.0009079382579933849, "loss": 5.8325, "step": 83500 },
    { "epoch": 4.630650496141125, "grad_norm": 0.484375, "learning_rate": 0.0009073869900771775, "loss": 5.8038, "step": 84000 },
    { "epoch": 4.658213891951489, "grad_norm": 0.6171875, "learning_rate": 0.0009068357221609702, "loss": 5.8389, "step": 84500 },
    { "epoch": 4.685777287761852, "grad_norm": 3.375, "learning_rate": 0.000906284454244763, "loss": 5.829, "step": 85000 },
    { "epoch": 4.713340683572216, "grad_norm": 0.55859375, "learning_rate": 0.0009057331863285557, "loss": 5.8178, "step": 85500 },
    { "epoch": 4.74090407938258, "grad_norm": 0.8515625, "learning_rate": 0.0009051819184123485, "loss": 5.8195, "step": 86000 },
    { "epoch": 4.768467475192944, "grad_norm": 0.6171875, "learning_rate": 0.0009046306504961411, "loss": 5.8204, "step": 86500 },
    { "epoch": 4.796030871003308, "grad_norm": 0.9296875, "learning_rate": 0.0009040793825799338, "loss": 5.7925, "step": 87000 },
    { "epoch": 4.8235942668136715, "grad_norm": 0.419921875, "learning_rate": 0.0009035281146637267, "loss": 5.8219, "step": 87500 },
    { "epoch": 4.851157662624035, "grad_norm": 0.466796875, "learning_rate": 0.0009029768467475193, "loss": 5.8183, "step": 88000 },
    { "epoch": 4.878721058434399, "grad_norm": 0.8515625, "learning_rate": 0.0009024255788313121, "loss": 5.8106, "step": 88500 },
    { "epoch": 4.906284454244763, "grad_norm": 0.546875, "learning_rate": 0.0009018743109151047, "loss": 5.8082, "step": 89000 },
    { "epoch": 4.933847850055127, "grad_norm": 0.921875, "learning_rate": 0.0009013230429988974, "loss": 5.8191, "step": 89500 },
    { "epoch": 4.961411245865491, "grad_norm": 0.76953125, "learning_rate": 0.0009007717750826903, "loss": 5.8146, "step": 90000 },
    { "epoch": 4.9889746416758545, "grad_norm": 1.0234375, "learning_rate": 0.0009002205071664829, "loss": 5.834, "step": 90500 },
    { "epoch": 5.0, "eval_loss": 5.750288486480713, "eval_runtime": 3.4633, "eval_samples_per_second": 84.024, "eval_steps_per_second": 5.486, "step": 90700 },
    { "epoch": 5.016538037486218, "grad_norm": 0.5546875, "learning_rate": 0.0008996692392502757, "loss": 5.7809, "step": 91000 },
    { "epoch": 5.044101433296582, "grad_norm": 0.5390625, "learning_rate": 0.0008991179713340684, "loss": 5.8308, "step": 91500 },
    { "epoch": 5.071664829106946, "grad_norm": 0.60546875, "learning_rate": 0.000898566703417861, "loss": 5.7961, "step": 92000 },
    { "epoch": 5.09922822491731, "grad_norm": 0.53515625, "learning_rate": 0.0008980154355016539, "loss": 5.787, "step": 92500 },
    { "epoch": 5.126791620727674, "grad_norm": 0.55859375, "learning_rate": 0.0008974641675854465, "loss": 5.8046, "step": 93000 },
    { "epoch": 5.154355016538037, "grad_norm": 0.859375, "learning_rate": 0.0008969128996692393, "loss": 5.8129, "step": 93500 },
    { "epoch": 5.181918412348401, "grad_norm": 0.66796875, "learning_rate": 0.000896361631753032, "loss": 5.7921, "step": 94000 },
    { "epoch": 5.209481808158765, "grad_norm": 0.470703125, "learning_rate": 0.0008958103638368246, "loss": 5.8238, "step": 94500 },
    { "epoch": 5.237045203969129, "grad_norm": 0.94921875, "learning_rate": 0.0008952590959206174, "loss": 5.8191, "step": 95000 },
    { "epoch": 5.264608599779493, "grad_norm": 0.546875, "learning_rate": 0.0008947078280044102, "loss": 5.8169, "step": 95500 },
    { "epoch": 5.2921719955898565, "grad_norm": 0.44140625, "learning_rate": 0.0008941565600882029, "loss": 5.8309, "step": 96000 },
    { "epoch": 5.31973539140022, "grad_norm": 0.484375, "learning_rate": 0.0008936052921719956, "loss": 5.7999, "step": 96500 },
    { "epoch": 5.347298787210584, "grad_norm": 0.5390625, "learning_rate": 0.0008930540242557882, "loss": 5.8098, "step": 97000 },
    { "epoch": 5.374862183020948, "grad_norm": 0.484375, "learning_rate": 0.000892502756339581, "loss": 5.799, "step": 97500 },
    { "epoch": 5.402425578831312, "grad_norm": 1.3671875, "learning_rate": 0.0008919514884233738, "loss": 5.7938, "step": 98000 },
    { "epoch": 5.429988974641676, "grad_norm": 0.6875, "learning_rate": 0.0008914002205071665, "loss": 5.8295, "step": 98500 },
    { "epoch": 5.4575523704520394, "grad_norm": 0.58203125, "learning_rate": 0.0008908489525909592, "loss": 5.803, "step": 99000 },
    { "epoch": 5.485115766262403, "grad_norm": 0.515625, "learning_rate": 0.000890297684674752, "loss": 5.7955, "step": 99500 },
    { "epoch": 5.512679162072767, "grad_norm": 0.5703125, "learning_rate": 0.0008897464167585446, "loss": 5.8132, "step": 100000 },
    { "epoch": 5.540242557883131, "grad_norm": 0.83203125, "learning_rate": 0.0008891951488423374, "loss": 5.8069, "step": 100500 },
    { "epoch": 5.567805953693495, "grad_norm": 0.703125, "learning_rate": 0.0008886438809261302, "loss": 5.7817, "step": 101000 },
    { "epoch": 5.595369349503859, "grad_norm": 2.3125, "learning_rate": 0.0008880926130099228, "loss": 5.8091, "step": 101500 },
    { "epoch": 5.622932745314222, "grad_norm": 1.2265625, "learning_rate": 0.0008875413450937156, "loss": 5.8136, "step": 102000 },
    { "epoch": 5.650496141124586, "grad_norm": 0.55859375, "learning_rate": 0.0008869900771775082, "loss": 5.7865, "step": 102500 },
    { "epoch": 5.67805953693495, "grad_norm": 0.45703125, "learning_rate": 0.000886438809261301, "loss": 5.8017, "step": 103000 },
    { "epoch": 5.705622932745314, "grad_norm": 0.69140625, "learning_rate": 0.0008858875413450938, "loss": 5.7965, "step": 103500 },
    { "epoch": 5.733186328555679, "grad_norm": 0.68359375, "learning_rate": 0.0008853362734288864, "loss": 5.8133, "step": 104000 },
    { "epoch": 5.7607497243660415, "grad_norm": 0.49609375, "learning_rate": 0.0008847850055126792, "loss": 5.7881, "step": 104500 },
    { "epoch": 5.788313120176406, "grad_norm": 0.5234375, "learning_rate": 0.0008842337375964719, "loss": 5.7775, "step": 105000 },
    { "epoch": 5.815876515986769, "grad_norm": 0.59765625, "learning_rate": 0.0008836824696802646, "loss": 5.8153, "step": 105500 },
    { "epoch": 5.843439911797134, "grad_norm": 0.51953125, "learning_rate": 0.0008831312017640574, "loss": 5.8193, "step": 106000 },
    { "epoch": 5.871003307607497, "grad_norm": 0.5546875, "learning_rate": 0.00088257993384785, "loss": 5.7903, "step": 106500 },
    { "epoch": 5.8985667034178615, "grad_norm": 0.390625, "learning_rate": 0.0008820286659316428, "loss": 5.8197, "step": 107000 },
    { "epoch": 5.926130099228224, "grad_norm": 0.44921875, "learning_rate": 0.0008814773980154355, "loss": 5.8029, "step": 107500 },
    { "epoch": 5.953693495038589, "grad_norm": 0.443359375, "learning_rate": 0.0008809261300992282, "loss": 5.8085, "step": 108000 },
    { "epoch": 5.981256890848953, "grad_norm": 0.5078125, "learning_rate": 0.000880374862183021, "loss": 5.7913, "step": 108500 },
    { "epoch": 6.0, "eval_loss": 5.734777927398682, "eval_runtime": 3.5058, "eval_samples_per_second": 83.005, "eval_steps_per_second": 5.42, "step": 108840 },
    { "epoch": 6.008820286659317, "grad_norm": 0.51953125, "learning_rate": 0.0008798235942668137, "loss": 5.7969, "step": 109000 },
    { "epoch": 6.036383682469681, "grad_norm": 0.86328125, "learning_rate": 0.0008792723263506064, "loss": 5.798, "step": 109500 },
    { "epoch": 6.063947078280044, "grad_norm": 0.6796875, "learning_rate": 0.0008787210584343991, "loss": 5.8031, "step": 110000 },
    { "epoch": 6.091510474090408, "grad_norm": 0.61328125, "learning_rate": 0.0008781697905181919, "loss": 5.7775, "step": 110500 },
    { "epoch": 6.119073869900772, "grad_norm": 0.79296875, "learning_rate": 0.0008776185226019846, "loss": 5.7736, "step": 111000 },
    { "epoch": 6.146637265711136, "grad_norm": 0.482421875, "learning_rate": 0.0008770672546857773, "loss": 5.7873, "step": 111500 },
    { "epoch": 6.1742006615215, "grad_norm": 0.498046875, "learning_rate": 0.00087651598676957, "loss": 5.7928, "step": 112000 },
    { "epoch": 6.2017640573318635, "grad_norm": 0.51171875, "learning_rate": 0.0008759647188533627, "loss": 5.7961, "step": 112500 },
    { "epoch": 6.229327453142227, "grad_norm": 0.60546875, "learning_rate": 0.0008754134509371555, "loss": 5.7866, "step": 113000 },
    { "epoch": 6.256890848952591, "grad_norm": 0.58984375, "learning_rate": 0.0008748621830209482, "loss": 5.7872, "step": 113500 },
    { "epoch": 6.284454244762955, "grad_norm": 0.57421875, "learning_rate": 0.0008743109151047409, "loss": 5.803, "step": 114000 },
    { "epoch": 6.312017640573319, "grad_norm": 0.453125, "learning_rate": 0.0008737596471885337, "loss": 5.8078, "step": 114500 },
    { "epoch": 6.339581036383683, "grad_norm": 0.431640625, "learning_rate": 0.0008732083792723263, "loss": 5.7968, "step": 115000 },
    { "epoch": 6.3671444321940465, "grad_norm": 0.5078125, "learning_rate": 0.0008726571113561191, "loss": 5.785, "step": 115500 },
    { "epoch": 6.39470782800441, "grad_norm": 0.49609375, "learning_rate": 0.0008721058434399119, "loss": 5.7898, "step": 116000 },
    { "epoch": 6.422271223814774, "grad_norm": 0.43359375, "learning_rate": 0.0008715545755237045, "loss": 5.802, "step": 116500 },
    { "epoch": 6.449834619625138, "grad_norm": 0.51171875, "learning_rate": 0.0008710033076074973, "loss": 5.7907, "step": 117000 },
    { "epoch": 6.477398015435502, "grad_norm": 0.875, "learning_rate": 0.0008704520396912899, "loss": 5.7899, "step": 117500 },
    { "epoch": 6.504961411245866, "grad_norm": 0.6171875, "learning_rate": 0.0008699007717750828, "loss": 5.7908, "step": 118000 },
    { "epoch": 6.532524807056229, "grad_norm": 0.396484375, "learning_rate": 0.0008693495038588755, "loss": 5.808, "step": 118500 },
    { "epoch": 6.560088202866593, "grad_norm": 0.6171875, "learning_rate": 0.0008687982359426681, "loss": 5.7799, "step": 119000 },
    { "epoch": 6.587651598676957, "grad_norm": 1.0078125, "learning_rate": 0.0008682469680264609, "loss": 5.7839, "step": 119500 },
    { "epoch": 6.615214994487321, "grad_norm": 1.53125, "learning_rate": 0.0008676957001102535, "loss": 5.7823, "step": 120000 },
    { "epoch": 6.642778390297685, "grad_norm": 0.37890625, "learning_rate": 0.0008671444321940464, "loss": 5.7844, "step": 120500 },
    { "epoch": 6.6703417861080485, "grad_norm": 0.82421875, "learning_rate": 0.0008665931642778391, "loss": 5.7917, "step": 121000 },
    { "epoch": 6.697905181918412, "grad_norm": 0.6640625, "learning_rate": 0.0008660418963616317, "loss": 5.8034, "step": 121500 },
    { "epoch": 6.725468577728776, "grad_norm": 0.39453125, "learning_rate": 0.0008654906284454245, "loss": 5.7769, "step": 122000 },
    { "epoch": 6.75303197353914, "grad_norm": 0.490234375, "learning_rate": 0.0008649393605292172, "loss": 5.7911, "step": 122500 },
    { "epoch": 6.780595369349504, "grad_norm": 0.5078125, "learning_rate": 0.00086438809261301, "loss": 5.7873, "step": 123000 },
    { "epoch": 6.808158765159868, "grad_norm": 0.76171875, "learning_rate": 0.0008638368246968027, "loss": 5.7845, "step": 123500 },
    { "epoch": 6.835722160970231, "grad_norm": 0.353515625, "learning_rate": 0.0008632855567805954, "loss": 5.8045, "step": 124000 },
    { "epoch": 6.863285556780595, "grad_norm": 0.470703125, "learning_rate": 0.0008627342888643881, "loss": 5.7967, "step": 124500 },
    { "epoch": 6.890848952590959, "grad_norm": 0.6171875, "learning_rate": 0.0008621830209481808, "loss": 5.7611, "step": 125000 },
    { "epoch": 6.918412348401323, "grad_norm": 0.498046875, "learning_rate": 0.0008616317530319737, "loss": 5.7735, "step": 125500 },
    { "epoch": 6.945975744211687, "grad_norm": 0.396484375, "learning_rate": 0.0008610804851157663, "loss": 5.7777, "step": 126000 },
    { "epoch": 6.9735391400220506, "grad_norm": 0.6484375, "learning_rate": 0.000860529217199559, "loss": 5.7793, "step": 126500 },
    { "epoch": 7.0, "eval_loss": 5.720947742462158, "eval_runtime": 3.4558, "eval_samples_per_second": 84.206, "eval_steps_per_second": 5.498, "step": 126980 },
    { "epoch": 7.001102535832414, "grad_norm": 0.98828125, "learning_rate": 0.0008599779492833517, "loss": 5.7761, "step": 127000 },
    { "epoch": 7.028665931642778, "grad_norm": 0.63671875, "learning_rate": 0.0008594266813671444, "loss": 5.7634, "step": 127500 },
    { "epoch": 7.056229327453142, "grad_norm": 0.380859375, "learning_rate": 0.0008588754134509373, "loss": 5.7837, "step": 128000 },
    { "epoch": 7.083792723263506, "grad_norm": 0.515625, "learning_rate": 0.0008583241455347299, "loss": 5.7759, "step": 128500 },
    { "epoch": 7.11135611907387, "grad_norm": 0.75390625, "learning_rate": 0.0008577728776185226, "loss": 5.7737, "step": 129000 },
    { "epoch": 7.1389195148842335, "grad_norm": 0.7109375, "learning_rate": 0.0008572216097023154, "loss": 5.7793, "step": 129500 },
    { "epoch": 7.166482910694597, "grad_norm": 0.53125, "learning_rate": 0.000856670341786108, "loss": 5.8033, "step": 130000 },
    { "epoch": 7.194046306504961, "grad_norm": 1.09375, "learning_rate": 0.0008561190738699009, "loss": 5.7663, "step": 130500 },
    { "epoch": 7.221609702315325, "grad_norm": 0.482421875, "learning_rate": 0.0008555678059536935, "loss": 5.7625, "step": 131000 },
    { "epoch": 7.249173098125689, "grad_norm": 0.39453125, "learning_rate": 0.0008550165380374862, "loss": 5.8, "step": 131500 },
    { "epoch": 7.276736493936053, "grad_norm": 0.412109375, "learning_rate": 0.000854465270121279, "loss": 5.7776, "step": 132000 },
    { "epoch": 7.304299889746416, "grad_norm": 0.95703125, "learning_rate": 0.0008539140022050716, "loss": 5.7849, "step": 132500 },
    { "epoch": 7.33186328555678, "grad_norm": 0.4609375, "learning_rate": 0.0008533627342888644, "loss": 5.7772, "step": 133000 },
    { "epoch": 7.359426681367144, "grad_norm": 0.412109375, "learning_rate": 0.0008528114663726572, "loss": 5.761, "step": 133500 },
    { "epoch": 7.386990077177508, "grad_norm": 0.52734375, "learning_rate": 0.0008522601984564498, "loss": 5.7731, "step": 134000 },
    { "epoch": 7.414553472987872, "grad_norm": 0.6328125, "learning_rate": 0.0008517089305402426, "loss": 5.7644, "step": 134500 },
    { "epoch": 7.4421168687982355, "grad_norm": 1.125, "learning_rate": 0.0008511576626240352, "loss": 5.761, "step": 135000 },
    { "epoch": 7.4696802646086, "grad_norm": 0.416015625, "learning_rate": 0.000850606394707828, "loss": 5.7774, "step": 135500 },
    { "epoch": 7.497243660418963, "grad_norm": 0.326171875, "learning_rate": 0.0008500551267916208, "loss": 5.7838, "step": 136000 },
    { "epoch": 7.524807056229328, "grad_norm": 0.41796875, "learning_rate": 0.0008495038588754134, "loss": 5.7687, "step": 136500 },
    { "epoch": 7.552370452039691, "grad_norm": 0.435546875, "learning_rate": 0.0008489525909592062, "loss": 5.7818, "step": 137000 },
    { "epoch": 7.5799338478500555, "grad_norm": 1.0703125, "learning_rate": 0.0008484013230429989, "loss": 5.7693, "step": 137500 },
    { "epoch": 7.607497243660419, "grad_norm": 0.5390625, "learning_rate": 0.0008478500551267916, "loss": 5.7745, "step": 138000 },
    { "epoch": 7.635060639470783, "grad_norm": 0.578125, "learning_rate": 0.0008472987872105844, "loss": 5.7658, "step": 138500 },
    { "epoch": 7.662624035281147, "grad_norm": 0.609375, "learning_rate": 0.000846747519294377, "loss": 5.7618, "step": 139000 },
    { "epoch": 7.690187431091511, "grad_norm": 0.451171875, "learning_rate": 0.0008461962513781698, "loss": 5.7692, "step": 139500 },
    { "epoch": 7.717750826901875, "grad_norm": 0.447265625, "learning_rate": 0.0008456449834619625, "loss": 5.7893, "step": 140000 },
    { "epoch": 7.7453142227122385, "grad_norm": 0.5546875, "learning_rate": 0.0008450937155457552, "loss": 5.7833, "step": 140500 },
    { "epoch": 7.772877618522602, "grad_norm": 0.59765625, "learning_rate": 0.000844542447629548, "loss": 5.7728, "step": 141000 },
    { "epoch": 7.800441014332966, "grad_norm": 0.431640625, "learning_rate": 0.0008439911797133408, "loss": 5.7854, "step": 141500 },
    { "epoch": 7.82800441014333, "grad_norm": 0.462890625, "learning_rate": 0.0008434399117971334, "loss": 5.7749, "step": 142000 },
    { "epoch": 7.855567805953694, "grad_norm": 0.5, "learning_rate": 0.0008428886438809261, "loss": 5.7706, "step": 142500 },
    { "epoch": 7.883131201764058, "grad_norm": 0.37890625, "learning_rate": 0.0008423373759647189, "loss": 5.7673, "step": 143000 },
    { "epoch": 7.910694597574421, "grad_norm": 0.474609375, "learning_rate": 0.0008417861080485116, "loss": 5.7834, "step": 143500 },
    { "epoch": 7.938257993384785, "grad_norm": 0.46875, "learning_rate": 0.0008412348401323044, "loss": 5.7748, "step": 144000 },
    { "epoch": 7.965821389195149, "grad_norm": 0.93359375, "learning_rate": 0.000840683572216097, "loss": 5.7908, "step": 144500 },
    { "epoch": 7.993384785005513, "grad_norm": 0.53515625, "learning_rate": 0.0008401323042998897, "loss": 5.7698, "step": 145000 },
    { "epoch": 8.0, "eval_loss": 5.71162223815918, "eval_runtime": 3.4422, "eval_samples_per_second": 84.54, "eval_steps_per_second": 5.52, "step": 145120 },
    { "epoch": 8.020948180815877, "grad_norm": 0.490234375, "learning_rate": 0.0008395810363836825, "loss": 5.7724, "step": 145500 },
    { "epoch": 8.04851157662624, "grad_norm": 0.62109375, "learning_rate": 0.0008390297684674752, "loss": 5.7663, "step": 146000 },
    { "epoch": 8.076074972436604, "grad_norm": 0.6796875, "learning_rate": 0.000838478500551268, "loss": 5.7685, "step": 146500 },
    { "epoch": 8.103638368246967, "grad_norm": 0.8359375, "learning_rate": 0.0008379272326350607, "loss": 5.7875, "step": 147000 },
    { "epoch": 8.131201764057332, "grad_norm": 1.3515625, "learning_rate": 0.0008373759647188533, "loss": 5.7763, "step": 147500 },
    { "epoch": 8.158765159867695, "grad_norm": 0.416015625, "learning_rate": 0.0008368246968026461, "loss": 5.7662, "step": 148000 },
    { "epoch": 8.18632855567806, "grad_norm": 0.51953125, "learning_rate": 0.0008362734288864388, "loss": 5.7624, "step": 148500 },
    { "epoch": 8.213891951488423, "grad_norm": 0.515625, "learning_rate": 0.0008357221609702316, "loss": 5.7667, "step": 149000 },
    { "epoch": 8.241455347298787, "grad_norm": 0.431640625, "learning_rate": 0.0008351708930540243, "loss": 5.7788, "step": 149500 },
    { "epoch": 8.26901874310915, "grad_norm": 0.3984375, "learning_rate": 0.0008346196251378169, "loss": 5.7647, "step": 150000 },
    { "epoch": 8.296582138919515, "grad_norm": 0.71484375, "learning_rate": 0.0008340683572216097, "loss": 5.7819, "step": 150500 },
    { "epoch": 8.32414553472988, "grad_norm": 0.37109375, "learning_rate": 0.0008335170893054025, "loss": 5.7612, "step": 151000 },
    { "epoch": 8.351708930540243, "grad_norm": 0.88671875, "learning_rate": 0.0008329658213891952, "loss": 5.7775, "step": 151500 },
    { "epoch": 8.379272326350605, "grad_norm": 0.46875, "learning_rate": 0.0008324145534729879, "loss": 5.7565, "step": 152000 },
    { "epoch": 8.40683572216097, "grad_norm": 0.5859375, "learning_rate": 0.0008318632855567805, "loss": 5.7515, "step": 152500 },
    { "epoch": 8.434399117971335, "grad_norm": 0.609375, "learning_rate": 0.0008313120176405733, "loss": 5.7639, "step": 153000 },
    { "epoch": 8.461962513781698, "grad_norm": 0.40234375, "learning_rate": 0.0008307607497243661, "loss": 5.7595, "step": 153500 },
    { "epoch": 8.489525909592063, "grad_norm": 0.416015625, "learning_rate": 0.0008302094818081588, "loss": 5.7484, "step": 154000 },
    { "epoch": 8.517089305402425, "grad_norm": 0.8671875, "learning_rate": 0.0008296582138919515, "loss": 5.7783, "step": 154500 },
    { "epoch": 8.54465270121279, "grad_norm": 0.3984375, "learning_rate": 0.0008291069459757442, "loss": 5.7618, "step": 155000 },
    { "epoch": 8.572216097023153, "grad_norm": 0.55859375, "learning_rate": 0.0008285556780595369, "loss": 5.7686, "step": 155500 },
    { "epoch": 8.599779492833518, "grad_norm": 0.54296875, "learning_rate": 0.0008280044101433297, "loss": 5.7739, "step": 156000 },
    { "epoch": 8.62734288864388, "grad_norm": 0.5, "learning_rate": 0.0008274531422271225, "loss": 5.7731, "step": 156500 },
    { "epoch": 8.654906284454245, "grad_norm": 0.50390625, "learning_rate": 0.0008269018743109151, "loss": 5.7755, "step": 157000 },
    { "epoch": 8.682469680264608, "grad_norm": 0.40234375, "learning_rate": 0.0008263506063947078, "loss": 5.7607, "step": 157500 },
    { "epoch": 8.710033076074973, "grad_norm": 0.41015625, "learning_rate": 0.0008257993384785005, "loss": 5.7848, "step": 158000 },
    { "epoch": 8.737596471885336, "grad_norm": 0.49609375, "learning_rate": 0.0008252480705622933, "loss": 5.7626, "step": 158500 },
    { "epoch": 8.7651598676957, "grad_norm": 0.44140625, "learning_rate": 0.0008246968026460861, "loss": 5.7732, "step": 159000 },
    { "epoch": 8.792723263506064, "grad_norm": 0.494140625, "learning_rate": 0.0008241455347298787, "loss": 5.7488, "step": 159500 },
    { "epoch": 8.820286659316428, "grad_norm": 0.55078125, "learning_rate": 0.0008235942668136714, "loss": 5.7689, "step": 160000 },
    { "epoch": 8.847850055126791, "grad_norm": 0.765625, "learning_rate": 0.0008230429988974642, "loss": 5.7707, "step": 160500 },
    { "epoch": 8.875413450937156, "grad_norm": 0.41796875, "learning_rate": 0.0008224917309812569, "loss": 5.7448, "step": 161000 },
    { "epoch": 8.902976846747519, "grad_norm": 0.4453125, "learning_rate": 0.0008219404630650497, "loss": 5.7603, "step": 161500 },
    { "epoch": 8.930540242557884, "grad_norm": 0.369140625, "learning_rate": 0.0008213891951488423, "loss": 5.7592, "step": 162000 },
    { "epoch": 8.958103638368247, "grad_norm": 0.73828125, "learning_rate": 0.0008208379272326351, "loss": 5.7537, "step": 162500 },
    { "epoch": 8.985667034178611, "grad_norm": 0.359375, "learning_rate": 0.0008202866593164278, "loss": 5.758, "step": 163000 },
    { "epoch": 9.0, "eval_loss": 5.707642078399658, "eval_runtime": 3.4079, "eval_samples_per_second": 85.389, "eval_steps_per_second": 5.575, "step": 163260 },
    { "epoch": 9.013230429988974, "grad_norm": 0.328125, "learning_rate": 0.0008197353914002205, "loss": 5.7547, "step": 163500 },
    { "epoch": 9.040793825799339, "grad_norm": 0.427734375, "learning_rate": 0.0008191841234840133, "loss": 5.7489, "step": 164000 },
    { "epoch": 9.068357221609702, "grad_norm": 0.69921875, "learning_rate": 0.000818632855567806, "loss": 5.7536, "step": 164500 },
    { "epoch": 9.095920617420067, "grad_norm": 0.3515625, "learning_rate": 0.0008180815876515987, "loss": 5.7599, "step": 165000 },
    { "epoch": 9.12348401323043, "grad_norm": 0.51171875, "learning_rate": 0.0008175303197353914, "loss": 5.7676, "step": 165500 },
    { "epoch": 9.151047409040794, "grad_norm": 0.56640625, "learning_rate": 0.0008169790518191842, "loss": 5.7523, "step": 166000 },
    { "epoch": 9.178610804851157, "grad_norm": 0.369140625, "learning_rate": 0.0008164277839029769, "loss": 5.7702, "step": 166500 },
    { "epoch": 9.206174200661522, "grad_norm": 0.376953125, "learning_rate": 0.0008158765159867696, "loss": 5.7238, "step": 167000 },
    { "epoch": 9.233737596471885, "grad_norm": 0.34765625, "learning_rate": 0.0008153252480705623, "loss": 5.7696, "step": 167500 },
    { "epoch": 9.26130099228225, "grad_norm": 0.48828125, "learning_rate": 0.000814773980154355, "loss": 5.7566, "step": 168000 },
    { "epoch": 9.288864388092613, "grad_norm": 0.439453125, "learning_rate": 0.0008142227122381478, "loss": 5.7403, "step": 168500 },
    { "epoch": 9.316427783902977, "grad_norm": 0.404296875, "learning_rate": 0.0008136714443219405, "loss": 5.752, "step": 169000 },
    { "epoch": 9.34399117971334, "grad_norm": 0.3984375, "learning_rate": 0.0008131201764057332, "loss": 5.753, "step": 169500 },
    { "epoch": 9.371554575523705, "grad_norm": 0.486328125, "learning_rate": 0.000812568908489526, "loss": 5.7681, "step": 170000 },
    { "epoch": 9.399117971334068, "grad_norm": 0.357421875, "learning_rate": 0.0008120176405733186, "loss": 5.7472, "step": 170500 },
    { "epoch": 9.426681367144432, "grad_norm": 0.51953125, "learning_rate": 0.0008114663726571113, "loss": 5.7753, "step": 171000 },
    { "epoch": 9.454244762954795, "grad_norm": 0.380859375, "learning_rate": 0.0008109151047409042, "loss": 5.7646, "step": 171500 },
    { "epoch": 9.48180815876516, "grad_norm": 0.439453125, "learning_rate": 0.0008103638368246968, "loss": 5.7771, "step": 172000 },
    { "epoch": 9.509371554575523, "grad_norm": 0.3671875, "learning_rate": 0.0008098125689084896, "loss": 5.7628, "step": 172500 },
    { "epoch": 9.536934950385888, "grad_norm": 0.349609375, "learning_rate": 0.0008092613009922822, "loss": 5.7618, "step": 173000 },
    { "epoch": 9.56449834619625, "grad_norm": 0.427734375, "learning_rate": 0.0008087100330760749, "loss": 5.7626, "step": 173500 },
    { "epoch": 9.592061742006615, "grad_norm": 0.546875, "learning_rate": 0.0008081587651598678, "loss": 5.7592, "step": 174000 },
    { "epoch": 9.619625137816978, "grad_norm": 0.9609375, "learning_rate": 0.0008076074972436604, "loss": 5.7822, "step": 174500 },
    { "epoch": 9.647188533627343, "grad_norm": 0.40234375, "learning_rate": 0.0008070562293274532, "loss": 5.7712, "step": 175000 },
    { "epoch": 9.674751929437706, "grad_norm": 0.44140625, "learning_rate": 0.0008065049614112459, "loss": 5.7641, "step": 175500 },
    { "epoch": 9.70231532524807, "grad_norm": 0.609375, "learning_rate": 0.0008059536934950385, "loss": 5.7498, "step": 176000 },
    { "epoch": 9.729878721058434, "grad_norm": 0.609375, "learning_rate": 0.0008054024255788314, "loss": 5.7343, "step": 176500 },
    { "epoch": 9.757442116868798, "grad_norm": 0.3671875, "learning_rate": 0.000804851157662624, "loss": 5.7558, "step": 177000 },
    { "epoch": 9.785005512679161, "grad_norm": 0.369140625, "learning_rate": 0.0008042998897464168, "loss": 5.7295, "step": 177500 },
    { "epoch": 9.812568908489526, "grad_norm": 0.55859375, "learning_rate": 0.0008037486218302095, "loss": 5.7506, "step": 178000 },
    { "epoch": 9.840132304299889, "grad_norm": 0.447265625, "learning_rate": 0.0008031973539140021, "loss": 5.7619, "step": 178500 },
    { "epoch": 9.867695700110254, "grad_norm": 0.859375, "learning_rate": 0.000802646085997795, "loss": 5.7847, "step": 179000 },
    { "epoch": 9.895259095920617, "grad_norm": 0.384765625, "learning_rate": 0.0008020948180815877, "loss": 5.7624, "step": 179500 },
    { "epoch": 9.922822491730981, "grad_norm": 0.390625, "learning_rate": 0.0008015435501653804, "loss": 5.7732, "step": 180000 },
    { "epoch": 9.950385887541344, "grad_norm": 0.34375, "learning_rate": 0.0008009922822491731, "loss": 5.7425, "step": 180500 },
    { "epoch": 9.977949283351709, "grad_norm": 0.45703125, "learning_rate": 0.0008004410143329657, "loss": 5.7658, "step": 181000 },
    { "epoch": 10.0, "eval_loss": 5.698179244995117, "eval_runtime": 3.4017, "eval_samples_per_second": 85.545, "eval_steps_per_second": 5.585, "step": 181400 },
    { "epoch": 10.005512679162074, "grad_norm": 0.357421875, "learning_rate": 0.0007998897464167586, "loss": 5.754, "step": 181500 },
    { "epoch": 10.033076074972437, "grad_norm": 0.462890625, "learning_rate": 0.0007993384785005513, "loss": 5.7548, "step": 182000 },
    { "epoch": 10.060639470782801, "grad_norm": 0.353515625, "learning_rate": 0.000798787210584344, "loss": 5.7513, "step": 182500 },
    { "epoch": 10.088202866593164, "grad_norm": 0.375, "learning_rate": 0.0007982359426681367, "loss": 5.7397, "step": 183000 },
    { "epoch": 10.115766262403529, "grad_norm": 0.57421875, "learning_rate": 0.0007976846747519294, "loss": 5.7576, "step": 183500 },
    { "epoch": 10.143329658213892, "grad_norm": 0.345703125, "learning_rate": 0.0007971334068357222, "loss": 5.7683, "step": 184000 },
    { "epoch": 10.170893054024257, "grad_norm": 0.65234375, "learning_rate": 0.0007965821389195149, "loss": 5.7573, "step": 184500 },
    { "epoch": 10.19845644983462, "grad_norm": 0.486328125, "learning_rate": 0.0007960308710033077, "loss": 5.7413, "step": 185000 },
    { "epoch": 10.226019845644984, "grad_norm": 0.419921875, "learning_rate": 0.0007954796030871003, "loss": 5.7609, "step": 185500 },
    { "epoch": 10.253583241455347, "grad_norm": 0.875, "learning_rate": 0.0007949283351708931, "loss": 5.7462, "step": 186000 },
    { "epoch": 10.281146637265712, "grad_norm": 0.5, "learning_rate": 0.0007943770672546858, "loss": 5.7551, "step": 186500 },
    { "epoch": 10.308710033076075, "grad_norm": 0.349609375, "learning_rate": 0.0007938257993384785, "loss": 5.7674, "step": 187000 },
    { "epoch": 10.33627342888644, "grad_norm": 0.421875, "learning_rate": 0.0007932745314222713, "loss": 5.7599, "step": 187500
|
}, |
|
{ |
|
"epoch": 10.363836824696802, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0007927232635060639, |
|
"loss": 5.7496, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 10.391400220507167, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 0.0007921719955898567, |
|
"loss": 5.7392, |
|
"step": 188500 |
|
}, |
|
{ |
|
"epoch": 10.41896361631753, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.0007916207276736495, |
|
"loss": 5.7736, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 10.446527012127895, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.0007910694597574421, |
|
"loss": 5.7394, |
|
"step": 189500 |
|
}, |
|
{ |
|
"epoch": 10.474090407938258, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.0007905181918412349, |
|
"loss": 5.7425, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 10.501653803748622, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 0.0007899669239250275, |
|
"loss": 5.7564, |
|
"step": 190500 |
|
}, |
|
{ |
|
"epoch": 10.529217199558985, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.0007894156560088203, |
|
"loss": 5.7643, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 10.55678059536935, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.0007888643880926131, |
|
"loss": 5.7533, |
|
"step": 191500 |
|
}, |
|
{ |
|
"epoch": 10.584343991179713, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.0007883131201764057, |
|
"loss": 5.7359, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 10.611907386990078, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.0007877618522601985, |
|
"loss": 5.7601, |
|
"step": 192500 |
|
}, |
|
{ |
|
"epoch": 10.63947078280044, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0007872105843439912, |
|
"loss": 5.7627, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 10.667034178610805, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.0007866593164277839, |
|
"loss": 5.7579, |
|
"step": 193500 |
|
}, |
|
{ |
|
"epoch": 10.694597574421168, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.0007861080485115767, |
|
"loss": 5.7502, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 10.722160970231533, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 0.0007855567805953693, |
|
"loss": 5.7542, |
|
"step": 194500 |
|
}, |
|
{ |
|
"epoch": 10.749724366041896, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.0007850055126791621, |
|
"loss": 5.7384, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 10.77728776185226, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.0007844542447629548, |
|
"loss": 5.7489, |
|
"step": 195500 |
|
}, |
|
{ |
|
"epoch": 10.804851157662624, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.0007839029768467475, |
|
"loss": 5.7698, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 10.832414553472988, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.0007833517089305403, |
|
"loss": 5.7565, |
|
"step": 196500 |
|
}, |
|
{ |
|
"epoch": 10.859977949283351, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.000782800441014333, |
|
"loss": 5.7514, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 10.887541345093716, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.0007822491730981257, |
|
"loss": 5.7497, |
|
"step": 197500 |
|
}, |
|
{ |
|
"epoch": 10.915104740904079, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.0007816979051819184, |
|
"loss": 5.7485, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 10.942668136714444, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.0007811466372657112, |
|
"loss": 5.7435, |
|
"step": 198500 |
|
}, |
|
{ |
|
"epoch": 10.970231532524807, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.0007805953693495039, |
|
"loss": 5.7511, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 10.997794928335171, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.0007800441014332966, |
|
"loss": 5.7397, |
|
"step": 199500 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_loss": 5.6925530433654785, |
|
"eval_runtime": 3.473, |
|
"eval_samples_per_second": 83.788, |
|
"eval_steps_per_second": 5.471, |
|
"step": 199540 |
|
}, |
|
{ |
|
"epoch": 11.025358324145534, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 0.0007794928335170893, |
|
"loss": 5.7422, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 11.052921719955899, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 0.000778941565600882, |
|
"loss": 5.733, |
|
"step": 200500 |
|
}, |
|
{ |
|
"epoch": 11.080485115766262, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.0007783902976846748, |
|
"loss": 5.7357, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 11.108048511576627, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.0007778390297684675, |
|
"loss": 5.7599, |
|
"step": 201500 |
|
}, |
|
{ |
|
"epoch": 11.13561190738699, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.0007772877618522602, |
|
"loss": 5.7505, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 11.163175303197354, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.000776736493936053, |
|
"loss": 5.7602, |
|
"step": 202500 |
|
}, |
|
{ |
|
"epoch": 11.190738699007717, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.0007761852260198456, |
|
"loss": 5.7426, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 11.218302094818082, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 0.0007756339581036384, |
|
"loss": 5.751, |
|
"step": 203500 |
|
}, |
|
{ |
|
"epoch": 11.245865490628445, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 0.0007750826901874311, |
|
"loss": 5.7675, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 11.27342888643881, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.0007745314222712239, |
|
"loss": 5.7414, |
|
"step": 204500 |
|
}, |
|
{ |
|
"epoch": 11.300992282249172, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.0007739801543550166, |
|
"loss": 5.7601, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 11.328555678059537, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.0007734288864388092, |
|
"loss": 5.7351, |
|
"step": 205500 |
|
}, |
|
{ |
|
"epoch": 11.3561190738699, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.000772877618522602, |
|
"loss": 5.7413, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 11.383682469680265, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.0007723263506063948, |
|
"loss": 5.7493, |
|
"step": 206500 |
|
}, |
|
{ |
|
"epoch": 11.411245865490628, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.0007717750826901875, |
|
"loss": 5.7704, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 11.438809261300992, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.0007712238147739802, |
|
"loss": 5.747, |
|
"step": 207500 |
|
}, |
|
{ |
|
"epoch": 11.466372657111355, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.0007706725468577728, |
|
"loss": 5.744, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 11.49393605292172, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 0.0007701212789415656, |
|
"loss": 5.7392, |
|
"step": 208500 |
|
}, |
|
{ |
|
"epoch": 11.521499448732083, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.0007695700110253583, |
|
"loss": 5.7407, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 11.549062844542448, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0007690187431091511, |
|
"loss": 5.7333, |
|
"step": 209500 |
|
}, |
|
{ |
|
"epoch": 11.576626240352812, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.0007684674751929438, |
|
"loss": 5.743, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 11.604189636163175, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.0007679162072767365, |
|
"loss": 5.7642, |
|
"step": 210500 |
|
}, |
|
{ |
|
"epoch": 11.631753031973538, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.0007673649393605292, |
|
"loss": 5.7351, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 11.659316427783903, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.0007668136714443219, |
|
"loss": 5.7701, |
|
"step": 211500 |
|
}, |
|
{ |
|
"epoch": 11.686879823594268, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0007662624035281148, |
|
"loss": 5.7295, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 11.71444321940463, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0007657111356119074, |
|
"loss": 5.7499, |
|
"step": 212500 |
|
}, |
|
{ |
|
"epoch": 11.742006615214994, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.0007651598676957001, |
|
"loss": 5.7722, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 11.769570011025358, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.0007646085997794928, |
|
"loss": 5.7446, |
|
"step": 213500 |
|
}, |
|
{ |
|
"epoch": 11.797133406835723, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.0007640573318632855, |
|
"loss": 5.7524, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 11.824696802646086, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 0.0007635060639470784, |
|
"loss": 5.7592, |
|
"step": 214500 |
|
}, |
|
{ |
|
"epoch": 11.85226019845645, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.000762954796030871, |
|
"loss": 5.7532, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 11.879823594266814, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.0007624035281146637, |
|
"loss": 5.7412, |
|
"step": 215500 |
|
}, |
|
{ |
|
"epoch": 11.907386990077178, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.0007618522601984565, |
|
"loss": 5.7421, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 11.934950385887541, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.0007613009922822491, |
|
"loss": 5.7383, |
|
"step": 216500 |
|
}, |
|
{ |
|
"epoch": 11.962513781697906, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.000760749724366042, |
|
"loss": 5.7447, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 11.990077177508269, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.0007601984564498347, |
|
"loss": 5.7552, |
|
"step": 217500 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 5.68997859954834, |
|
"eval_runtime": 3.4588, |
|
"eval_samples_per_second": 84.133, |
|
"eval_steps_per_second": 5.493, |
|
"step": 217680 |
|
}, |
|
{ |
|
"epoch": 12.017640573318634, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.0007596471885336273, |
|
"loss": 5.7476, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 12.045203969128996, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.0007590959206174201, |
|
"loss": 5.7503, |
|
"step": 218500 |
|
}, |
|
{ |
|
"epoch": 12.072767364939361, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.0007585446527012127, |
|
"loss": 5.7334, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 12.100330760749724, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.0007579933847850056, |
|
"loss": 5.7367, |
|
"step": 219500 |
|
}, |
|
{ |
|
"epoch": 12.127894156560089, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.0007574421168687983, |
|
"loss": 5.7288, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 12.155457552370452, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.0007568908489525909, |
|
"loss": 5.752, |
|
"step": 220500 |
|
}, |
|
{ |
|
"epoch": 12.183020948180816, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0007563395810363837, |
|
"loss": 5.7276, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 12.21058434399118, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 0.0007557883131201763, |
|
"loss": 5.732, |
|
"step": 221500 |
|
}, |
|
{ |
|
"epoch": 12.238147739801544, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.0007552370452039692, |
|
"loss": 5.7643, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 12.265711135611907, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.0007546857772877619, |
|
"loss": 5.7374, |
|
"step": 222500 |
|
}, |
|
{ |
|
"epoch": 12.293274531422272, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.0007541345093715545, |
|
"loss": 5.7423, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 12.320837927232635, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.0007535832414553473, |
|
"loss": 5.7593, |
|
"step": 223500 |
|
}, |
|
{ |
|
"epoch": 12.348401323043, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.00075303197353914, |
|
"loss": 5.7556, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 12.375964718853362, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0007524807056229328, |
|
"loss": 5.7512, |
|
"step": 224500 |
|
}, |
|
{ |
|
"epoch": 12.403528114663727, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.0007519294377067255, |
|
"loss": 5.7309, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 12.43109151047409, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.0007513781697905182, |
|
"loss": 5.7577, |
|
"step": 225500 |
|
}, |
|
{ |
|
"epoch": 12.458654906284455, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0007508269018743109, |
|
"loss": 5.7341, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 12.486218302094818, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.0007502756339581036, |
|
"loss": 5.7458, |
|
"step": 226500 |
|
}, |
|
{ |
|
"epoch": 12.513781697905182, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.0007497243660418965, |
|
"loss": 5.7408, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 12.541345093715545, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.0007491730981256891, |
|
"loss": 5.7453, |
|
"step": 227500 |
|
}, |
|
{ |
|
"epoch": 12.56890848952591, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.0007486218302094819, |
|
"loss": 5.7414, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 12.596471885336273, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.0007480705622932745, |
|
"loss": 5.7766, |
|
"step": 228500 |
|
}, |
|
{ |
|
"epoch": 12.624035281146638, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.0007475192943770672, |
|
"loss": 5.7368, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 12.651598676957, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.0007469680264608601, |
|
"loss": 5.7412, |
|
"step": 229500 |
|
}, |
|
{ |
|
"epoch": 12.679162072767365, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0007464167585446527, |
|
"loss": 5.7527, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 12.706725468577728, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0007458654906284455, |
|
"loss": 5.7399, |
|
"step": 230500 |
|
}, |
|
{ |
|
"epoch": 12.734288864388093, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.0007453142227122382, |
|
"loss": 5.7555, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 12.761852260198456, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.0007447629547960308, |
|
"loss": 5.7398, |
|
"step": 231500 |
|
}, |
|
{ |
|
"epoch": 12.78941565600882, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.0007442116868798237, |
|
"loss": 5.737, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 12.816979051819184, |
|
"grad_norm": 6.21875, |
|
"learning_rate": 0.0007436604189636163, |
|
"loss": 5.7539, |
|
"step": 232500 |
|
}, |
|
{ |
|
"epoch": 12.844542447629548, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.0007431091510474091, |
|
"loss": 5.7506, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 12.872105843439911, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 0.0007425578831312018, |
|
"loss": 5.7503, |
|
"step": 233500 |
|
}, |
|
{ |
|
"epoch": 12.899669239250276, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.0007420066152149944, |
|
"loss": 5.7505, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 12.927232635060639, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.0007414553472987873, |
|
"loss": 5.7385, |
|
"step": 234500 |
|
}, |
|
{ |
|
"epoch": 12.954796030871004, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 0.00074090407938258, |
|
"loss": 5.737, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 12.982359426681366, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.0007403528114663727, |
|
"loss": 5.755, |
|
"step": 235500 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_loss": 5.689921855926514, |
|
"eval_runtime": 3.4572, |
|
"eval_samples_per_second": 84.171, |
|
"eval_steps_per_second": 5.496, |
|
"step": 235820 |
|
}, |
|
{ |
|
"epoch": 13.009922822491731, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 0.0007398015435501654, |
|
"loss": 5.7438, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 13.037486218302094, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.000739250275633958, |
|
"loss": 5.7433, |
|
"step": 236500 |
|
}, |
|
{ |
|
"epoch": 13.065049614112459, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 0.0007386990077177509, |
|
"loss": 5.7426, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 13.092613009922822, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.0007381477398015436, |
|
"loss": 5.7621, |
|
"step": 237500 |
|
}, |
|
{ |
|
"epoch": 13.120176405733186, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.0007375964718853363, |
|
"loss": 5.722, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 13.14773980154355, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.000737045203969129, |
|
"loss": 5.7348, |
|
"step": 238500 |
|
}, |
|
{ |
|
"epoch": 13.175303197353914, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 0.0007364939360529217, |
|
"loss": 5.7326, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 13.202866593164277, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.0007359426681367145, |
|
"loss": 5.747, |
|
"step": 239500 |
|
}, |
|
{ |
|
"epoch": 13.230429988974642, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.0007353914002205072, |
|
"loss": 5.7548, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 13.257993384785005, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.0007348401323043, |
|
"loss": 5.7422, |
|
"step": 240500 |
|
}, |
|
{ |
|
"epoch": 13.28555678059537, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0007342888643880926, |
|
"loss": 5.737, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 13.313120176405732, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.0007337375964718853, |
|
"loss": 5.7167, |
|
"step": 241500 |
|
}, |
|
{ |
|
"epoch": 13.340683572216097, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.0007331863285556781, |
|
"loss": 5.7299, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 13.368246968026462, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.0007326350606394708, |
|
"loss": 5.7564, |
|
"step": 242500 |
|
}, |
|
{ |
|
"epoch": 13.395810363836825, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.0007320837927232636, |
|
"loss": 5.7445, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 13.42337375964719, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.0007315325248070562, |
|
"loss": 5.753, |
|
"step": 243500 |
|
}, |
|
{ |
|
"epoch": 13.450937155457552, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.0007309812568908489, |
|
"loss": 5.7441, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 13.478500551267917, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.0007304299889746418, |
|
"loss": 5.7416, |
|
"step": 244500 |
|
}, |
|
{ |
|
"epoch": 13.50606394707828, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.0007298787210584344, |
|
"loss": 5.7404, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 13.533627342888645, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.0007293274531422272, |
|
"loss": 5.7318, |
|
"step": 245500 |
|
}, |
|
{ |
|
"epoch": 13.561190738699008, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.0007287761852260198, |
|
"loss": 5.7366, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 13.588754134509372, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.0007282249173098125, |
|
"loss": 5.7395, |
|
"step": 246500 |
|
}, |
|
{ |
|
"epoch": 13.616317530319735, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.0007276736493936053, |
|
"loss": 5.7366, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 13.6438809261301, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.000727122381477398, |
|
"loss": 5.7624, |
|
"step": 247500 |
|
}, |
|
{ |
|
"epoch": 13.671444321940463, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.0007265711135611908, |
|
"loss": 5.751, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 13.699007717750828, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.0007260198456449835, |
|
"loss": 5.7418, |
|
"step": 248500 |
|
}, |
|
{ |
|
"epoch": 13.72657111356119, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.0007254685777287761, |
|
"loss": 5.7474, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 13.754134509371555, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 0.0007249173098125689, |
|
"loss": 5.7553, |
|
"step": 249500 |
|
}, |
|
{ |
|
"epoch": 13.781697905181918, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.0007243660418963616, |
|
"loss": 5.7491, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 13.809261300992283, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 0.0007238147739801544, |
|
"loss": 5.7412, |
|
"step": 250500 |
|
}, |
|
{ |
|
"epoch": 13.836824696802646, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.0007232635060639471, |
|
"loss": 5.7366, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 13.86438809261301, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.0007227122381477398, |
|
"loss": 5.7245, |
|
"step": 251500 |
|
}, |
|
{ |
|
"epoch": 13.891951488423373, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.0007221609702315325, |
|
"loss": 5.7421, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 13.919514884233738, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.0007216097023153253, |
|
"loss": 5.7628, |
|
"step": 252500 |
|
}, |
|
{ |
|
"epoch": 13.947078280044101, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.000721058434399118, |
|
"loss": 5.7538, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 13.974641675854466, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 0.0007205071664829107, |
|
"loss": 5.7307, |
|
"step": 253500 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_loss": 5.6859917640686035, |
|
"eval_runtime": 3.4223, |
|
"eval_samples_per_second": 85.03, |
|
"eval_steps_per_second": 5.552, |
|
"step": 253960 |
|
}, |
|
{ |
|
"epoch": 14.002205071664829, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.0007199558985667035, |
|
"loss": 5.7491, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 14.029768467475193, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.0007194046306504961, |
|
"loss": 5.7406, |
|
"step": 254500 |
|
}, |
|
{ |
|
"epoch": 14.057331863285556, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.0007188533627342889, |
|
"loss": 5.7294, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 14.084895259095921, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.0007183020948180816, |
|
"loss": 5.7418, |
|
"step": 255500 |
|
}, |
|
{ |
|
"epoch": 14.112458654906284, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 0.0007177508269018743, |
|
"loss": 5.7267, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 14.140022050716649, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.0007171995589856671, |
|
"loss": 5.7439, |
|
"step": 256500 |
|
}, |
|
{ |
|
"epoch": 14.167585446527012, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.0007166482910694597, |
|
"loss": 5.7512, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 14.195148842337376, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.0007160970231532525, |
|
"loss": 5.7419, |
|
"step": 257500 |
|
}, |
|
{ |
|
"epoch": 14.22271223814774, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.0007155457552370453, |
|
"loss": 5.7306, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 14.250275633958104, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.0007149944873208379, |
|
"loss": 5.7509, |
|
"step": 258500 |
|
}, |
|
{ |
|
"epoch": 14.277839029768467, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.0007144432194046307, |
|
"loss": 5.7601, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 14.305402425578832, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.0007138919514884233, |
|
"loss": 5.7511, |
|
"step": 259500 |
|
}, |
|
{ |
|
"epoch": 14.332965821389195, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 0.0007133406835722161, |
|
"loss": 5.7343, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 14.36052921719956, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0007127894156560089, |
|
"loss": 5.7474, |
|
"step": 260500 |
|
}, |
|
{ |
|
"epoch": 14.388092613009922, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.0007122381477398015, |
|
"loss": 5.7431, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 14.415656008820287, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.0007116868798235943, |
|
"loss": 5.7377, |
|
"step": 261500 |
|
}, |
|
{ |
|
"epoch": 14.44321940463065, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.000711135611907387, |
|
"loss": 5.7433, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 14.470782800441015, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 0.0007105843439911797, |
|
"loss": 5.7341, |
|
"step": 262500 |
|
}, |
|
{ |
|
"epoch": 14.498346196251378, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.0007100330760749725, |
|
"loss": 5.7479, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 14.525909592061742, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.0007094818081587651, |
|
"loss": 5.7299, |
|
"step": 263500 |
|
}, |
|
{ |
|
"epoch": 14.553472987872105, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0007089305402425579, |
|
"loss": 5.7444, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 14.58103638368247, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.0007083792723263506, |
|
"loss": 5.7349, |
|
"step": 264500 |
|
}, |
|
{ |
|
"epoch": 14.608599779492833, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.0007078280044101433, |
|
"loss": 5.736, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 14.636163175303198, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.0007072767364939361, |
|
"loss": 5.7253, |
|
"step": 265500 |
|
}, |
|
{ |
|
"epoch": 14.66372657111356, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.0007067254685777288, |
|
"loss": 5.7351, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 14.691289966923925, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.0007061742006615215, |
|
"loss": 5.7491, |
|
"step": 266500 |
|
}, |
|
{ |
|
"epoch": 14.718853362734288, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.0007056229327453142, |
|
"loss": 5.7343, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 14.746416758544653, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.000705071664829107, |
|
"loss": 5.7302, |
|
"step": 267500 |
|
}, |
|
{ |
|
"epoch": 14.773980154355016, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.0007045203969128997, |
|
"loss": 5.7474, |
|
"step": 268000 |
|
}, |
|
{ |
|
"epoch": 14.80154355016538, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.0007039691289966924, |
|
"loss": 5.73, |
|
"step": 268500 |
|
}, |
|
{ |
|
"epoch": 14.829106945975743, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.0007034178610804851, |
|
"loss": 5.737, |
|
"step": 269000 |
|
}, |
|
{ |
|
"epoch": 14.856670341786108, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.0007028665931642778, |
|
"loss": 5.7352, |
|
"step": 269500 |
|
}, |
|
{ |
|
"epoch": 14.884233737596471, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.0007023153252480707, |
|
"loss": 5.7386, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 14.911797133406836, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.0007017640573318633, |
|
"loss": 5.7437, |
|
"step": 270500 |
|
}, |
|
{ |
|
"epoch": 14.9393605292172, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.000701212789415656, |
|
"loss": 5.7477, |
|
"step": 271000 |
|
}, |
|
{ |
|
"epoch": 14.966923925027563, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.0007006615214994488, |
|
"loss": 5.7491, |
|
"step": 271500 |
|
}, |
|
{ |
|
"epoch": 14.994487320837926, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0007001102535832414, |
|
"loss": 5.7383, |
|
"step": 272000 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_loss": 5.684328079223633, |
|
"eval_runtime": 3.4544, |
|
"eval_samples_per_second": 84.241, |
|
"eval_steps_per_second": 5.5, |
|
"step": 272100 |
|
}, |
|
{ |
|
"epoch": 15.022050716648291, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.0006995589856670343, |
|
"loss": 5.7559, |
|
"step": 272500 |
|
}, |
|
{ |
|
"epoch": 15.049614112458656, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.000699007717750827, |
|
"loss": 5.7207, |
|
"step": 273000 |
|
}, |
|
{ |
|
"epoch": 15.077177508269019, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.0006984564498346196, |
|
"loss": 5.7345, |
|
"step": 273500 |
|
}, |
|
{ |
|
"epoch": 15.104740904079383, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.0006979051819184124, |
|
"loss": 5.7125, |
|
"step": 274000 |
|
}, |
|
{ |
|
"epoch": 15.132304299889746, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.000697353914002205, |
|
"loss": 5.7131, |
|
"step": 274500 |
|
}, |
|
{ |
|
"epoch": 15.159867695700111, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.0006968026460859979, |
|
"loss": 5.7542, |
|
"step": 275000 |
|
}, |
|
{ |
|
"epoch": 15.187431091510474, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.0006962513781697906, |
|
"loss": 5.7501, |
|
"step": 275500 |
|
}, |
|
{ |
|
"epoch": 15.214994487320839, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.0006957001102535832, |
|
"loss": 5.7513, |
|
"step": 276000 |
|
}, |
|
{ |
|
"epoch": 15.242557883131202, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.000695148842337376, |
|
"loss": 5.7289, |
|
"step": 276500 |
|
}, |
|
{ |
|
"epoch": 15.270121278941566, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0006945975744211686, |
|
"loss": 5.7345, |
|
"step": 277000 |
|
}, |
|
{ |
|
"epoch": 15.29768467475193, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.0006940463065049615, |
|
"loss": 5.732, |
|
"step": 277500 |
|
}, |
|
{ |
|
"epoch": 15.325248070562294, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.0006934950385887542, |
|
"loss": 5.7349, |
|
"step": 278000 |
|
}, |
|
{ |
|
"epoch": 15.352811466372657, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.0006929437706725468, |
|
"loss": 5.7272, |
|
"step": 278500 |
|
}, |
|
{ |
|
"epoch": 15.380374862183022, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0006923925027563396, |
|
"loss": 5.7469, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 15.407938257993385, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0006918412348401323, |
|
"loss": 5.7411, |
|
"step": 279500 |
|
}, |
|
{ |
|
"epoch": 15.43550165380375, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0006912899669239251, |
|
"loss": 5.7562, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 15.463065049614112, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.0006907386990077178, |
|
"loss": 5.7441, |
|
"step": 280500 |
|
}, |
|
{ |
|
"epoch": 15.490628445424477, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0006901874310915105, |
|
"loss": 5.7325, |
|
"step": 281000 |
|
}, |
|
{ |
|
"epoch": 15.51819184123484, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 0.0006896361631753032, |
|
"loss": 5.7548, |
|
"step": 281500 |
|
}, |
|
{ |
|
"epoch": 15.545755237045205, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.0006890848952590959, |
|
"loss": 5.7274, |
|
"step": 282000 |
|
}, |
|
{ |
|
"epoch": 15.573318632855568, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0006885336273428888, |
|
"loss": 5.7324, |
|
"step": 282500 |
|
}, |
|
{ |
|
"epoch": 15.600882028665932, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 0.0006879823594266814, |
|
"loss": 5.7302, |
|
"step": 283000 |
|
}, |
|
{ |
|
"epoch": 15.628445424476295, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 0.0006874310915104741, |
|
"loss": 5.7339, |
|
"step": 283500 |
|
}, |
|
{ |
|
"epoch": 15.65600882028666, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.0006868798235942668, |
|
"loss": 5.7307, |
|
"step": 284000 |
|
}, |
|
{ |
|
"epoch": 15.683572216097023, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 0.0006863285556780595, |
|
"loss": 5.7437, |
|
"step": 284500 |
|
}, |
|
{ |
|
"epoch": 15.711135611907387, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.0006857772877618523, |
|
"loss": 5.7391, |
|
"step": 285000 |
|
}, |
|
{ |
|
"epoch": 15.73869900771775, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.000685226019845645, |
|
"loss": 5.7501, |
|
"step": 285500 |
|
}, |
|
{ |
|
"epoch": 15.766262403528115, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 0.0006846747519294377, |
|
"loss": 5.7547, |
|
"step": 286000 |
|
}, |
|
{ |
|
"epoch": 15.793825799338478, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.0006841234840132305, |
|
"loss": 5.7367, |
|
"step": 286500 |
|
}, |
|
{ |
|
"epoch": 15.821389195148843, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.0006835722160970231, |
|
"loss": 5.7488, |
|
"step": 287000 |
|
}, |
|
{ |
|
"epoch": 15.848952590959206, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0006830209481808159, |
|
"loss": 5.7372, |
|
"step": 287500 |
|
}, |
|
{ |
|
"epoch": 15.87651598676957, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0006824696802646086, |
|
"loss": 5.7485, |
|
"step": 288000 |
|
}, |
|
{ |
|
"epoch": 15.904079382579933, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 0.0006819184123484013, |
|
"loss": 5.7483, |
|
"step": 288500 |
|
}, |
|
{ |
|
"epoch": 15.931642778390298, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.0006813671444321941, |
|
"loss": 5.7407, |
|
"step": 289000 |
|
}, |
|
{ |
|
"epoch": 15.959206174200661, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0006808158765159867, |
|
"loss": 5.7061, |
|
"step": 289500 |
|
}, |
|
{ |
|
"epoch": 15.986769570011026, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.0006802646085997795, |
|
"loss": 5.7402, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_loss": 5.680827617645264, |
|
"eval_runtime": 3.4293, |
|
"eval_samples_per_second": 84.857, |
|
"eval_steps_per_second": 5.54, |
|
"step": 290240 |
|
}, |
|
{ |
|
"epoch": 16.01433296582139, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0006797133406835723, |
|
"loss": 5.7171, |
|
"step": 290500 |
|
}, |
|
{ |
|
"epoch": 16.041896361631753, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.0006791620727673649, |
|
"loss": 5.7378, |
|
"step": 291000 |
|
}, |
|
{ |
|
"epoch": 16.069459757442118, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.0006786108048511577, |
|
"loss": 5.7238, |
|
"step": 291500 |
|
}, |
|
{ |
|
"epoch": 16.09702315325248, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 0.0006780595369349503, |
|
"loss": 5.7174, |
|
"step": 292000 |
|
}, |
|
{ |
|
"epoch": 16.124586549062844, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.0006775082690187431, |
|
"loss": 5.7536, |
|
"step": 292500 |
|
}, |
|
{ |
|
"epoch": 16.15214994487321, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.0006769570011025359, |
|
"loss": 5.7398, |
|
"step": 293000 |
|
}, |
|
{ |
|
"epoch": 16.179713340683573, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.0006764057331863286, |
|
"loss": 5.7189, |
|
"step": 293500 |
|
}, |
|
{ |
|
"epoch": 16.207276736493935, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 0.0006758544652701213, |
|
"loss": 5.7359, |
|
"step": 294000 |
|
}, |
|
{ |
|
"epoch": 16.2348401323043, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.000675303197353914, |
|
"loss": 5.723, |
|
"step": 294500 |
|
}, |
|
{ |
|
"epoch": 16.262403528114664, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.0006747519294377067, |
|
"loss": 5.7319, |
|
"step": 295000 |
|
}, |
|
{ |
|
"epoch": 16.28996692392503, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.0006742006615214995, |
|
"loss": 5.7183, |
|
"step": 295500 |
|
}, |
|
{ |
|
"epoch": 16.31753031973539, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0006736493936052923, |
|
"loss": 5.7189, |
|
"step": 296000 |
|
}, |
|
{ |
|
"epoch": 16.345093715545755, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.0006730981256890849, |
|
"loss": 5.7545, |
|
"step": 296500 |
|
}, |
|
{ |
|
"epoch": 16.37265711135612, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.0006725468577728776, |
|
"loss": 5.7382, |
|
"step": 297000 |
|
}, |
|
{ |
|
"epoch": 16.400220507166484, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.0006719955898566703, |
|
"loss": 5.7488, |
|
"step": 297500 |
|
}, |
|
{ |
|
"epoch": 16.427783902976845, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 0.0006714443219404631, |
|
"loss": 5.7106, |
|
"step": 298000 |
|
}, |
|
{ |
|
"epoch": 16.45534729878721, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.0006708930540242559, |
|
"loss": 5.7385, |
|
"step": 298500 |
|
}, |
|
{ |
|
"epoch": 16.482910694597575, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.0006703417861080485, |
|
"loss": 5.7501, |
|
"step": 299000 |
|
}, |
|
{ |
|
"epoch": 16.51047409040794, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.0006697905181918412, |
|
"loss": 5.7298, |
|
"step": 299500 |
|
}, |
|
{ |
|
"epoch": 16.5380374862183, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.000669239250275634, |
|
"loss": 5.7651, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 16.565600882028665, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0006686879823594267, |
|
"loss": 5.7334, |
|
"step": 300500 |
|
}, |
|
{ |
|
"epoch": 16.59316427783903, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.0006681367144432195, |
|
"loss": 5.7408, |
|
"step": 301000 |
|
}, |
|
{ |
|
"epoch": 16.620727673649395, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.0006675854465270121, |
|
"loss": 5.7507, |
|
"step": 301500 |
|
}, |
|
{ |
|
"epoch": 16.64829106945976, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.0006670341786108048, |
|
"loss": 5.7275, |
|
"step": 302000 |
|
}, |
|
{ |
|
"epoch": 16.67585446527012, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.0006664829106945976, |
|
"loss": 5.7235, |
|
"step": 302500 |
|
}, |
|
{ |
|
"epoch": 16.703417861080485, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.0006659316427783903, |
|
"loss": 5.7495, |
|
"step": 303000 |
|
}, |
|
{ |
|
"epoch": 16.73098125689085, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.0006653803748621831, |
|
"loss": 5.7496, |
|
"step": 303500 |
|
}, |
|
{ |
|
"epoch": 16.75854465270121, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 0.0006648291069459758, |
|
"loss": 5.7355, |
|
"step": 304000 |
|
}, |
|
{ |
|
"epoch": 16.786108048511576, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0006642778390297684, |
|
"loss": 5.7187, |
|
"step": 304500 |
|
}, |
|
{ |
|
"epoch": 16.81367144432194, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.0006637265711135612, |
|
"loss": 5.7352, |
|
"step": 305000 |
|
}, |
|
{ |
|
"epoch": 16.841234840132305, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.000663175303197354, |
|
"loss": 5.7352, |
|
"step": 305500 |
|
}, |
|
{ |
|
"epoch": 16.86879823594267, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.0006626240352811467, |
|
"loss": 5.7216, |
|
"step": 306000 |
|
}, |
|
{ |
|
"epoch": 16.89636163175303, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0006620727673649394, |
|
"loss": 5.735, |
|
"step": 306500 |
|
}, |
|
{ |
|
"epoch": 16.923925027563396, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.000661521499448732, |
|
"loss": 5.7457, |
|
"step": 307000 |
|
}, |
|
{ |
|
"epoch": 16.95148842337376, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.0006609702315325248, |
|
"loss": 5.7256, |
|
"step": 307500 |
|
}, |
|
{ |
|
"epoch": 16.979051819184125, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0006604189636163176, |
|
"loss": 5.724, |
|
"step": 308000 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_loss": 5.677351951599121, |
|
"eval_runtime": 3.5434, |
|
"eval_samples_per_second": 82.125, |
|
"eval_steps_per_second": 5.362, |
|
"step": 308380 |
|
}, |
|
{ |
|
"epoch": 17.006615214994486, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.0006598676957001103, |
|
"loss": 5.7348, |
|
"step": 308500 |
|
}, |
|
{ |
|
"epoch": 17.03417861080485, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.000659316427783903, |
|
"loss": 5.7402, |
|
"step": 309000 |
|
}, |
|
{ |
|
"epoch": 17.061742006615216, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 0.0006587651598676956, |
|
"loss": 5.7306, |
|
"step": 309500 |
|
}, |
|
{ |
|
"epoch": 17.08930540242558, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 0.0006582138919514884, |
|
"loss": 5.7248, |
|
"step": 310000 |
|
}, |
|
{ |
|
"epoch": 17.11686879823594, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.0006576626240352812, |
|
"loss": 5.7285, |
|
"step": 310500 |
|
}, |
|
{ |
|
"epoch": 17.144432194046306, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.0006571113561190739, |
|
"loss": 5.7293, |
|
"step": 311000 |
|
}, |
|
{ |
|
"epoch": 17.17199558985667, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.0006565600882028666, |
|
"loss": 5.7139, |
|
"step": 311500 |
|
}, |
|
{ |
|
"epoch": 17.199558985667036, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.0006560088202866593, |
|
"loss": 5.726, |
|
"step": 312000 |
|
}, |
|
{ |
|
"epoch": 17.227122381477397, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.000655457552370452, |
|
"loss": 5.731, |
|
"step": 312500 |
|
}, |
|
{ |
|
"epoch": 17.25468577728776, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 0.0006549062844542448, |
|
"loss": 5.7353, |
|
"step": 313000 |
|
}, |
|
{ |
|
"epoch": 17.282249173098126, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.0006543550165380376, |
|
"loss": 5.737, |
|
"step": 313500 |
|
}, |
|
{ |
|
"epoch": 17.30981256890849, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 0.0006538037486218302, |
|
"loss": 5.7319, |
|
"step": 314000 |
|
}, |
|
{ |
|
"epoch": 17.337375964718852, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.000653252480705623, |
|
"loss": 5.7445, |
|
"step": 314500 |
|
}, |
|
{ |
|
"epoch": 17.364939360529217, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.0006527012127894156, |
|
"loss": 5.7415, |
|
"step": 315000 |
|
}, |
|
{ |
|
"epoch": 17.39250275633958, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.0006521499448732084, |
|
"loss": 5.7488, |
|
"step": 315500 |
|
}, |
|
{ |
|
"epoch": 17.420066152149946, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.0006515986769570012, |
|
"loss": 5.7302, |
|
"step": 316000 |
|
}, |
|
{ |
|
"epoch": 17.447629547960307, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.0006510474090407938, |
|
"loss": 5.7229, |
|
"step": 316500 |
|
}, |
|
{ |
|
"epoch": 17.475192943770672, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.0006504961411245866, |
|
"loss": 5.7164, |
|
"step": 317000 |
|
}, |
|
{ |
|
"epoch": 17.502756339581037, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0006499448732083793, |
|
"loss": 5.7286, |
|
"step": 317500 |
|
}, |
|
{ |
|
"epoch": 17.5303197353914, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.000649393605292172, |
|
"loss": 5.7334, |
|
"step": 318000 |
|
}, |
|
{ |
|
"epoch": 17.557883131201763, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.0006488423373759648, |
|
"loss": 5.7309, |
|
"step": 318500 |
|
}, |
|
{ |
|
"epoch": 17.585446527012127, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.0006482910694597574, |
|
"loss": 5.7252, |
|
"step": 319000 |
|
}, |
|
{ |
|
"epoch": 17.613009922822492, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0006477398015435502, |
|
"loss": 5.7413, |
|
"step": 319500 |
|
}, |
|
{ |
|
"epoch": 17.640573318632857, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 0.0006471885336273429, |
|
"loss": 5.732, |
|
"step": 320000 |
|
}, |
|
{ |
|
"epoch": 17.668136714443218, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.0006466372657111356, |
|
"loss": 5.7131, |
|
"step": 320500 |
|
}, |
|
{ |
|
"epoch": 17.695700110253583, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.0006460859977949284, |
|
"loss": 5.7292, |
|
"step": 321000 |
|
}, |
|
{ |
|
"epoch": 17.723263506063947, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.0006455347298787211, |
|
"loss": 5.7196, |
|
"step": 321500 |
|
}, |
|
{ |
|
"epoch": 17.750826901874312, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.0006449834619625138, |
|
"loss": 5.751, |
|
"step": 322000 |
|
}, |
|
{ |
|
"epoch": 17.778390297684673, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.0006444321940463065, |
|
"loss": 5.7218, |
|
"step": 322500 |
|
}, |
|
{ |
|
"epoch": 17.805953693495038, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.0006438809261300991, |
|
"loss": 5.7297, |
|
"step": 323000 |
|
}, |
|
{ |
|
"epoch": 17.833517089305403, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.000643329658213892, |
|
"loss": 5.7535, |
|
"step": 323500 |
|
}, |
|
{ |
|
"epoch": 17.861080485115767, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.0006427783902976847, |
|
"loss": 5.7217, |
|
"step": 324000 |
|
}, |
|
{ |
|
"epoch": 17.88864388092613, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.0006422271223814774, |
|
"loss": 5.7584, |
|
"step": 324500 |
|
}, |
|
{ |
|
"epoch": 17.916207276736493, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0006416758544652701, |
|
"loss": 5.7269, |
|
"step": 325000 |
|
}, |
|
{ |
|
"epoch": 17.943770672546858, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0006411245865490628, |
|
"loss": 5.7163, |
|
"step": 325500 |
|
}, |
|
{ |
|
"epoch": 17.971334068357223, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.0006405733186328556, |
|
"loss": 5.7437, |
|
"step": 326000 |
|
}, |
|
{ |
|
"epoch": 17.998897464167584, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.0006400220507166483, |
|
"loss": 5.7127, |
|
"step": 326500 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_loss": 5.67610502243042, |
|
"eval_runtime": 3.4429, |
|
"eval_samples_per_second": 84.521, |
|
"eval_steps_per_second": 5.519, |
|
"step": 326520 |
|
}, |
|
{ |
|
"epoch": 18.02646085997795, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.0006394707828004411, |
|
"loss": 5.7263, |
|
"step": 327000 |
|
}, |
|
{ |
|
"epoch": 18.054024255788313, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.0006389195148842337, |
|
"loss": 5.729, |
|
"step": 327500 |
|
}, |
|
{ |
|
"epoch": 18.081587651598678, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 0.0006383682469680264, |
|
"loss": 5.716, |
|
"step": 328000 |
|
}, |
|
{ |
|
"epoch": 18.10915104740904, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.0006378169790518193, |
|
"loss": 5.7466, |
|
"step": 328500 |
|
}, |
|
{ |
|
"epoch": 18.136714443219404, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.0006372657111356119, |
|
"loss": 5.729, |
|
"step": 329000 |
|
}, |
|
{ |
|
"epoch": 18.16427783902977, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.0006367144432194047, |
|
"loss": 5.7284, |
|
"step": 329500 |
|
}, |
|
{ |
|
"epoch": 18.191841234840133, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.0006361631753031973, |
|
"loss": 5.7361, |
|
"step": 330000 |
|
}, |
|
{ |
|
"epoch": 18.219404630650494, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00063561190738699, |
|
"loss": 5.7429, |
|
"step": 330500 |
|
}, |
|
{ |
|
"epoch": 18.24696802646086, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0006350606394707829, |
|
"loss": 5.7463, |
|
"step": 331000 |
|
}, |
|
{ |
|
"epoch": 18.274531422271224, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0006345093715545755, |
|
"loss": 5.7233, |
|
"step": 331500 |
|
}, |
|
{ |
|
"epoch": 18.30209481808159, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 0.0006339581036383683, |
|
"loss": 5.7377, |
|
"step": 332000 |
|
}, |
|
{ |
|
"epoch": 18.329658213891953, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.000633406835722161, |
|
"loss": 5.7343, |
|
"step": 332500 |
|
}, |
|
{ |
|
"epoch": 18.357221609702314, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.0006328555678059536, |
|
"loss": 5.7412, |
|
"step": 333000 |
|
}, |
|
{ |
|
"epoch": 18.38478500551268, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.0006323042998897465, |
|
"loss": 5.7272, |
|
"step": 333500 |
|
}, |
|
{ |
|
"epoch": 18.412348401323044, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.0006317530319735391, |
|
"loss": 5.7074, |
|
"step": 334000 |
|
}, |
|
{ |
|
"epoch": 18.43991179713341, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.0006312017640573319, |
|
"loss": 5.7206, |
|
"step": 334500 |
|
}, |
|
{ |
|
"epoch": 18.46747519294377, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.0006306504961411246, |
|
"loss": 5.6941, |
|
"step": 335000 |
|
}, |
|
{ |
|
"epoch": 18.495038588754134, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.0006300992282249172, |
|
"loss": 5.7386, |
|
"step": 335500 |
|
}, |
|
{ |
|
"epoch": 18.5226019845645, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.0006295479603087101, |
|
"loss": 5.7343, |
|
"step": 336000 |
|
}, |
|
{ |
|
"epoch": 18.550165380374864, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.0006289966923925028, |
|
"loss": 5.7212, |
|
"step": 336500 |
|
}, |
|
{ |
|
"epoch": 18.577728776185225, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0006284454244762955, |
|
"loss": 5.7416, |
|
"step": 337000 |
|
}, |
|
{ |
|
"epoch": 18.60529217199559, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.0006278941565600882, |
|
"loss": 5.7385, |
|
"step": 337500 |
|
}, |
|
{ |
|
"epoch": 18.632855567805954, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0006273428886438809, |
|
"loss": 5.7053, |
|
"step": 338000 |
|
}, |
|
{ |
|
"epoch": 18.66041896361632, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 0.0006267916207276737, |
|
"loss": 5.742, |
|
"step": 338500 |
|
}, |
|
{ |
|
"epoch": 18.68798235942668, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.0006262403528114664, |
|
"loss": 5.7315, |
|
"step": 339000 |
|
}, |
|
{ |
|
"epoch": 18.715545755237045, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.0006256890848952591, |
|
"loss": 5.7446, |
|
"step": 339500 |
|
}, |
|
{ |
|
"epoch": 18.74310915104741, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.0006251378169790518, |
|
"loss": 5.723, |
|
"step": 340000 |
|
}, |
|
{ |
|
"epoch": 18.770672546857774, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.0006245865490628446, |
|
"loss": 5.7278, |
|
"step": 340500 |
|
}, |
|
{ |
|
"epoch": 18.798235942668136, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.0006240352811466373, |
|
"loss": 5.7344, |
|
"step": 341000 |
|
}, |
|
{ |
|
"epoch": 18.8257993384785, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.00062348401323043, |
|
"loss": 5.7251, |
|
"step": 341500 |
|
}, |
|
{ |
|
"epoch": 18.853362734288865, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0006229327453142228, |
|
"loss": 5.7415, |
|
"step": 342000 |
|
}, |
|
{ |
|
"epoch": 18.88092613009923, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 0.0006223814773980154, |
|
"loss": 5.7231, |
|
"step": 342500 |
|
}, |
|
{ |
|
"epoch": 18.90848952590959, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.0006218302094818082, |
|
"loss": 5.7078, |
|
"step": 343000 |
|
}, |
|
{ |
|
"epoch": 18.936052921719956, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0006212789415656009, |
|
"loss": 5.7255, |
|
"step": 343500 |
|
}, |
|
{ |
|
"epoch": 18.96361631753032, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.0006207276736493936, |
|
"loss": 5.7394, |
|
"step": 344000 |
|
}, |
|
{ |
|
"epoch": 18.991179713340685, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.0006201764057331864, |
|
"loss": 5.7248, |
|
"step": 344500 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_loss": 5.676168918609619, |
|
"eval_runtime": 3.4816, |
|
"eval_samples_per_second": 83.582, |
|
"eval_steps_per_second": 5.457, |
|
"step": 344660 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 907000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1404914590783722e+20, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|