cryptom's picture
Upload alpaca-lora-based-origin-llama7b with huggingface_hub
c908b27
{
"best_metric": 0.636846125125885,
"best_model_checkpoint": "lora-alpaca-cn/checkpoint-12600",
"epoch": 2.978723404255319,
"global_step": 12600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 5.9999999999999995e-05,
"loss": 1.7735,
"step": 20
},
{
"epoch": 0.01,
"learning_rate": 0.00011999999999999999,
"loss": 1.1358,
"step": 40
},
{
"epoch": 0.01,
"learning_rate": 0.00017999999999999998,
"loss": 0.9749,
"step": 60
},
{
"epoch": 0.02,
"learning_rate": 0.00023999999999999998,
"loss": 0.9316,
"step": 80
},
{
"epoch": 0.02,
"learning_rate": 0.0003,
"loss": 0.9072,
"step": 100
},
{
"epoch": 0.03,
"learning_rate": 0.0002995234312946783,
"loss": 0.8963,
"step": 120
},
{
"epoch": 0.03,
"learning_rate": 0.0002990468625893566,
"loss": 0.8853,
"step": 140
},
{
"epoch": 0.04,
"learning_rate": 0.0002985702938840349,
"loss": 0.8709,
"step": 160
},
{
"epoch": 0.04,
"learning_rate": 0.00029809372517871323,
"loss": 0.8555,
"step": 180
},
{
"epoch": 0.05,
"learning_rate": 0.00029761715647339156,
"loss": 0.8584,
"step": 200
},
{
"epoch": 0.05,
"eval_loss": 0.8360834717750549,
"eval_runtime": 49.29,
"eval_samples_per_second": 40.576,
"eval_steps_per_second": 2.536,
"step": 200
},
{
"epoch": 0.05,
"learning_rate": 0.0002971405877680699,
"loss": 0.859,
"step": 220
},
{
"epoch": 0.06,
"learning_rate": 0.00029666401906274816,
"loss": 0.8511,
"step": 240
},
{
"epoch": 0.06,
"learning_rate": 0.0002961874503574265,
"loss": 0.8401,
"step": 260
},
{
"epoch": 0.07,
"learning_rate": 0.0002957108816521048,
"loss": 0.8357,
"step": 280
},
{
"epoch": 0.07,
"learning_rate": 0.00029523431294678314,
"loss": 0.8413,
"step": 300
},
{
"epoch": 0.08,
"learning_rate": 0.00029475774424146147,
"loss": 0.8283,
"step": 320
},
{
"epoch": 0.08,
"learning_rate": 0.0002942811755361398,
"loss": 0.8202,
"step": 340
},
{
"epoch": 0.09,
"learning_rate": 0.00029380460683081807,
"loss": 0.8222,
"step": 360
},
{
"epoch": 0.09,
"learning_rate": 0.0002933280381254964,
"loss": 0.8178,
"step": 380
},
{
"epoch": 0.09,
"learning_rate": 0.0002928514694201747,
"loss": 0.8177,
"step": 400
},
{
"epoch": 0.09,
"eval_loss": 0.7966175079345703,
"eval_runtime": 49.1752,
"eval_samples_per_second": 40.671,
"eval_steps_per_second": 2.542,
"step": 400
},
{
"epoch": 0.1,
"learning_rate": 0.00029237490071485305,
"loss": 0.8057,
"step": 420
},
{
"epoch": 0.1,
"learning_rate": 0.0002918983320095314,
"loss": 0.811,
"step": 440
},
{
"epoch": 0.11,
"learning_rate": 0.00029142176330420965,
"loss": 0.8056,
"step": 460
},
{
"epoch": 0.11,
"learning_rate": 0.000290945194598888,
"loss": 0.7993,
"step": 480
},
{
"epoch": 0.12,
"learning_rate": 0.0002904686258935663,
"loss": 0.7982,
"step": 500
},
{
"epoch": 0.12,
"learning_rate": 0.0002899920571882446,
"loss": 0.8023,
"step": 520
},
{
"epoch": 0.13,
"learning_rate": 0.00028951548848292296,
"loss": 0.7968,
"step": 540
},
{
"epoch": 0.13,
"learning_rate": 0.00028903891977760123,
"loss": 0.8029,
"step": 560
},
{
"epoch": 0.14,
"learning_rate": 0.00028856235107227956,
"loss": 0.7892,
"step": 580
},
{
"epoch": 0.14,
"learning_rate": 0.0002880857823669579,
"loss": 0.7946,
"step": 600
},
{
"epoch": 0.14,
"eval_loss": 0.7735009789466858,
"eval_runtime": 49.3305,
"eval_samples_per_second": 40.543,
"eval_steps_per_second": 2.534,
"step": 600
},
{
"epoch": 0.15,
"learning_rate": 0.00028760921366163616,
"loss": 0.782,
"step": 620
},
{
"epoch": 0.15,
"learning_rate": 0.0002871326449563145,
"loss": 0.7799,
"step": 640
},
{
"epoch": 0.16,
"learning_rate": 0.0002866560762509928,
"loss": 0.7782,
"step": 660
},
{
"epoch": 0.16,
"learning_rate": 0.00028617950754567114,
"loss": 0.7785,
"step": 680
},
{
"epoch": 0.17,
"learning_rate": 0.00028570293884034947,
"loss": 0.785,
"step": 700
},
{
"epoch": 0.17,
"learning_rate": 0.0002852263701350278,
"loss": 0.7754,
"step": 720
},
{
"epoch": 0.17,
"learning_rate": 0.00028474980142970607,
"loss": 0.7804,
"step": 740
},
{
"epoch": 0.18,
"learning_rate": 0.0002842732327243844,
"loss": 0.7696,
"step": 760
},
{
"epoch": 0.18,
"learning_rate": 0.0002837966640190627,
"loss": 0.7692,
"step": 780
},
{
"epoch": 0.19,
"learning_rate": 0.00028332009531374105,
"loss": 0.7752,
"step": 800
},
{
"epoch": 0.19,
"eval_loss": 0.7564254403114319,
"eval_runtime": 49.106,
"eval_samples_per_second": 40.728,
"eval_steps_per_second": 2.546,
"step": 800
},
{
"epoch": 0.19,
"learning_rate": 0.0002828435266084194,
"loss": 0.7698,
"step": 820
},
{
"epoch": 0.2,
"learning_rate": 0.00028236695790309765,
"loss": 0.7699,
"step": 840
},
{
"epoch": 0.2,
"learning_rate": 0.000281890389197776,
"loss": 0.7718,
"step": 860
},
{
"epoch": 0.21,
"learning_rate": 0.0002814138204924543,
"loss": 0.7644,
"step": 880
},
{
"epoch": 0.21,
"learning_rate": 0.00028093725178713263,
"loss": 0.7659,
"step": 900
},
{
"epoch": 0.22,
"learning_rate": 0.00028046068308181096,
"loss": 0.7641,
"step": 920
},
{
"epoch": 0.22,
"learning_rate": 0.00027998411437648923,
"loss": 0.7535,
"step": 940
},
{
"epoch": 0.23,
"learning_rate": 0.00027950754567116756,
"loss": 0.7672,
"step": 960
},
{
"epoch": 0.23,
"learning_rate": 0.0002790309769658459,
"loss": 0.7563,
"step": 980
},
{
"epoch": 0.24,
"learning_rate": 0.0002785544082605242,
"loss": 0.752,
"step": 1000
},
{
"epoch": 0.24,
"eval_loss": 0.7433652281761169,
"eval_runtime": 48.9945,
"eval_samples_per_second": 40.821,
"eval_steps_per_second": 2.551,
"step": 1000
},
{
"epoch": 0.24,
"learning_rate": 0.00027807783955520254,
"loss": 0.755,
"step": 1020
},
{
"epoch": 0.25,
"learning_rate": 0.00027760127084988087,
"loss": 0.7563,
"step": 1040
},
{
"epoch": 0.25,
"learning_rate": 0.00027712470214455914,
"loss": 0.7475,
"step": 1060
},
{
"epoch": 0.26,
"learning_rate": 0.00027664813343923747,
"loss": 0.7599,
"step": 1080
},
{
"epoch": 0.26,
"learning_rate": 0.0002761715647339158,
"loss": 0.7533,
"step": 1100
},
{
"epoch": 0.26,
"learning_rate": 0.00027569499602859407,
"loss": 0.7488,
"step": 1120
},
{
"epoch": 0.27,
"learning_rate": 0.00027521842732327245,
"loss": 0.753,
"step": 1140
},
{
"epoch": 0.27,
"learning_rate": 0.0002747418586179507,
"loss": 0.7435,
"step": 1160
},
{
"epoch": 0.28,
"learning_rate": 0.00027426528991262905,
"loss": 0.7457,
"step": 1180
},
{
"epoch": 0.28,
"learning_rate": 0.0002737887212073074,
"loss": 0.742,
"step": 1200
},
{
"epoch": 0.28,
"eval_loss": 0.7321739792823792,
"eval_runtime": 48.8876,
"eval_samples_per_second": 40.91,
"eval_steps_per_second": 2.557,
"step": 1200
},
{
"epoch": 0.29,
"learning_rate": 0.00027331215250198565,
"loss": 0.7474,
"step": 1220
},
{
"epoch": 0.29,
"learning_rate": 0.000272835583796664,
"loss": 0.7456,
"step": 1240
},
{
"epoch": 0.3,
"learning_rate": 0.0002723590150913423,
"loss": 0.7406,
"step": 1260
},
{
"epoch": 0.3,
"learning_rate": 0.00027188244638602063,
"loss": 0.7448,
"step": 1280
},
{
"epoch": 0.31,
"learning_rate": 0.00027140587768069896,
"loss": 0.7445,
"step": 1300
},
{
"epoch": 0.31,
"learning_rate": 0.00027092930897537723,
"loss": 0.7349,
"step": 1320
},
{
"epoch": 0.32,
"learning_rate": 0.00027045274027005556,
"loss": 0.7395,
"step": 1340
},
{
"epoch": 0.32,
"learning_rate": 0.0002699761715647339,
"loss": 0.7382,
"step": 1360
},
{
"epoch": 0.33,
"learning_rate": 0.0002694996028594122,
"loss": 0.7357,
"step": 1380
},
{
"epoch": 0.33,
"learning_rate": 0.00026902303415409054,
"loss": 0.7409,
"step": 1400
},
{
"epoch": 0.33,
"eval_loss": 0.7235888242721558,
"eval_runtime": 49.2145,
"eval_samples_per_second": 40.638,
"eval_steps_per_second": 2.54,
"step": 1400
},
{
"epoch": 0.34,
"learning_rate": 0.00026854646544876887,
"loss": 0.7376,
"step": 1420
},
{
"epoch": 0.34,
"learning_rate": 0.00026806989674344714,
"loss": 0.7298,
"step": 1440
},
{
"epoch": 0.35,
"learning_rate": 0.00026759332803812547,
"loss": 0.7379,
"step": 1460
},
{
"epoch": 0.35,
"learning_rate": 0.0002671167593328038,
"loss": 0.7354,
"step": 1480
},
{
"epoch": 0.35,
"learning_rate": 0.0002666401906274821,
"loss": 0.7341,
"step": 1500
},
{
"epoch": 0.36,
"learning_rate": 0.00026616362192216045,
"loss": 0.7352,
"step": 1520
},
{
"epoch": 0.36,
"learning_rate": 0.0002656870532168387,
"loss": 0.7321,
"step": 1540
},
{
"epoch": 0.37,
"learning_rate": 0.00026521048451151705,
"loss": 0.7285,
"step": 1560
},
{
"epoch": 0.37,
"learning_rate": 0.0002647339158061954,
"loss": 0.73,
"step": 1580
},
{
"epoch": 0.38,
"learning_rate": 0.00026425734710087365,
"loss": 0.7304,
"step": 1600
},
{
"epoch": 0.38,
"eval_loss": 0.716058611869812,
"eval_runtime": 48.9201,
"eval_samples_per_second": 40.883,
"eval_steps_per_second": 2.555,
"step": 1600
},
{
"epoch": 0.38,
"learning_rate": 0.00026378077839555203,
"loss": 0.7314,
"step": 1620
},
{
"epoch": 0.39,
"learning_rate": 0.0002633042096902303,
"loss": 0.7315,
"step": 1640
},
{
"epoch": 0.39,
"learning_rate": 0.00026282764098490863,
"loss": 0.7239,
"step": 1660
},
{
"epoch": 0.4,
"learning_rate": 0.00026235107227958696,
"loss": 0.73,
"step": 1680
},
{
"epoch": 0.4,
"learning_rate": 0.00026187450357426523,
"loss": 0.7243,
"step": 1700
},
{
"epoch": 0.41,
"learning_rate": 0.00026139793486894356,
"loss": 0.7199,
"step": 1720
},
{
"epoch": 0.41,
"learning_rate": 0.0002609213661636219,
"loss": 0.7216,
"step": 1740
},
{
"epoch": 0.42,
"learning_rate": 0.0002604447974583002,
"loss": 0.7358,
"step": 1760
},
{
"epoch": 0.42,
"learning_rate": 0.00025996822875297854,
"loss": 0.7313,
"step": 1780
},
{
"epoch": 0.43,
"learning_rate": 0.00025949166004765687,
"loss": 0.7236,
"step": 1800
},
{
"epoch": 0.43,
"eval_loss": 0.7097632884979248,
"eval_runtime": 49.4908,
"eval_samples_per_second": 40.412,
"eval_steps_per_second": 2.526,
"step": 1800
},
{
"epoch": 0.43,
"learning_rate": 0.00025901509134233514,
"loss": 0.7282,
"step": 1820
},
{
"epoch": 0.43,
"learning_rate": 0.00025853852263701347,
"loss": 0.7187,
"step": 1840
},
{
"epoch": 0.44,
"learning_rate": 0.0002580619539316918,
"loss": 0.7303,
"step": 1860
},
{
"epoch": 0.44,
"learning_rate": 0.0002575853852263701,
"loss": 0.724,
"step": 1880
},
{
"epoch": 0.45,
"learning_rate": 0.00025710881652104845,
"loss": 0.7248,
"step": 1900
},
{
"epoch": 0.45,
"learning_rate": 0.0002566322478157267,
"loss": 0.7195,
"step": 1920
},
{
"epoch": 0.46,
"learning_rate": 0.00025615567911040505,
"loss": 0.7269,
"step": 1940
},
{
"epoch": 0.46,
"learning_rate": 0.0002556791104050834,
"loss": 0.7209,
"step": 1960
},
{
"epoch": 0.47,
"learning_rate": 0.0002552025416997617,
"loss": 0.7282,
"step": 1980
},
{
"epoch": 0.47,
"learning_rate": 0.00025472597299444003,
"loss": 0.7195,
"step": 2000
},
{
"epoch": 0.47,
"eval_loss": 0.7037709355354309,
"eval_runtime": 49.7167,
"eval_samples_per_second": 40.228,
"eval_steps_per_second": 2.514,
"step": 2000
},
{
"epoch": 0.48,
"learning_rate": 0.0002542494042891183,
"loss": 0.7229,
"step": 2020
},
{
"epoch": 0.48,
"learning_rate": 0.00025377283558379664,
"loss": 0.718,
"step": 2040
},
{
"epoch": 0.49,
"learning_rate": 0.00025329626687847496,
"loss": 0.7223,
"step": 2060
},
{
"epoch": 0.49,
"learning_rate": 0.00025281969817315324,
"loss": 0.7209,
"step": 2080
},
{
"epoch": 0.5,
"learning_rate": 0.0002523431294678316,
"loss": 0.7151,
"step": 2100
},
{
"epoch": 0.5,
"learning_rate": 0.0002518665607625099,
"loss": 0.7141,
"step": 2120
},
{
"epoch": 0.51,
"learning_rate": 0.0002513899920571882,
"loss": 0.7084,
"step": 2140
},
{
"epoch": 0.51,
"learning_rate": 0.00025091342335186654,
"loss": 0.7075,
"step": 2160
},
{
"epoch": 0.52,
"learning_rate": 0.00025043685464654487,
"loss": 0.7133,
"step": 2180
},
{
"epoch": 0.52,
"learning_rate": 0.00024996028594122314,
"loss": 0.7092,
"step": 2200
},
{
"epoch": 0.52,
"eval_loss": 0.6989386677742004,
"eval_runtime": 49.2344,
"eval_samples_per_second": 40.622,
"eval_steps_per_second": 2.539,
"step": 2200
},
{
"epoch": 0.52,
"learning_rate": 0.0002494837172359015,
"loss": 0.7178,
"step": 2220
},
{
"epoch": 0.53,
"learning_rate": 0.0002490071485305798,
"loss": 0.7188,
"step": 2240
},
{
"epoch": 0.53,
"learning_rate": 0.0002485305798252581,
"loss": 0.7161,
"step": 2260
},
{
"epoch": 0.54,
"learning_rate": 0.00024805401111993645,
"loss": 0.7078,
"step": 2280
},
{
"epoch": 0.54,
"learning_rate": 0.0002475774424146147,
"loss": 0.7,
"step": 2300
},
{
"epoch": 0.55,
"learning_rate": 0.00024710087370929305,
"loss": 0.718,
"step": 2320
},
{
"epoch": 0.55,
"learning_rate": 0.0002466243050039714,
"loss": 0.7059,
"step": 2340
},
{
"epoch": 0.56,
"learning_rate": 0.0002461477362986497,
"loss": 0.712,
"step": 2360
},
{
"epoch": 0.56,
"learning_rate": 0.00024567116759332804,
"loss": 0.7116,
"step": 2380
},
{
"epoch": 0.57,
"learning_rate": 0.0002451945988880063,
"loss": 0.6986,
"step": 2400
},
{
"epoch": 0.57,
"eval_loss": 0.6939737796783447,
"eval_runtime": 49.459,
"eval_samples_per_second": 40.438,
"eval_steps_per_second": 2.527,
"step": 2400
},
{
"epoch": 0.57,
"learning_rate": 0.00024471803018268464,
"loss": 0.7168,
"step": 2420
},
{
"epoch": 0.58,
"learning_rate": 0.00024424146147736296,
"loss": 0.7141,
"step": 2440
},
{
"epoch": 0.58,
"learning_rate": 0.00024376489277204126,
"loss": 0.7095,
"step": 2460
},
{
"epoch": 0.59,
"learning_rate": 0.00024328832406671962,
"loss": 0.7091,
"step": 2480
},
{
"epoch": 0.59,
"learning_rate": 0.00024281175536139792,
"loss": 0.7015,
"step": 2500
},
{
"epoch": 0.6,
"learning_rate": 0.00024233518665607622,
"loss": 0.7109,
"step": 2520
},
{
"epoch": 0.6,
"learning_rate": 0.00024185861795075455,
"loss": 0.7086,
"step": 2540
},
{
"epoch": 0.61,
"learning_rate": 0.00024138204924543285,
"loss": 0.7118,
"step": 2560
},
{
"epoch": 0.61,
"learning_rate": 0.00024090548054011117,
"loss": 0.7033,
"step": 2580
},
{
"epoch": 0.61,
"learning_rate": 0.0002404289118347895,
"loss": 0.7128,
"step": 2600
},
{
"epoch": 0.61,
"eval_loss": 0.6901652812957764,
"eval_runtime": 49.5038,
"eval_samples_per_second": 40.401,
"eval_steps_per_second": 2.525,
"step": 2600
},
{
"epoch": 0.62,
"learning_rate": 0.00023995234312946783,
"loss": 0.6968,
"step": 2620
},
{
"epoch": 0.62,
"learning_rate": 0.00023947577442414613,
"loss": 0.7109,
"step": 2640
},
{
"epoch": 0.63,
"learning_rate": 0.00023899920571882443,
"loss": 0.7048,
"step": 2660
},
{
"epoch": 0.63,
"learning_rate": 0.00023852263701350276,
"loss": 0.7012,
"step": 2680
},
{
"epoch": 0.64,
"learning_rate": 0.00023804606830818106,
"loss": 0.7065,
"step": 2700
},
{
"epoch": 0.64,
"learning_rate": 0.0002375694996028594,
"loss": 0.7009,
"step": 2720
},
{
"epoch": 0.65,
"learning_rate": 0.0002370929308975377,
"loss": 0.7035,
"step": 2740
},
{
"epoch": 0.65,
"learning_rate": 0.00023661636219221604,
"loss": 0.6973,
"step": 2760
},
{
"epoch": 0.66,
"learning_rate": 0.00023613979348689434,
"loss": 0.7075,
"step": 2780
},
{
"epoch": 0.66,
"learning_rate": 0.00023566322478157264,
"loss": 0.6952,
"step": 2800
},
{
"epoch": 0.66,
"eval_loss": 0.6865400671958923,
"eval_runtime": 49.2814,
"eval_samples_per_second": 40.583,
"eval_steps_per_second": 2.536,
"step": 2800
},
{
"epoch": 0.67,
"learning_rate": 0.00023518665607625097,
"loss": 0.6979,
"step": 2820
},
{
"epoch": 0.67,
"learning_rate": 0.0002347100873709293,
"loss": 0.6973,
"step": 2840
},
{
"epoch": 0.68,
"learning_rate": 0.00023423351866560762,
"loss": 0.7033,
"step": 2860
},
{
"epoch": 0.68,
"learning_rate": 0.00023375694996028592,
"loss": 0.6964,
"step": 2880
},
{
"epoch": 0.69,
"learning_rate": 0.00023328038125496422,
"loss": 0.7052,
"step": 2900
},
{
"epoch": 0.69,
"learning_rate": 0.00023280381254964255,
"loss": 0.6999,
"step": 2920
},
{
"epoch": 0.7,
"learning_rate": 0.00023232724384432085,
"loss": 0.6963,
"step": 2940
},
{
"epoch": 0.7,
"learning_rate": 0.0002318506751389992,
"loss": 0.7025,
"step": 2960
},
{
"epoch": 0.7,
"learning_rate": 0.0002313741064336775,
"loss": 0.704,
"step": 2980
},
{
"epoch": 0.71,
"learning_rate": 0.00023089753772835583,
"loss": 0.6926,
"step": 3000
},
{
"epoch": 0.71,
"eval_loss": 0.6828380227088928,
"eval_runtime": 49.5667,
"eval_samples_per_second": 40.35,
"eval_steps_per_second": 2.522,
"step": 3000
},
{
"epoch": 0.71,
"learning_rate": 0.00023042096902303413,
"loss": 0.698,
"step": 3020
},
{
"epoch": 0.72,
"learning_rate": 0.00022994440031771243,
"loss": 0.6893,
"step": 3040
},
{
"epoch": 0.72,
"learning_rate": 0.00022946783161239076,
"loss": 0.6938,
"step": 3060
},
{
"epoch": 0.73,
"learning_rate": 0.00022899126290706908,
"loss": 0.6974,
"step": 3080
},
{
"epoch": 0.73,
"learning_rate": 0.0002285146942017474,
"loss": 0.6922,
"step": 3100
},
{
"epoch": 0.74,
"learning_rate": 0.0002280381254964257,
"loss": 0.7073,
"step": 3120
},
{
"epoch": 0.74,
"learning_rate": 0.00022756155679110404,
"loss": 0.6895,
"step": 3140
},
{
"epoch": 0.75,
"learning_rate": 0.00022708498808578234,
"loss": 0.7012,
"step": 3160
},
{
"epoch": 0.75,
"learning_rate": 0.00022660841938046064,
"loss": 0.6985,
"step": 3180
},
{
"epoch": 0.76,
"learning_rate": 0.000226131850675139,
"loss": 0.6901,
"step": 3200
},
{
"epoch": 0.76,
"eval_loss": 0.6807068586349487,
"eval_runtime": 49.2421,
"eval_samples_per_second": 40.616,
"eval_steps_per_second": 2.538,
"step": 3200
},
{
"epoch": 0.76,
"learning_rate": 0.0002256552819698173,
"loss": 0.697,
"step": 3220
},
{
"epoch": 0.77,
"learning_rate": 0.00022517871326449562,
"loss": 0.7002,
"step": 3240
},
{
"epoch": 0.77,
"learning_rate": 0.00022470214455917392,
"loss": 0.6918,
"step": 3260
},
{
"epoch": 0.78,
"learning_rate": 0.00022422557585385225,
"loss": 0.6999,
"step": 3280
},
{
"epoch": 0.78,
"learning_rate": 0.00022374900714853055,
"loss": 0.6961,
"step": 3300
},
{
"epoch": 0.78,
"learning_rate": 0.0002232724384432089,
"loss": 0.6888,
"step": 3320
},
{
"epoch": 0.79,
"learning_rate": 0.0002227958697378872,
"loss": 0.695,
"step": 3340
},
{
"epoch": 0.79,
"learning_rate": 0.0002223193010325655,
"loss": 0.6861,
"step": 3360
},
{
"epoch": 0.8,
"learning_rate": 0.00022184273232724383,
"loss": 0.6864,
"step": 3380
},
{
"epoch": 0.8,
"learning_rate": 0.00022136616362192213,
"loss": 0.6917,
"step": 3400
},
{
"epoch": 0.8,
"eval_loss": 0.6773961782455444,
"eval_runtime": 49.3961,
"eval_samples_per_second": 40.489,
"eval_steps_per_second": 2.531,
"step": 3400
},
{
"epoch": 0.81,
"learning_rate": 0.00022088959491660043,
"loss": 0.679,
"step": 3420
},
{
"epoch": 0.81,
"learning_rate": 0.00022041302621127879,
"loss": 0.6915,
"step": 3440
},
{
"epoch": 0.82,
"learning_rate": 0.00021993645750595709,
"loss": 0.6937,
"step": 3460
},
{
"epoch": 0.82,
"learning_rate": 0.0002194598888006354,
"loss": 0.6831,
"step": 3480
},
{
"epoch": 0.83,
"learning_rate": 0.0002189833200953137,
"loss": 0.6875,
"step": 3500
},
{
"epoch": 0.83,
"learning_rate": 0.00021850675138999204,
"loss": 0.6916,
"step": 3520
},
{
"epoch": 0.84,
"learning_rate": 0.00021803018268467034,
"loss": 0.6896,
"step": 3540
},
{
"epoch": 0.84,
"learning_rate": 0.0002175536139793487,
"loss": 0.6986,
"step": 3560
},
{
"epoch": 0.85,
"learning_rate": 0.000217077045274027,
"loss": 0.693,
"step": 3580
},
{
"epoch": 0.85,
"learning_rate": 0.0002166004765687053,
"loss": 0.6893,
"step": 3600
},
{
"epoch": 0.85,
"eval_loss": 0.6753410696983337,
"eval_runtime": 49.3307,
"eval_samples_per_second": 40.543,
"eval_steps_per_second": 2.534,
"step": 3600
},
{
"epoch": 0.86,
"learning_rate": 0.00021612390786338362,
"loss": 0.6872,
"step": 3620
},
{
"epoch": 0.86,
"learning_rate": 0.00021564733915806192,
"loss": 0.6862,
"step": 3640
},
{
"epoch": 0.87,
"learning_rate": 0.00021517077045274025,
"loss": 0.6943,
"step": 3660
},
{
"epoch": 0.87,
"learning_rate": 0.00021469420174741858,
"loss": 0.6896,
"step": 3680
},
{
"epoch": 0.87,
"learning_rate": 0.0002142176330420969,
"loss": 0.6912,
"step": 3700
},
{
"epoch": 0.88,
"learning_rate": 0.0002137410643367752,
"loss": 0.6859,
"step": 3720
},
{
"epoch": 0.88,
"learning_rate": 0.0002132644956314535,
"loss": 0.6791,
"step": 3740
},
{
"epoch": 0.89,
"learning_rate": 0.00021278792692613183,
"loss": 0.6882,
"step": 3760
},
{
"epoch": 0.89,
"learning_rate": 0.00021231135822081013,
"loss": 0.6823,
"step": 3780
},
{
"epoch": 0.9,
"learning_rate": 0.0002118347895154885,
"loss": 0.6831,
"step": 3800
},
{
"epoch": 0.9,
"eval_loss": 0.6738302707672119,
"eval_runtime": 49.4648,
"eval_samples_per_second": 40.433,
"eval_steps_per_second": 2.527,
"step": 3800
},
{
"epoch": 0.9,
"learning_rate": 0.0002113582208101668,
"loss": 0.6818,
"step": 3820
},
{
"epoch": 0.91,
"learning_rate": 0.0002108816521048451,
"loss": 0.6912,
"step": 3840
},
{
"epoch": 0.91,
"learning_rate": 0.00021040508339952341,
"loss": 0.6884,
"step": 3860
},
{
"epoch": 0.92,
"learning_rate": 0.00020992851469420171,
"loss": 0.6888,
"step": 3880
},
{
"epoch": 0.92,
"learning_rate": 0.00020945194598888004,
"loss": 0.6822,
"step": 3900
},
{
"epoch": 0.93,
"learning_rate": 0.00020897537728355837,
"loss": 0.6879,
"step": 3920
},
{
"epoch": 0.93,
"learning_rate": 0.0002084988085782367,
"loss": 0.6771,
"step": 3940
},
{
"epoch": 0.94,
"learning_rate": 0.000208022239872915,
"loss": 0.684,
"step": 3960
},
{
"epoch": 0.94,
"learning_rate": 0.0002075456711675933,
"loss": 0.6878,
"step": 3980
},
{
"epoch": 0.95,
"learning_rate": 0.00020706910246227162,
"loss": 0.6913,
"step": 4000
},
{
"epoch": 0.95,
"eval_loss": 0.6711302995681763,
"eval_runtime": 49.117,
"eval_samples_per_second": 40.719,
"eval_steps_per_second": 2.545,
"step": 4000
},
{
"epoch": 0.95,
"learning_rate": 0.00020659253375694992,
"loss": 0.683,
"step": 4020
},
{
"epoch": 0.96,
"learning_rate": 0.00020611596505162828,
"loss": 0.6833,
"step": 4040
},
{
"epoch": 0.96,
"learning_rate": 0.00020563939634630658,
"loss": 0.6793,
"step": 4060
},
{
"epoch": 0.96,
"learning_rate": 0.0002051628276409849,
"loss": 0.6843,
"step": 4080
},
{
"epoch": 0.97,
"learning_rate": 0.0002046862589356632,
"loss": 0.6822,
"step": 4100
},
{
"epoch": 0.97,
"learning_rate": 0.0002042096902303415,
"loss": 0.6856,
"step": 4120
},
{
"epoch": 0.98,
"learning_rate": 0.00020373312152501983,
"loss": 0.6809,
"step": 4140
},
{
"epoch": 0.98,
"learning_rate": 0.00020325655281969816,
"loss": 0.6843,
"step": 4160
},
{
"epoch": 0.99,
"learning_rate": 0.0002027799841143765,
"loss": 0.6754,
"step": 4180
},
{
"epoch": 0.99,
"learning_rate": 0.0002023034154090548,
"loss": 0.6823,
"step": 4200
},
{
"epoch": 0.99,
"eval_loss": 0.6697036027908325,
"eval_runtime": 49.3237,
"eval_samples_per_second": 40.548,
"eval_steps_per_second": 2.534,
"step": 4200
},
{
"epoch": 1.0,
"learning_rate": 0.00020182684670373312,
"loss": 0.6861,
"step": 4220
},
{
"epoch": 1.0,
"learning_rate": 0.00020135027799841142,
"loss": 0.6806,
"step": 4240
},
{
"epoch": 1.01,
"learning_rate": 0.00020087370929308972,
"loss": 0.6823,
"step": 4260
},
{
"epoch": 1.01,
"learning_rate": 0.00020039714058776807,
"loss": 0.6805,
"step": 4280
},
{
"epoch": 1.02,
"learning_rate": 0.00019992057188244637,
"loss": 0.6813,
"step": 4300
},
{
"epoch": 1.02,
"learning_rate": 0.0001994440031771247,
"loss": 0.675,
"step": 4320
},
{
"epoch": 1.03,
"learning_rate": 0.000198967434471803,
"loss": 0.6728,
"step": 4340
},
{
"epoch": 1.03,
"learning_rate": 0.0001984908657664813,
"loss": 0.6676,
"step": 4360
},
{
"epoch": 1.04,
"learning_rate": 0.00019801429706115963,
"loss": 0.6729,
"step": 4380
},
{
"epoch": 1.04,
"learning_rate": 0.00019753772835583795,
"loss": 0.685,
"step": 4400
},
{
"epoch": 1.04,
"eval_loss": 0.6667952537536621,
"eval_runtime": 49.3899,
"eval_samples_per_second": 40.494,
"eval_steps_per_second": 2.531,
"step": 4400
},
{
"epoch": 1.04,
"learning_rate": 0.00019706115965051628,
"loss": 0.6786,
"step": 4420
},
{
"epoch": 1.05,
"learning_rate": 0.00019658459094519458,
"loss": 0.6738,
"step": 4440
},
{
"epoch": 1.05,
"learning_rate": 0.0001961080222398729,
"loss": 0.686,
"step": 4460
},
{
"epoch": 1.06,
"learning_rate": 0.0001956314535345512,
"loss": 0.6818,
"step": 4480
},
{
"epoch": 1.06,
"learning_rate": 0.0001951548848292295,
"loss": 0.6741,
"step": 4500
},
{
"epoch": 1.07,
"learning_rate": 0.00019467831612390786,
"loss": 0.6756,
"step": 4520
},
{
"epoch": 1.07,
"learning_rate": 0.00019420174741858616,
"loss": 0.6851,
"step": 4540
},
{
"epoch": 1.08,
"learning_rate": 0.0001937251787132645,
"loss": 0.6784,
"step": 4560
},
{
"epoch": 1.08,
"learning_rate": 0.0001932486100079428,
"loss": 0.6785,
"step": 4580
},
{
"epoch": 1.09,
"learning_rate": 0.00019277204130262112,
"loss": 0.678,
"step": 4600
},
{
"epoch": 1.09,
"eval_loss": 0.6655837297439575,
"eval_runtime": 49.5019,
"eval_samples_per_second": 40.403,
"eval_steps_per_second": 2.525,
"step": 4600
},
{
"epoch": 1.09,
"learning_rate": 0.00019229547259729942,
"loss": 0.6782,
"step": 4620
},
{
"epoch": 1.1,
"learning_rate": 0.00019181890389197777,
"loss": 0.6683,
"step": 4640
},
{
"epoch": 1.1,
"learning_rate": 0.00019134233518665607,
"loss": 0.6783,
"step": 4660
},
{
"epoch": 1.11,
"learning_rate": 0.00019086576648133437,
"loss": 0.675,
"step": 4680
},
{
"epoch": 1.11,
"learning_rate": 0.0001903891977760127,
"loss": 0.6691,
"step": 4700
},
{
"epoch": 1.12,
"learning_rate": 0.000189912629070691,
"loss": 0.6726,
"step": 4720
},
{
"epoch": 1.12,
"learning_rate": 0.00018943606036536933,
"loss": 0.68,
"step": 4740
},
{
"epoch": 1.13,
"learning_rate": 0.00018895949166004763,
"loss": 0.6694,
"step": 4760
},
{
"epoch": 1.13,
"learning_rate": 0.00018848292295472598,
"loss": 0.6686,
"step": 4780
},
{
"epoch": 1.13,
"learning_rate": 0.00018800635424940428,
"loss": 0.6766,
"step": 4800
},
{
"epoch": 1.13,
"eval_loss": 0.6646501421928406,
"eval_runtime": 49.3188,
"eval_samples_per_second": 40.552,
"eval_steps_per_second": 2.535,
"step": 4800
},
{
"epoch": 1.14,
"learning_rate": 0.00018752978554408258,
"loss": 0.6724,
"step": 4820
},
{
"epoch": 1.14,
"learning_rate": 0.0001870532168387609,
"loss": 0.6801,
"step": 4840
},
{
"epoch": 1.15,
"learning_rate": 0.0001865766481334392,
"loss": 0.6698,
"step": 4860
},
{
"epoch": 1.15,
"learning_rate": 0.0001861000794281175,
"loss": 0.6723,
"step": 4880
},
{
"epoch": 1.16,
"learning_rate": 0.00018562351072279586,
"loss": 0.6693,
"step": 4900
},
{
"epoch": 1.16,
"learning_rate": 0.00018514694201747416,
"loss": 0.6716,
"step": 4920
},
{
"epoch": 1.17,
"learning_rate": 0.0001846703733121525,
"loss": 0.674,
"step": 4940
},
{
"epoch": 1.17,
"learning_rate": 0.0001841938046068308,
"loss": 0.6702,
"step": 4960
},
{
"epoch": 1.18,
"learning_rate": 0.00018371723590150912,
"loss": 0.6716,
"step": 4980
},
{
"epoch": 1.18,
"learning_rate": 0.00018324066719618742,
"loss": 0.672,
"step": 5000
},
{
"epoch": 1.18,
"eval_loss": 0.662735104560852,
"eval_runtime": 49.275,
"eval_samples_per_second": 40.589,
"eval_steps_per_second": 2.537,
"step": 5000
},
{
"epoch": 1.19,
"learning_rate": 0.00018276409849086577,
"loss": 0.6701,
"step": 5020
},
{
"epoch": 1.19,
"learning_rate": 0.00018228752978554407,
"loss": 0.6663,
"step": 5040
},
{
"epoch": 1.2,
"learning_rate": 0.00018181096108022237,
"loss": 0.6651,
"step": 5060
},
{
"epoch": 1.2,
"learning_rate": 0.0001813343923749007,
"loss": 0.6708,
"step": 5080
},
{
"epoch": 1.21,
"learning_rate": 0.000180857823669579,
"loss": 0.6697,
"step": 5100
},
{
"epoch": 1.21,
"learning_rate": 0.00018038125496425733,
"loss": 0.662,
"step": 5120
},
{
"epoch": 1.22,
"learning_rate": 0.00017990468625893566,
"loss": 0.669,
"step": 5140
},
{
"epoch": 1.22,
"learning_rate": 0.00017942811755361398,
"loss": 0.6649,
"step": 5160
},
{
"epoch": 1.22,
"learning_rate": 0.00017895154884829228,
"loss": 0.668,
"step": 5180
},
{
"epoch": 1.23,
"learning_rate": 0.00017847498014297058,
"loss": 0.6796,
"step": 5200
},
{
"epoch": 1.23,
"eval_loss": 0.6609957218170166,
"eval_runtime": 49.2394,
"eval_samples_per_second": 40.618,
"eval_steps_per_second": 2.539,
"step": 5200
},
{
"epoch": 1.23,
"learning_rate": 0.0001779984114376489,
"loss": 0.6745,
"step": 5220
},
{
"epoch": 1.24,
"learning_rate": 0.0001775218427323272,
"loss": 0.6646,
"step": 5240
},
{
"epoch": 1.24,
"learning_rate": 0.00017704527402700556,
"loss": 0.6682,
"step": 5260
},
{
"epoch": 1.25,
"learning_rate": 0.00017656870532168386,
"loss": 0.6713,
"step": 5280
},
{
"epoch": 1.25,
"learning_rate": 0.00017609213661636216,
"loss": 0.6618,
"step": 5300
},
{
"epoch": 1.26,
"learning_rate": 0.0001756155679110405,
"loss": 0.6703,
"step": 5320
},
{
"epoch": 1.26,
"learning_rate": 0.0001751389992057188,
"loss": 0.6652,
"step": 5340
},
{
"epoch": 1.27,
"learning_rate": 0.00017466243050039712,
"loss": 0.6698,
"step": 5360
},
{
"epoch": 1.27,
"learning_rate": 0.00017418586179507545,
"loss": 0.6728,
"step": 5380
},
{
"epoch": 1.28,
"learning_rate": 0.00017370929308975377,
"loss": 0.6752,
"step": 5400
},
{
"epoch": 1.28,
"eval_loss": 0.6592395901679993,
"eval_runtime": 49.3426,
"eval_samples_per_second": 40.533,
"eval_steps_per_second": 2.533,
"step": 5400
},
{
"epoch": 1.28,
"learning_rate": 0.00017323272438443207,
"loss": 0.6653,
"step": 5420
},
{
"epoch": 1.29,
"learning_rate": 0.00017275615567911037,
"loss": 0.669,
"step": 5440
},
{
"epoch": 1.29,
"learning_rate": 0.0001722795869737887,
"loss": 0.6698,
"step": 5460
},
{
"epoch": 1.3,
"learning_rate": 0.000171803018268467,
"loss": 0.6742,
"step": 5480
},
{
"epoch": 1.3,
"learning_rate": 0.00017132644956314536,
"loss": 0.6596,
"step": 5500
},
{
"epoch": 1.3,
"learning_rate": 0.00017084988085782366,
"loss": 0.6699,
"step": 5520
},
{
"epoch": 1.31,
"learning_rate": 0.00017037331215250198,
"loss": 0.664,
"step": 5540
},
{
"epoch": 1.31,
"learning_rate": 0.00016989674344718028,
"loss": 0.6673,
"step": 5560
},
{
"epoch": 1.32,
"learning_rate": 0.00016942017474185858,
"loss": 0.6684,
"step": 5580
},
{
"epoch": 1.32,
"learning_rate": 0.0001689436060365369,
"loss": 0.6769,
"step": 5600
},
{
"epoch": 1.32,
"eval_loss": 0.6582754850387573,
"eval_runtime": 49.469,
"eval_samples_per_second": 40.429,
"eval_steps_per_second": 2.527,
"step": 5600
},
{
"epoch": 1.33,
"learning_rate": 0.00016846703733121524,
"loss": 0.6633,
"step": 5620
},
{
"epoch": 1.33,
"learning_rate": 0.00016799046862589357,
"loss": 0.6679,
"step": 5640
},
{
"epoch": 1.34,
"learning_rate": 0.00016751389992057187,
"loss": 0.6601,
"step": 5660
},
{
"epoch": 1.34,
"learning_rate": 0.0001670373312152502,
"loss": 0.6731,
"step": 5680
},
{
"epoch": 1.35,
"learning_rate": 0.0001665607625099285,
"loss": 0.6638,
"step": 5700
},
{
"epoch": 1.35,
"learning_rate": 0.0001660841938046068,
"loss": 0.6693,
"step": 5720
},
{
"epoch": 1.36,
"learning_rate": 0.00016560762509928515,
"loss": 0.6642,
"step": 5740
},
{
"epoch": 1.36,
"learning_rate": 0.00016513105639396345,
"loss": 0.6649,
"step": 5760
},
{
"epoch": 1.37,
"learning_rate": 0.00016465448768864178,
"loss": 0.663,
"step": 5780
},
{
"epoch": 1.37,
"learning_rate": 0.00016417791898332008,
"loss": 0.6629,
"step": 5800
},
{
"epoch": 1.37,
"eval_loss": 0.6574136018753052,
"eval_runtime": 49.3019,
"eval_samples_per_second": 40.566,
"eval_steps_per_second": 2.535,
"step": 5800
},
{
"epoch": 1.38,
"learning_rate": 0.00016370135027799838,
"loss": 0.6605,
"step": 5820
},
{
"epoch": 1.38,
"learning_rate": 0.0001632247815726767,
"loss": 0.6707,
"step": 5840
},
{
"epoch": 1.39,
"learning_rate": 0.00016274821286735503,
"loss": 0.6695,
"step": 5860
},
{
"epoch": 1.39,
"learning_rate": 0.00016227164416203336,
"loss": 0.6647,
"step": 5880
},
{
"epoch": 1.39,
"learning_rate": 0.00016179507545671166,
"loss": 0.6657,
"step": 5900
},
{
"epoch": 1.4,
"learning_rate": 0.00016131850675138999,
"loss": 0.6656,
"step": 5920
},
{
"epoch": 1.4,
"learning_rate": 0.00016084193804606829,
"loss": 0.6676,
"step": 5940
},
{
"epoch": 1.41,
"learning_rate": 0.00016036536934074659,
"loss": 0.6678,
"step": 5960
},
{
"epoch": 1.41,
"learning_rate": 0.00015988880063542494,
"loss": 0.6639,
"step": 5980
},
{
"epoch": 1.42,
"learning_rate": 0.00015941223193010324,
"loss": 0.6645,
"step": 6000
},
{
"epoch": 1.42,
"eval_loss": 0.656126081943512,
"eval_runtime": 49.5095,
"eval_samples_per_second": 40.396,
"eval_steps_per_second": 2.525,
"step": 6000
},
{
"epoch": 1.42,
"learning_rate": 0.00015893566322478157,
"loss": 0.6672,
"step": 6020
},
{
"epoch": 1.43,
"learning_rate": 0.00015845909451945987,
"loss": 0.6678,
"step": 6040
},
{
"epoch": 1.43,
"learning_rate": 0.0001579825258141382,
"loss": 0.6676,
"step": 6060
},
{
"epoch": 1.44,
"learning_rate": 0.0001575059571088165,
"loss": 0.6717,
"step": 6080
},
{
"epoch": 1.44,
"learning_rate": 0.00015702938840349485,
"loss": 0.671,
"step": 6100
},
{
"epoch": 1.45,
"learning_rate": 0.00015655281969817315,
"loss": 0.6611,
"step": 6120
},
{
"epoch": 1.45,
"learning_rate": 0.00015607625099285145,
"loss": 0.6606,
"step": 6140
},
{
"epoch": 1.46,
"learning_rate": 0.00015559968228752978,
"loss": 0.6647,
"step": 6160
},
{
"epoch": 1.46,
"learning_rate": 0.00015512311358220808,
"loss": 0.6652,
"step": 6180
},
{
"epoch": 1.47,
"learning_rate": 0.0001546465448768864,
"loss": 0.6629,
"step": 6200
},
{
"epoch": 1.47,
"eval_loss": 0.6549723148345947,
"eval_runtime": 49.4871,
"eval_samples_per_second": 40.415,
"eval_steps_per_second": 2.526,
"step": 6200
},
{
"epoch": 1.47,
"learning_rate": 0.00015416997617156473,
"loss": 0.6685,
"step": 6220
},
{
"epoch": 1.48,
"learning_rate": 0.00015369340746624306,
"loss": 0.6578,
"step": 6240
},
{
"epoch": 1.48,
"learning_rate": 0.00015321683876092136,
"loss": 0.6587,
"step": 6260
},
{
"epoch": 1.48,
"learning_rate": 0.00015274027005559966,
"loss": 0.6655,
"step": 6280
},
{
"epoch": 1.49,
"learning_rate": 0.000152263701350278,
"loss": 0.6662,
"step": 6300
},
{
"epoch": 1.49,
"learning_rate": 0.0001517871326449563,
"loss": 0.6648,
"step": 6320
},
{
"epoch": 1.5,
"learning_rate": 0.00015131056393963464,
"loss": 0.6638,
"step": 6340
},
{
"epoch": 1.5,
"learning_rate": 0.00015083399523431294,
"loss": 0.6614,
"step": 6360
},
{
"epoch": 1.51,
"learning_rate": 0.00015035742652899124,
"loss": 0.6552,
"step": 6380
},
{
"epoch": 1.51,
"learning_rate": 0.00014988085782366957,
"loss": 0.6753,
"step": 6400
},
{
"epoch": 1.51,
"eval_loss": 0.6544620990753174,
"eval_runtime": 49.3242,
"eval_samples_per_second": 40.548,
"eval_steps_per_second": 2.534,
"step": 6400
},
{
"epoch": 1.52,
"learning_rate": 0.00014940428911834787,
"loss": 0.6588,
"step": 6420
},
{
"epoch": 1.52,
"learning_rate": 0.0001489277204130262,
"loss": 0.6609,
"step": 6440
},
{
"epoch": 1.53,
"learning_rate": 0.00014845115170770452,
"loss": 0.6565,
"step": 6460
},
{
"epoch": 1.53,
"learning_rate": 0.00014797458300238282,
"loss": 0.6589,
"step": 6480
},
{
"epoch": 1.54,
"learning_rate": 0.00014749801429706115,
"loss": 0.6585,
"step": 6500
},
{
"epoch": 1.54,
"learning_rate": 0.00014702144559173945,
"loss": 0.6737,
"step": 6520
},
{
"epoch": 1.55,
"learning_rate": 0.00014654487688641778,
"loss": 0.6554,
"step": 6540
},
{
"epoch": 1.55,
"learning_rate": 0.0001460683081810961,
"loss": 0.6603,
"step": 6560
},
{
"epoch": 1.56,
"learning_rate": 0.0001455917394757744,
"loss": 0.6647,
"step": 6580
},
{
"epoch": 1.56,
"learning_rate": 0.00014511517077045273,
"loss": 0.6632,
"step": 6600
},
{
"epoch": 1.56,
"eval_loss": 0.6527110934257507,
"eval_runtime": 49.2622,
"eval_samples_per_second": 40.599,
"eval_steps_per_second": 2.537,
"step": 6600
},
{
"epoch": 1.57,
"learning_rate": 0.00014463860206513106,
"loss": 0.6705,
"step": 6620
},
{
"epoch": 1.57,
"learning_rate": 0.00014416203335980936,
"loss": 0.6703,
"step": 6640
},
{
"epoch": 1.57,
"learning_rate": 0.00014368546465448766,
"loss": 0.6602,
"step": 6660
},
{
"epoch": 1.58,
"learning_rate": 0.000143208895949166,
"loss": 0.6639,
"step": 6680
},
{
"epoch": 1.58,
"learning_rate": 0.00014273232724384432,
"loss": 0.6645,
"step": 6700
},
{
"epoch": 1.59,
"learning_rate": 0.00014225575853852262,
"loss": 0.6655,
"step": 6720
},
{
"epoch": 1.59,
"learning_rate": 0.00014177918983320094,
"loss": 0.664,
"step": 6740
},
{
"epoch": 1.6,
"learning_rate": 0.00014130262112787927,
"loss": 0.6656,
"step": 6760
},
{
"epoch": 1.6,
"learning_rate": 0.00014082605242255757,
"loss": 0.6658,
"step": 6780
},
{
"epoch": 1.61,
"learning_rate": 0.0001403494837172359,
"loss": 0.6641,
"step": 6800
},
{
"epoch": 1.61,
"eval_loss": 0.6513609886169434,
"eval_runtime": 49.4424,
"eval_samples_per_second": 40.451,
"eval_steps_per_second": 2.528,
"step": 6800
},
{
"epoch": 1.61,
"learning_rate": 0.0001398729150119142,
"loss": 0.6599,
"step": 6820
},
{
"epoch": 1.62,
"learning_rate": 0.00013939634630659252,
"loss": 0.6552,
"step": 6840
},
{
"epoch": 1.62,
"learning_rate": 0.00013891977760127085,
"loss": 0.6616,
"step": 6860
},
{
"epoch": 1.63,
"learning_rate": 0.00013844320889594915,
"loss": 0.6635,
"step": 6880
},
{
"epoch": 1.63,
"learning_rate": 0.00013796664019062745,
"loss": 0.6608,
"step": 6900
},
{
"epoch": 1.64,
"learning_rate": 0.00013749007148530578,
"loss": 0.6596,
"step": 6920
},
{
"epoch": 1.64,
"learning_rate": 0.0001370135027799841,
"loss": 0.6589,
"step": 6940
},
{
"epoch": 1.65,
"learning_rate": 0.0001365369340746624,
"loss": 0.6627,
"step": 6960
},
{
"epoch": 1.65,
"learning_rate": 0.00013606036536934073,
"loss": 0.6606,
"step": 6980
},
{
"epoch": 1.65,
"learning_rate": 0.00013558379666401906,
"loss": 0.6658,
"step": 7000
},
{
"epoch": 1.65,
"eval_loss": 0.6510519981384277,
"eval_runtime": 49.5012,
"eval_samples_per_second": 40.403,
"eval_steps_per_second": 2.525,
"step": 7000
},
{
"epoch": 1.66,
"learning_rate": 0.00013510722795869736,
"loss": 0.6571,
"step": 7020
},
{
"epoch": 1.66,
"learning_rate": 0.0001346306592533757,
"loss": 0.6607,
"step": 7040
},
{
"epoch": 1.67,
"learning_rate": 0.000134154090548054,
"loss": 0.6562,
"step": 7060
},
{
"epoch": 1.67,
"learning_rate": 0.00013367752184273232,
"loss": 0.6582,
"step": 7080
},
{
"epoch": 1.68,
"learning_rate": 0.00013320095313741064,
"loss": 0.6635,
"step": 7100
},
{
"epoch": 1.68,
"learning_rate": 0.00013272438443208894,
"loss": 0.6682,
"step": 7120
},
{
"epoch": 1.69,
"learning_rate": 0.00013224781572676727,
"loss": 0.6633,
"step": 7140
},
{
"epoch": 1.69,
"learning_rate": 0.0001317712470214456,
"loss": 0.6671,
"step": 7160
},
{
"epoch": 1.7,
"learning_rate": 0.0001312946783161239,
"loss": 0.6645,
"step": 7180
},
{
"epoch": 1.7,
"learning_rate": 0.0001308181096108022,
"loss": 0.6699,
"step": 7200
},
{
"epoch": 1.7,
"eval_loss": 0.6502068042755127,
"eval_runtime": 49.4619,
"eval_samples_per_second": 40.435,
"eval_steps_per_second": 2.527,
"step": 7200
},
{
"epoch": 1.71,
"learning_rate": 0.00013034154090548053,
"loss": 0.6617,
"step": 7220
},
{
"epoch": 1.71,
"learning_rate": 0.00012986497220015885,
"loss": 0.6639,
"step": 7240
},
{
"epoch": 1.72,
"learning_rate": 0.00012938840349483715,
"loss": 0.6634,
"step": 7260
},
{
"epoch": 1.72,
"learning_rate": 0.00012891183478951548,
"loss": 0.663,
"step": 7280
},
{
"epoch": 1.73,
"learning_rate": 0.00012843526608419378,
"loss": 0.6653,
"step": 7300
},
{
"epoch": 1.73,
"learning_rate": 0.0001279586973788721,
"loss": 0.6555,
"step": 7320
},
{
"epoch": 1.74,
"learning_rate": 0.00012748212867355044,
"loss": 0.6653,
"step": 7340
},
{
"epoch": 1.74,
"learning_rate": 0.00012700555996822874,
"loss": 0.6573,
"step": 7360
},
{
"epoch": 1.74,
"learning_rate": 0.00012652899126290706,
"loss": 0.658,
"step": 7380
},
{
"epoch": 1.75,
"learning_rate": 0.0001260524225575854,
"loss": 0.6562,
"step": 7400
},
{
"epoch": 1.75,
"eval_loss": 0.6491650342941284,
"eval_runtime": 49.2463,
"eval_samples_per_second": 40.612,
"eval_steps_per_second": 2.538,
"step": 7400
},
{
"epoch": 1.75,
"learning_rate": 0.0001255758538522637,
"loss": 0.6592,
"step": 7420
},
{
"epoch": 1.76,
"learning_rate": 0.000125099285146942,
"loss": 0.6587,
"step": 7440
},
{
"epoch": 1.76,
"learning_rate": 0.00012462271644162032,
"loss": 0.6616,
"step": 7460
},
{
"epoch": 1.77,
"learning_rate": 0.00012414614773629865,
"loss": 0.655,
"step": 7480
},
{
"epoch": 1.77,
"learning_rate": 0.00012366957903097695,
"loss": 0.6591,
"step": 7500
},
{
"epoch": 1.78,
"learning_rate": 0.00012319301032565527,
"loss": 0.6545,
"step": 7520
},
{
"epoch": 1.78,
"learning_rate": 0.0001227164416203336,
"loss": 0.6673,
"step": 7540
},
{
"epoch": 1.79,
"learning_rate": 0.0001222398729150119,
"loss": 0.6626,
"step": 7560
},
{
"epoch": 1.79,
"learning_rate": 0.00012176330420969023,
"loss": 0.6663,
"step": 7580
},
{
"epoch": 1.8,
"learning_rate": 0.00012128673550436854,
"loss": 0.6643,
"step": 7600
},
{
"epoch": 1.8,
"eval_loss": 0.6482685804367065,
"eval_runtime": 49.3591,
"eval_samples_per_second": 40.519,
"eval_steps_per_second": 2.532,
"step": 7600
},
{
"epoch": 1.8,
"learning_rate": 0.00012081016679904685,
"loss": 0.6623,
"step": 7620
},
{
"epoch": 1.81,
"learning_rate": 0.00012033359809372518,
"loss": 0.6636,
"step": 7640
},
{
"epoch": 1.81,
"learning_rate": 0.00011985702938840348,
"loss": 0.6598,
"step": 7660
},
{
"epoch": 1.82,
"learning_rate": 0.0001193804606830818,
"loss": 0.6521,
"step": 7680
},
{
"epoch": 1.82,
"learning_rate": 0.00011890389197776012,
"loss": 0.664,
"step": 7700
},
{
"epoch": 1.83,
"learning_rate": 0.00011842732327243844,
"loss": 0.6529,
"step": 7720
},
{
"epoch": 1.83,
"learning_rate": 0.00011795075456711675,
"loss": 0.6622,
"step": 7740
},
{
"epoch": 1.83,
"learning_rate": 0.00011747418586179508,
"loss": 0.6608,
"step": 7760
},
{
"epoch": 1.84,
"learning_rate": 0.00011699761715647338,
"loss": 0.6556,
"step": 7780
},
{
"epoch": 1.84,
"learning_rate": 0.00011652104845115169,
"loss": 0.6643,
"step": 7800
},
{
"epoch": 1.84,
"eval_loss": 0.6474015116691589,
"eval_runtime": 49.3608,
"eval_samples_per_second": 40.518,
"eval_steps_per_second": 2.532,
"step": 7800
},
{
"epoch": 1.85,
"learning_rate": 0.00011604447974583002,
"loss": 0.6541,
"step": 7820
},
{
"epoch": 1.85,
"learning_rate": 0.00011556791104050833,
"loss": 0.6614,
"step": 7840
},
{
"epoch": 1.86,
"learning_rate": 0.00011509134233518665,
"loss": 0.6499,
"step": 7860
},
{
"epoch": 1.86,
"learning_rate": 0.00011461477362986497,
"loss": 0.6563,
"step": 7880
},
{
"epoch": 1.87,
"learning_rate": 0.00011413820492454327,
"loss": 0.6589,
"step": 7900
},
{
"epoch": 1.87,
"learning_rate": 0.00011366163621922159,
"loss": 0.6544,
"step": 7920
},
{
"epoch": 1.88,
"learning_rate": 0.00011318506751389992,
"loss": 0.6606,
"step": 7940
},
{
"epoch": 1.88,
"learning_rate": 0.00011270849880857823,
"loss": 0.657,
"step": 7960
},
{
"epoch": 1.89,
"learning_rate": 0.00011223193010325654,
"loss": 0.6608,
"step": 7980
},
{
"epoch": 1.89,
"learning_rate": 0.00011175536139793487,
"loss": 0.6595,
"step": 8000
},
{
"epoch": 1.89,
"eval_loss": 0.6469079256057739,
"eval_runtime": 49.3012,
"eval_samples_per_second": 40.567,
"eval_steps_per_second": 2.535,
"step": 8000
},
{
"epoch": 1.9,
"learning_rate": 0.00011127879269261318,
"loss": 0.6563,
"step": 8020
},
{
"epoch": 1.9,
"learning_rate": 0.00011080222398729148,
"loss": 0.6602,
"step": 8040
},
{
"epoch": 1.91,
"learning_rate": 0.00011032565528196981,
"loss": 0.6603,
"step": 8060
},
{
"epoch": 1.91,
"learning_rate": 0.00010984908657664812,
"loss": 0.6495,
"step": 8080
},
{
"epoch": 1.91,
"learning_rate": 0.00010937251787132644,
"loss": 0.6551,
"step": 8100
},
{
"epoch": 1.92,
"learning_rate": 0.00010891977760127084,
"loss": 0.6497,
"step": 8120
},
{
"epoch": 1.92,
"learning_rate": 0.00010844320889594917,
"loss": 0.6652,
"step": 8140
},
{
"epoch": 1.93,
"learning_rate": 0.00010796664019062747,
"loss": 0.6497,
"step": 8160
},
{
"epoch": 1.93,
"learning_rate": 0.00010749007148530578,
"loss": 0.6554,
"step": 8180
},
{
"epoch": 1.94,
"learning_rate": 0.00010701350277998411,
"loss": 0.6563,
"step": 8200
},
{
"epoch": 1.94,
"eval_loss": 0.645990252494812,
"eval_runtime": 49.3957,
"eval_samples_per_second": 40.489,
"eval_steps_per_second": 2.531,
"step": 8200
},
{
"epoch": 1.94,
"learning_rate": 0.00010653693407466242,
"loss": 0.6572,
"step": 8220
},
{
"epoch": 1.95,
"learning_rate": 0.00010606036536934074,
"loss": 0.6563,
"step": 8240
},
{
"epoch": 1.95,
"learning_rate": 0.00010558379666401906,
"loss": 0.6535,
"step": 8260
},
{
"epoch": 1.96,
"learning_rate": 0.00010510722795869736,
"loss": 0.655,
"step": 8280
},
{
"epoch": 1.96,
"learning_rate": 0.00010463065925337568,
"loss": 0.6554,
"step": 8300
},
{
"epoch": 1.97,
"learning_rate": 0.000104154090548054,
"loss": 0.6559,
"step": 8320
},
{
"epoch": 1.97,
"learning_rate": 0.00010367752184273232,
"loss": 0.6522,
"step": 8340
},
{
"epoch": 1.98,
"learning_rate": 0.00010320095313741063,
"loss": 0.6568,
"step": 8360
},
{
"epoch": 1.98,
"learning_rate": 0.00010272438443208896,
"loss": 0.6566,
"step": 8380
},
{
"epoch": 1.99,
"learning_rate": 0.00010224781572676727,
"loss": 0.6496,
"step": 8400
},
{
"epoch": 1.99,
"eval_loss": 0.6457875967025757,
"eval_runtime": 49.0201,
"eval_samples_per_second": 40.8,
"eval_steps_per_second": 2.55,
"step": 8400
},
{
"epoch": 1.99,
"learning_rate": 0.00010177124702144557,
"loss": 0.66,
"step": 8420
},
{
"epoch": 2.0,
"learning_rate": 0.0001012946783161239,
"loss": 0.6457,
"step": 8440
},
{
"epoch": 2.0,
"learning_rate": 0.0001008419380460683,
"loss": 0.6349,
"step": 8460
},
{
"epoch": 2.0,
"learning_rate": 0.00010036536934074662,
"loss": 0.6545,
"step": 8480
},
{
"epoch": 2.01,
"learning_rate": 9.988880063542493e-05,
"loss": 0.6515,
"step": 8500
},
{
"epoch": 2.01,
"learning_rate": 9.941223193010326e-05,
"loss": 0.6459,
"step": 8520
},
{
"epoch": 2.02,
"learning_rate": 9.893566322478156e-05,
"loss": 0.6494,
"step": 8540
},
{
"epoch": 2.02,
"learning_rate": 9.845909451945987e-05,
"loss": 0.6608,
"step": 8560
},
{
"epoch": 2.03,
"learning_rate": 9.79825258141382e-05,
"loss": 0.6485,
"step": 8580
},
{
"epoch": 2.03,
"learning_rate": 9.750595710881651e-05,
"loss": 0.6461,
"step": 8600
},
{
"epoch": 2.03,
"eval_loss": 0.6450995802879333,
"eval_runtime": 49.2592,
"eval_samples_per_second": 40.602,
"eval_steps_per_second": 2.538,
"step": 8600
},
{
"epoch": 2.04,
"learning_rate": 9.702938840349483e-05,
"loss": 0.6523,
"step": 8620
},
{
"epoch": 2.04,
"learning_rate": 9.655281969817315e-05,
"loss": 0.6565,
"step": 8640
},
{
"epoch": 2.05,
"learning_rate": 9.607625099285145e-05,
"loss": 0.6541,
"step": 8660
},
{
"epoch": 2.05,
"learning_rate": 9.559968228752977e-05,
"loss": 0.6585,
"step": 8680
},
{
"epoch": 2.06,
"learning_rate": 9.51231135822081e-05,
"loss": 0.6531,
"step": 8700
},
{
"epoch": 2.06,
"learning_rate": 9.464654487688641e-05,
"loss": 0.6579,
"step": 8720
},
{
"epoch": 2.07,
"learning_rate": 9.416997617156472e-05,
"loss": 0.6438,
"step": 8740
},
{
"epoch": 2.07,
"learning_rate": 9.369340746624305e-05,
"loss": 0.6516,
"step": 8760
},
{
"epoch": 2.08,
"learning_rate": 9.321683876092136e-05,
"loss": 0.6576,
"step": 8780
},
{
"epoch": 2.08,
"learning_rate": 9.274027005559966e-05,
"loss": 0.6506,
"step": 8800
},
{
"epoch": 2.08,
"eval_loss": 0.6444578170776367,
"eval_runtime": 49.0631,
"eval_samples_per_second": 40.764,
"eval_steps_per_second": 2.548,
"step": 8800
},
{
"epoch": 2.09,
"learning_rate": 9.226370135027799e-05,
"loss": 0.6484,
"step": 8820
},
{
"epoch": 2.09,
"learning_rate": 9.17871326449563e-05,
"loss": 0.6566,
"step": 8840
},
{
"epoch": 2.09,
"learning_rate": 9.131056393963462e-05,
"loss": 0.6547,
"step": 8860
},
{
"epoch": 2.1,
"learning_rate": 9.083399523431295e-05,
"loss": 0.6532,
"step": 8880
},
{
"epoch": 2.1,
"learning_rate": 9.035742652899126e-05,
"loss": 0.6532,
"step": 8900
},
{
"epoch": 2.11,
"learning_rate": 8.988085782366956e-05,
"loss": 0.6479,
"step": 8920
},
{
"epoch": 2.11,
"learning_rate": 8.940428911834789e-05,
"loss": 0.6548,
"step": 8940
},
{
"epoch": 2.12,
"learning_rate": 8.89277204130262e-05,
"loss": 0.647,
"step": 8960
},
{
"epoch": 2.12,
"learning_rate": 8.845115170770452e-05,
"loss": 0.6478,
"step": 8980
},
{
"epoch": 2.13,
"learning_rate": 8.797458300238284e-05,
"loss": 0.6553,
"step": 9000
},
{
"epoch": 2.13,
"eval_loss": 0.6433074474334717,
"eval_runtime": 49.3831,
"eval_samples_per_second": 40.5,
"eval_steps_per_second": 2.531,
"step": 9000
},
{
"epoch": 2.13,
"learning_rate": 8.749801429706116e-05,
"loss": 0.6443,
"step": 9020
},
{
"epoch": 2.14,
"learning_rate": 8.702144559173947e-05,
"loss": 0.6518,
"step": 9040
},
{
"epoch": 2.14,
"learning_rate": 8.65448768864178e-05,
"loss": 0.6578,
"step": 9060
},
{
"epoch": 2.15,
"learning_rate": 8.60683081810961e-05,
"loss": 0.6472,
"step": 9080
},
{
"epoch": 2.15,
"learning_rate": 8.559173947577441e-05,
"loss": 0.6471,
"step": 9100
},
{
"epoch": 2.16,
"learning_rate": 8.511517077045274e-05,
"loss": 0.6482,
"step": 9120
},
{
"epoch": 2.16,
"learning_rate": 8.463860206513105e-05,
"loss": 0.6522,
"step": 9140
},
{
"epoch": 2.17,
"learning_rate": 8.416203335980937e-05,
"loss": 0.6584,
"step": 9160
},
{
"epoch": 2.17,
"learning_rate": 8.368546465448769e-05,
"loss": 0.6596,
"step": 9180
},
{
"epoch": 2.17,
"learning_rate": 8.320889594916599e-05,
"loss": 0.6581,
"step": 9200
},
{
"epoch": 2.17,
"eval_loss": 0.6426697969436646,
"eval_runtime": 49.0935,
"eval_samples_per_second": 40.739,
"eval_steps_per_second": 2.546,
"step": 9200
},
{
"epoch": 2.18,
"learning_rate": 8.273232724384431e-05,
"loss": 0.6441,
"step": 9220
},
{
"epoch": 2.18,
"learning_rate": 8.225575853852263e-05,
"loss": 0.6509,
"step": 9240
},
{
"epoch": 2.19,
"learning_rate": 8.177918983320095e-05,
"loss": 0.6409,
"step": 9260
},
{
"epoch": 2.19,
"learning_rate": 8.130262112787926e-05,
"loss": 0.6475,
"step": 9280
},
{
"epoch": 2.2,
"learning_rate": 8.082605242255759e-05,
"loss": 0.6597,
"step": 9300
},
{
"epoch": 2.2,
"learning_rate": 8.03494837172359e-05,
"loss": 0.6544,
"step": 9320
},
{
"epoch": 2.21,
"learning_rate": 7.98729150119142e-05,
"loss": 0.6528,
"step": 9340
},
{
"epoch": 2.21,
"learning_rate": 7.939634630659253e-05,
"loss": 0.644,
"step": 9360
},
{
"epoch": 2.22,
"learning_rate": 7.891977760127084e-05,
"loss": 0.6552,
"step": 9380
},
{
"epoch": 2.22,
"learning_rate": 7.844320889594916e-05,
"loss": 0.6548,
"step": 9400
},
{
"epoch": 2.22,
"eval_loss": 0.6423606276512146,
"eval_runtime": 49.6466,
"eval_samples_per_second": 40.285,
"eval_steps_per_second": 2.518,
"step": 9400
},
{
"epoch": 2.23,
"learning_rate": 7.796664019062748e-05,
"loss": 0.6568,
"step": 9420
},
{
"epoch": 2.23,
"learning_rate": 7.74900714853058e-05,
"loss": 0.6539,
"step": 9440
},
{
"epoch": 2.24,
"learning_rate": 7.70135027799841e-05,
"loss": 0.6468,
"step": 9460
},
{
"epoch": 2.24,
"learning_rate": 7.653693407466243e-05,
"loss": 0.6425,
"step": 9480
},
{
"epoch": 2.25,
"learning_rate": 7.606036536934074e-05,
"loss": 0.6523,
"step": 9500
},
{
"epoch": 2.25,
"learning_rate": 7.558379666401905e-05,
"loss": 0.6468,
"step": 9520
},
{
"epoch": 2.26,
"learning_rate": 7.510722795869738e-05,
"loss": 0.6518,
"step": 9540
},
{
"epoch": 2.26,
"learning_rate": 7.46306592533757e-05,
"loss": 0.6534,
"step": 9560
},
{
"epoch": 2.26,
"learning_rate": 7.415409054805401e-05,
"loss": 0.6471,
"step": 9580
},
{
"epoch": 2.27,
"learning_rate": 7.367752184273232e-05,
"loss": 0.6465,
"step": 9600
},
{
"epoch": 2.27,
"eval_loss": 0.6418060064315796,
"eval_runtime": 49.4954,
"eval_samples_per_second": 40.408,
"eval_steps_per_second": 2.525,
"step": 9600
},
{
"epoch": 2.27,
"learning_rate": 7.320095313741064e-05,
"loss": 0.6577,
"step": 9620
},
{
"epoch": 2.28,
"learning_rate": 7.272438443208895e-05,
"loss": 0.6453,
"step": 9640
},
{
"epoch": 2.28,
"learning_rate": 7.224781572676726e-05,
"loss": 0.6489,
"step": 9660
},
{
"epoch": 2.29,
"learning_rate": 7.177124702144559e-05,
"loss": 0.6466,
"step": 9680
},
{
"epoch": 2.29,
"learning_rate": 7.12946783161239e-05,
"loss": 0.6493,
"step": 9700
},
{
"epoch": 2.3,
"learning_rate": 7.081810961080222e-05,
"loss": 0.6537,
"step": 9720
},
{
"epoch": 2.3,
"learning_rate": 7.034154090548053e-05,
"loss": 0.6486,
"step": 9740
},
{
"epoch": 2.31,
"learning_rate": 6.986497220015885e-05,
"loss": 0.65,
"step": 9760
},
{
"epoch": 2.31,
"learning_rate": 6.938840349483717e-05,
"loss": 0.6387,
"step": 9780
},
{
"epoch": 2.32,
"learning_rate": 6.891183478951549e-05,
"loss": 0.6464,
"step": 9800
},
{
"epoch": 2.32,
"eval_loss": 0.6412256360054016,
"eval_runtime": 49.3752,
"eval_samples_per_second": 40.506,
"eval_steps_per_second": 2.532,
"step": 9800
},
{
"epoch": 2.32,
"learning_rate": 6.84352660841938e-05,
"loss": 0.6475,
"step": 9820
},
{
"epoch": 2.33,
"learning_rate": 6.795869737887211e-05,
"loss": 0.6543,
"step": 9840
},
{
"epoch": 2.33,
"learning_rate": 6.748212867355043e-05,
"loss": 0.6545,
"step": 9860
},
{
"epoch": 2.34,
"learning_rate": 6.700555996822874e-05,
"loss": 0.6468,
"step": 9880
},
{
"epoch": 2.34,
"learning_rate": 6.652899126290707e-05,
"loss": 0.651,
"step": 9900
},
{
"epoch": 2.35,
"learning_rate": 6.605242255758538e-05,
"loss": 0.641,
"step": 9920
},
{
"epoch": 2.35,
"learning_rate": 6.55758538522637e-05,
"loss": 0.657,
"step": 9940
},
{
"epoch": 2.35,
"learning_rate": 6.509928514694201e-05,
"loss": 0.6481,
"step": 9960
},
{
"epoch": 2.36,
"learning_rate": 6.462271644162034e-05,
"loss": 0.6496,
"step": 9980
},
{
"epoch": 2.36,
"learning_rate": 6.414614773629864e-05,
"loss": 0.6451,
"step": 10000
},
{
"epoch": 2.36,
"eval_loss": 0.6414454579353333,
"eval_runtime": 49.395,
"eval_samples_per_second": 40.49,
"eval_steps_per_second": 2.531,
"step": 10000
},
{
"epoch": 2.37,
"learning_rate": 6.366957903097696e-05,
"loss": 0.6555,
"step": 10020
},
{
"epoch": 2.37,
"learning_rate": 6.319301032565528e-05,
"loss": 0.6494,
"step": 10040
},
{
"epoch": 2.38,
"learning_rate": 6.271644162033359e-05,
"loss": 0.6487,
"step": 10060
},
{
"epoch": 2.38,
"learning_rate": 6.22398729150119e-05,
"loss": 0.6544,
"step": 10080
},
{
"epoch": 2.39,
"learning_rate": 6.176330420969023e-05,
"loss": 0.6468,
"step": 10100
},
{
"epoch": 2.39,
"learning_rate": 6.128673550436853e-05,
"loss": 0.6441,
"step": 10120
},
{
"epoch": 2.4,
"learning_rate": 6.081016679904686e-05,
"loss": 0.6478,
"step": 10140
},
{
"epoch": 2.4,
"learning_rate": 6.033359809372518e-05,
"loss": 0.6539,
"step": 10160
},
{
"epoch": 2.41,
"learning_rate": 5.985702938840349e-05,
"loss": 0.6486,
"step": 10180
},
{
"epoch": 2.41,
"learning_rate": 5.938046068308181e-05,
"loss": 0.6467,
"step": 10200
},
{
"epoch": 2.41,
"eval_loss": 0.6406835913658142,
"eval_runtime": 49.5084,
"eval_samples_per_second": 40.397,
"eval_steps_per_second": 2.525,
"step": 10200
},
{
"epoch": 2.42,
"learning_rate": 5.890389197776013e-05,
"loss": 0.6399,
"step": 10220
},
{
"epoch": 2.42,
"learning_rate": 5.8427323272438435e-05,
"loss": 0.6519,
"step": 10240
},
{
"epoch": 2.43,
"learning_rate": 5.7950754567116756e-05,
"loss": 0.6465,
"step": 10260
},
{
"epoch": 2.43,
"learning_rate": 5.7474185861795076e-05,
"loss": 0.6479,
"step": 10280
},
{
"epoch": 2.43,
"learning_rate": 5.6997617156473383e-05,
"loss": 0.6462,
"step": 10300
},
{
"epoch": 2.44,
"learning_rate": 5.6521048451151704e-05,
"loss": 0.6451,
"step": 10320
},
{
"epoch": 2.44,
"learning_rate": 5.604447974583002e-05,
"loss": 0.6453,
"step": 10340
},
{
"epoch": 2.45,
"learning_rate": 5.556791104050833e-05,
"loss": 0.6543,
"step": 10360
},
{
"epoch": 2.45,
"learning_rate": 5.509134233518665e-05,
"loss": 0.6428,
"step": 10380
},
{
"epoch": 2.46,
"learning_rate": 5.4614773629864966e-05,
"loss": 0.6491,
"step": 10400
},
{
"epoch": 2.46,
"eval_loss": 0.6400973796844482,
"eval_runtime": 49.3411,
"eval_samples_per_second": 40.534,
"eval_steps_per_second": 2.533,
"step": 10400
},
{
"epoch": 2.46,
"learning_rate": 5.413820492454328e-05,
"loss": 0.649,
"step": 10420
},
{
"epoch": 2.47,
"learning_rate": 5.36616362192216e-05,
"loss": 0.6494,
"step": 10440
},
{
"epoch": 2.47,
"learning_rate": 5.3185067513899913e-05,
"loss": 0.6431,
"step": 10460
},
{
"epoch": 2.48,
"learning_rate": 5.2708498808578234e-05,
"loss": 0.6478,
"step": 10480
},
{
"epoch": 2.48,
"learning_rate": 5.223193010325655e-05,
"loss": 0.6416,
"step": 10500
},
{
"epoch": 2.49,
"learning_rate": 5.175536139793486e-05,
"loss": 0.6507,
"step": 10520
},
{
"epoch": 2.49,
"learning_rate": 5.127879269261318e-05,
"loss": 0.6448,
"step": 10540
},
{
"epoch": 2.5,
"learning_rate": 5.0802223987291496e-05,
"loss": 0.6455,
"step": 10560
},
{
"epoch": 2.5,
"learning_rate": 5.032565528196981e-05,
"loss": 0.6437,
"step": 10580
},
{
"epoch": 2.51,
"learning_rate": 4.984908657664813e-05,
"loss": 0.6488,
"step": 10600
},
{
"epoch": 2.51,
"eval_loss": 0.6400858163833618,
"eval_runtime": 49.8084,
"eval_samples_per_second": 40.154,
"eval_steps_per_second": 2.51,
"step": 10600
},
{
"epoch": 2.51,
"learning_rate": 4.937251787132645e-05,
"loss": 0.6436,
"step": 10620
},
{
"epoch": 2.52,
"learning_rate": 4.889594916600476e-05,
"loss": 0.6446,
"step": 10640
},
{
"epoch": 2.52,
"learning_rate": 4.841938046068308e-05,
"loss": 0.6488,
"step": 10660
},
{
"epoch": 2.52,
"learning_rate": 4.79428117553614e-05,
"loss": 0.6485,
"step": 10680
},
{
"epoch": 2.53,
"learning_rate": 4.7466243050039705e-05,
"loss": 0.6524,
"step": 10700
},
{
"epoch": 2.53,
"learning_rate": 4.6989674344718026e-05,
"loss": 0.6376,
"step": 10720
},
{
"epoch": 2.54,
"learning_rate": 4.6513105639396346e-05,
"loss": 0.649,
"step": 10740
},
{
"epoch": 2.54,
"learning_rate": 4.603653693407465e-05,
"loss": 0.6444,
"step": 10760
},
{
"epoch": 2.55,
"learning_rate": 4.5559968228752974e-05,
"loss": 0.6407,
"step": 10780
},
{
"epoch": 2.55,
"learning_rate": 4.5083399523431294e-05,
"loss": 0.6448,
"step": 10800
},
{
"epoch": 2.55,
"eval_loss": 0.6392157077789307,
"eval_runtime": 49.7963,
"eval_samples_per_second": 40.164,
"eval_steps_per_second": 2.51,
"step": 10800
},
{
"epoch": 2.56,
"learning_rate": 4.46068308181096e-05,
"loss": 0.6454,
"step": 10820
},
{
"epoch": 2.56,
"learning_rate": 4.413026211278792e-05,
"loss": 0.6544,
"step": 10840
},
{
"epoch": 2.57,
"learning_rate": 4.365369340746624e-05,
"loss": 0.6478,
"step": 10860
},
{
"epoch": 2.57,
"learning_rate": 4.3177124702144556e-05,
"loss": 0.6434,
"step": 10880
},
{
"epoch": 2.58,
"learning_rate": 4.270055599682287e-05,
"loss": 0.6482,
"step": 10900
},
{
"epoch": 2.58,
"learning_rate": 4.222398729150119e-05,
"loss": 0.6403,
"step": 10920
},
{
"epoch": 2.59,
"learning_rate": 4.1747418586179504e-05,
"loss": 0.6501,
"step": 10940
},
{
"epoch": 2.59,
"learning_rate": 4.127084988085782e-05,
"loss": 0.6507,
"step": 10960
},
{
"epoch": 2.6,
"learning_rate": 4.079428117553614e-05,
"loss": 0.6496,
"step": 10980
},
{
"epoch": 2.6,
"learning_rate": 4.031771247021445e-05,
"loss": 0.6544,
"step": 11000
},
{
"epoch": 2.6,
"eval_loss": 0.6390016078948975,
"eval_runtime": 49.6306,
"eval_samples_per_second": 40.298,
"eval_steps_per_second": 2.519,
"step": 11000
},
{
"epoch": 2.61,
"learning_rate": 3.984114376489277e-05,
"loss": 0.6405,
"step": 11020
},
{
"epoch": 2.61,
"learning_rate": 3.9364575059571086e-05,
"loss": 0.6429,
"step": 11040
},
{
"epoch": 2.61,
"learning_rate": 3.88880063542494e-05,
"loss": 0.6403,
"step": 11060
},
{
"epoch": 2.62,
"learning_rate": 3.841143764892772e-05,
"loss": 0.6338,
"step": 11080
},
{
"epoch": 2.62,
"learning_rate": 3.7934868943606034e-05,
"loss": 0.6417,
"step": 11100
},
{
"epoch": 2.63,
"learning_rate": 3.7458300238284354e-05,
"loss": 0.6463,
"step": 11120
},
{
"epoch": 2.63,
"learning_rate": 3.698173153296267e-05,
"loss": 0.6498,
"step": 11140
},
{
"epoch": 2.64,
"learning_rate": 3.650516282764098e-05,
"loss": 0.6415,
"step": 11160
},
{
"epoch": 2.64,
"learning_rate": 3.6028594122319296e-05,
"loss": 0.645,
"step": 11180
},
{
"epoch": 2.65,
"learning_rate": 3.5552025416997616e-05,
"loss": 0.6467,
"step": 11200
},
{
"epoch": 2.65,
"eval_loss": 0.6387213468551636,
"eval_runtime": 49.1775,
"eval_samples_per_second": 40.669,
"eval_steps_per_second": 2.542,
"step": 11200
},
{
"epoch": 2.65,
"learning_rate": 3.507545671167593e-05,
"loss": 0.6515,
"step": 11220
},
{
"epoch": 2.66,
"learning_rate": 3.4598888006354244e-05,
"loss": 0.65,
"step": 11240
},
{
"epoch": 2.66,
"learning_rate": 3.4122319301032564e-05,
"loss": 0.6512,
"step": 11260
},
{
"epoch": 2.67,
"learning_rate": 3.364575059571088e-05,
"loss": 0.6443,
"step": 11280
},
{
"epoch": 2.67,
"learning_rate": 3.316918189038919e-05,
"loss": 0.6483,
"step": 11300
},
{
"epoch": 2.68,
"learning_rate": 3.269261318506751e-05,
"loss": 0.6455,
"step": 11320
},
{
"epoch": 2.68,
"learning_rate": 3.2216044479745826e-05,
"loss": 0.6461,
"step": 11340
},
{
"epoch": 2.69,
"learning_rate": 3.173947577442414e-05,
"loss": 0.6505,
"step": 11360
},
{
"epoch": 2.69,
"learning_rate": 3.126290706910246e-05,
"loss": 0.6517,
"step": 11380
},
{
"epoch": 2.7,
"learning_rate": 3.0786338363780774e-05,
"loss": 0.6406,
"step": 11400
},
{
"epoch": 2.7,
"eval_loss": 0.6380326151847839,
"eval_runtime": 49.4129,
"eval_samples_per_second": 40.475,
"eval_steps_per_second": 2.53,
"step": 11400
},
{
"epoch": 2.7,
"learning_rate": 3.030976965845909e-05,
"loss": 0.647,
"step": 11420
},
{
"epoch": 2.7,
"learning_rate": 2.9833200953137408e-05,
"loss": 0.6495,
"step": 11440
},
{
"epoch": 2.71,
"learning_rate": 2.9356632247815725e-05,
"loss": 0.6448,
"step": 11460
},
{
"epoch": 2.71,
"learning_rate": 2.888006354249404e-05,
"loss": 0.6447,
"step": 11480
},
{
"epoch": 2.72,
"learning_rate": 2.840349483717236e-05,
"loss": 0.6527,
"step": 11500
},
{
"epoch": 2.72,
"learning_rate": 2.7926926131850673e-05,
"loss": 0.6406,
"step": 11520
},
{
"epoch": 2.73,
"learning_rate": 2.7450357426528987e-05,
"loss": 0.6443,
"step": 11540
},
{
"epoch": 2.73,
"learning_rate": 2.6973788721207307e-05,
"loss": 0.6351,
"step": 11560
},
{
"epoch": 2.74,
"learning_rate": 2.649722001588562e-05,
"loss": 0.6417,
"step": 11580
},
{
"epoch": 2.74,
"learning_rate": 2.6020651310563938e-05,
"loss": 0.6356,
"step": 11600
},
{
"epoch": 2.74,
"eval_loss": 0.6381237506866455,
"eval_runtime": 49.5534,
"eval_samples_per_second": 40.36,
"eval_steps_per_second": 2.523,
"step": 11600
},
{
"epoch": 2.75,
"learning_rate": 2.5544082605242255e-05,
"loss": 0.6412,
"step": 11620
},
{
"epoch": 2.75,
"learning_rate": 2.506751389992057e-05,
"loss": 0.6418,
"step": 11640
},
{
"epoch": 2.76,
"learning_rate": 2.4590945194598886e-05,
"loss": 0.6426,
"step": 11660
},
{
"epoch": 2.76,
"learning_rate": 2.4114376489277203e-05,
"loss": 0.6461,
"step": 11680
},
{
"epoch": 2.77,
"learning_rate": 2.363780778395552e-05,
"loss": 0.6475,
"step": 11700
},
{
"epoch": 2.77,
"learning_rate": 2.3161239078633834e-05,
"loss": 0.6431,
"step": 11720
},
{
"epoch": 2.78,
"learning_rate": 2.2684670373312148e-05,
"loss": 0.6416,
"step": 11740
},
{
"epoch": 2.78,
"learning_rate": 2.2208101667990468e-05,
"loss": 0.6495,
"step": 11760
},
{
"epoch": 2.78,
"learning_rate": 2.1731532962668782e-05,
"loss": 0.6404,
"step": 11780
},
{
"epoch": 2.79,
"learning_rate": 2.1254964257347096e-05,
"loss": 0.6434,
"step": 11800
},
{
"epoch": 2.79,
"eval_loss": 0.6377163529396057,
"eval_runtime": 49.328,
"eval_samples_per_second": 40.545,
"eval_steps_per_second": 2.534,
"step": 11800
},
{
"epoch": 2.79,
"learning_rate": 2.0778395552025416e-05,
"loss": 0.6437,
"step": 11820
},
{
"epoch": 2.8,
"learning_rate": 2.030182684670373e-05,
"loss": 0.6393,
"step": 11840
},
{
"epoch": 2.8,
"learning_rate": 1.9825258141382047e-05,
"loss": 0.6412,
"step": 11860
},
{
"epoch": 2.81,
"learning_rate": 1.9348689436060364e-05,
"loss": 0.6494,
"step": 11880
},
{
"epoch": 2.81,
"learning_rate": 1.887212073073868e-05,
"loss": 0.6481,
"step": 11900
},
{
"epoch": 2.82,
"learning_rate": 1.8395552025416998e-05,
"loss": 0.6407,
"step": 11920
},
{
"epoch": 2.82,
"learning_rate": 1.7918983320095312e-05,
"loss": 0.6422,
"step": 11940
},
{
"epoch": 2.83,
"learning_rate": 1.744241461477363e-05,
"loss": 0.6487,
"step": 11960
},
{
"epoch": 2.83,
"learning_rate": 1.6965845909451946e-05,
"loss": 0.6478,
"step": 11980
},
{
"epoch": 2.84,
"learning_rate": 1.648927720413026e-05,
"loss": 0.6451,
"step": 12000
},
{
"epoch": 2.84,
"eval_loss": 0.6374698281288147,
"eval_runtime": 49.9107,
"eval_samples_per_second": 40.072,
"eval_steps_per_second": 2.504,
"step": 12000
},
{
"epoch": 2.84,
"learning_rate": 1.6012708498808577e-05,
"loss": 0.6454,
"step": 12020
},
{
"epoch": 2.85,
"learning_rate": 1.5536139793486894e-05,
"loss": 0.6399,
"step": 12040
},
{
"epoch": 2.85,
"learning_rate": 1.5059571088165208e-05,
"loss": 0.6479,
"step": 12060
},
{
"epoch": 2.86,
"learning_rate": 1.4583002382843525e-05,
"loss": 0.6412,
"step": 12080
},
{
"epoch": 2.86,
"learning_rate": 1.4106433677521842e-05,
"loss": 0.65,
"step": 12100
},
{
"epoch": 2.87,
"learning_rate": 1.3629864972200157e-05,
"loss": 0.6461,
"step": 12120
},
{
"epoch": 2.87,
"learning_rate": 1.3153296266878475e-05,
"loss": 0.6434,
"step": 12140
},
{
"epoch": 2.87,
"learning_rate": 1.2676727561556788e-05,
"loss": 0.6463,
"step": 12160
},
{
"epoch": 2.88,
"learning_rate": 1.2200158856235105e-05,
"loss": 0.6399,
"step": 12180
},
{
"epoch": 2.88,
"learning_rate": 1.1723590150913422e-05,
"loss": 0.6446,
"step": 12200
},
{
"epoch": 2.88,
"eval_loss": 0.6372544765472412,
"eval_runtime": 49.6265,
"eval_samples_per_second": 40.301,
"eval_steps_per_second": 2.519,
"step": 12200
},
{
"epoch": 2.89,
"learning_rate": 1.1247021445591738e-05,
"loss": 0.6411,
"step": 12220
},
{
"epoch": 2.89,
"learning_rate": 1.0770452740270055e-05,
"loss": 0.6523,
"step": 12240
},
{
"epoch": 2.9,
"learning_rate": 1.0293884034948372e-05,
"loss": 0.6456,
"step": 12260
},
{
"epoch": 2.9,
"learning_rate": 9.817315329626686e-06,
"loss": 0.6394,
"step": 12280
},
{
"epoch": 2.91,
"learning_rate": 9.340746624305003e-06,
"loss": 0.6466,
"step": 12300
},
{
"epoch": 2.91,
"learning_rate": 8.864177918983318e-06,
"loss": 0.6415,
"step": 12320
},
{
"epoch": 2.92,
"learning_rate": 8.387609213661635e-06,
"loss": 0.6349,
"step": 12340
},
{
"epoch": 2.92,
"learning_rate": 7.911040508339953e-06,
"loss": 0.6415,
"step": 12360
},
{
"epoch": 2.93,
"learning_rate": 7.434471803018268e-06,
"loss": 0.6484,
"step": 12380
},
{
"epoch": 2.93,
"learning_rate": 6.957903097696583e-06,
"loss": 0.6522,
"step": 12400
},
{
"epoch": 2.93,
"eval_loss": 0.6368712186813354,
"eval_runtime": 49.0693,
"eval_samples_per_second": 40.759,
"eval_steps_per_second": 2.547,
"step": 12400
},
{
"epoch": 2.94,
"learning_rate": 6.4813343923749005e-06,
"loss": 0.643,
"step": 12420
},
{
"epoch": 2.94,
"learning_rate": 6.004765687053216e-06,
"loss": 0.6515,
"step": 12440
},
{
"epoch": 2.95,
"learning_rate": 5.528196981731532e-06,
"loss": 0.6512,
"step": 12460
},
{
"epoch": 2.95,
"learning_rate": 5.051628276409849e-06,
"loss": 0.6483,
"step": 12480
},
{
"epoch": 2.96,
"learning_rate": 4.575059571088165e-06,
"loss": 0.6342,
"step": 12500
},
{
"epoch": 2.96,
"learning_rate": 4.098490865766481e-06,
"loss": 0.6445,
"step": 12520
},
{
"epoch": 2.96,
"learning_rate": 3.6219221604447972e-06,
"loss": 0.6419,
"step": 12540
},
{
"epoch": 2.97,
"learning_rate": 3.145353455123113e-06,
"loss": 0.642,
"step": 12560
},
{
"epoch": 2.97,
"learning_rate": 2.6687847498014293e-06,
"loss": 0.6541,
"step": 12580
},
{
"epoch": 2.98,
"learning_rate": 2.1922160444797456e-06,
"loss": 0.6334,
"step": 12600
},
{
"epoch": 2.98,
"eval_loss": 0.636846125125885,
"eval_runtime": 49.1332,
"eval_samples_per_second": 40.706,
"eval_steps_per_second": 2.544,
"step": 12600
}
],
"max_steps": 12690,
"num_train_epochs": 3,
"total_flos": 1.6375945250008465e+19,
"trial_name": null,
"trial_params": null
}