indobart-base-v2 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 18750,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03,
"grad_norm": 13.29955005645752,
"learning_rate": 1e-5,
"loss": 3.9042,
"step": 100
},
{
"epoch": 0.05,
"grad_norm": 1.7849366664886475,
"learning_rate": 2e-5,
"loss": 0.6647,
"step": 200
},
{
"epoch": 0.08,
"grad_norm": 1.459277868270874,
"learning_rate": 3e-5,
"loss": 0.5227,
"step": 300
},
{
"epoch": 0.11,
"grad_norm": 1.3880829811096191,
"learning_rate": 4e-5,
"loss": 0.5149,
"step": 400
},
{
"epoch": 0.13,
"grad_norm": 1.3338109254837036,
"learning_rate": 5e-5,
"loss": 0.4984,
"step": 500
},
{
"epoch": 0.16,
"grad_norm": 1.065587043762207,
"learning_rate": 4.972602739726028e-5,
"loss": 0.4847,
"step": 600
},
{
"epoch": 0.19,
"grad_norm": 1.0945565700531006,
"learning_rate": 4.945205479452055e-5,
"loss": 0.4615,
"step": 700
},
{
"epoch": 0.21,
"grad_norm": 1.0960415601730347,
"learning_rate": 4.917808219178082e-5,
"loss": 0.4504,
"step": 800
},
{
"epoch": 0.24,
"grad_norm": 1.0467826128005981,
"learning_rate": 4.89041095890411e-5,
"loss": 0.4415,
"step": 900
},
{
"epoch": 0.27,
"grad_norm": 1.0690410137176514,
"learning_rate": 4.863013698630137e-5,
"loss": 0.4497,
"step": 1000
},
{
"epoch": 0.29,
"grad_norm": 0.9305112957954407,
"learning_rate": 4.835616438356165e-5,
"loss": 0.4363,
"step": 1100
},
{
"epoch": 0.32,
"grad_norm": 0.8500822186470032,
"learning_rate": 4.808219178082192e-5,
"loss": 0.4306,
"step": 1200
},
{
"epoch": 0.35,
"grad_norm": 1.2137833833694458,
"learning_rate": 4.780821917808219e-5,
"loss": 0.423,
"step": 1300
},
{
"epoch": 0.37,
"grad_norm": 0.9352328777313232,
"learning_rate": 4.753424657534247e-5,
"loss": 0.4161,
"step": 1400
},
{
"epoch": 0.4,
"grad_norm": 1.1027398109436035,
"learning_rate": 4.726027397260274e-5,
"loss": 0.4083,
"step": 1500
},
{
"epoch": 0.43,
"grad_norm": 0.924199104309082,
"learning_rate": 4.698630136986302e-5,
"loss": 0.4254,
"step": 1600
},
{
"epoch": 0.45,
"grad_norm": 0.8912659883499146,
"learning_rate": 4.671232876712329e-5,
"loss": 0.3918,
"step": 1700
},
{
"epoch": 0.48,
"grad_norm": 1.0025393962860107,
"learning_rate": 4.643835616438356e-5,
"loss": 0.4046,
"step": 1800
},
{
"epoch": 0.51,
"grad_norm": 0.9362453818321228,
"learning_rate": 4.616438356164384e-5,
"loss": 0.3979,
"step": 1900
},
{
"epoch": 0.53,
"grad_norm": 0.8841680884361267,
"learning_rate": 4.589041095890411e-5,
"loss": 0.3938,
"step": 2000
},
{
"epoch": 0.56,
"grad_norm": 0.898572564125061,
"learning_rate": 4.561643835616439e-5,
"loss": 0.3974,
"step": 2100
},
{
"epoch": 0.59,
"grad_norm": 0.9011989831924438,
"learning_rate": 4.534246575342466e-5,
"loss": 0.3955,
"step": 2200
},
{
"epoch": 0.61,
"grad_norm": 0.913512110710144,
"learning_rate": 4.506849315068493e-5,
"loss": 0.3842,
"step": 2300
},
{
"epoch": 0.64,
"grad_norm": 0.9077229499816895,
"learning_rate": 4.479452054794521e-5,
"loss": 0.3756,
"step": 2400
},
{
"epoch": 0.67,
"grad_norm": 0.8107369542121887,
"learning_rate": 4.452054794520548e-5,
"loss": 0.3781,
"step": 2500
},
{
"epoch": 0.69,
"grad_norm": 0.962982714176178,
"learning_rate": 4.424657534246576e-5,
"loss": 0.3728,
"step": 2600
},
{
"epoch": 0.72,
"grad_norm": 0.9333651065826416,
"learning_rate": 4.3972602739726035e-5,
"loss": 0.3731,
"step": 2700
},
{
"epoch": 0.75,
"grad_norm": 0.9969388246536255,
"learning_rate": 4.36986301369863e-5,
"loss": 0.3793,
"step": 2800
},
{
"epoch": 0.77,
"grad_norm": 0.8959200978279114,
"learning_rate": 4.342465753424658e-5,
"loss": 0.3787,
"step": 2900
},
{
"epoch": 0.8,
"grad_norm": 0.8185614943504333,
"learning_rate": 4.3150684931506855e-5,
"loss": 0.3626,
"step": 3000
},
{
"epoch": 0.83,
"grad_norm": 1.1243007183074951,
"learning_rate": 4.2876712328767126e-5,
"loss": 0.3681,
"step": 3100
},
{
"epoch": 0.85,
"grad_norm": 1.0251150131225586,
"learning_rate": 4.2602739726027404e-5,
"loss": 0.3609,
"step": 3200
},
{
"epoch": 0.88,
"grad_norm": 0.8459119200706482,
"learning_rate": 4.232876712328767e-5,
"loss": 0.3608,
"step": 3300
},
{
"epoch": 0.91,
"grad_norm": 0.9018300175666809,
"learning_rate": 4.2054794520547946e-5,
"loss": 0.3779,
"step": 3400
},
{
"epoch": 0.93,
"grad_norm": 0.804707407951355,
"learning_rate": 4.1780821917808224e-5,
"loss": 0.3674,
"step": 3500
},
{
"epoch": 0.96,
"grad_norm": 0.8781819343566895,
"learning_rate": 4.1506849315068495e-5,
"loss": 0.3635,
"step": 3600
},
{
"epoch": 0.99,
"grad_norm": 0.9687257409095764,
"learning_rate": 4.123287671232877e-5,
"loss": 0.3617,
"step": 3700
},
{
"epoch": 1.01,
"grad_norm": 0.7628118991851807,
"learning_rate": 4.0958904109589044e-5,
"loss": 0.3352,
"step": 3800
},
{
"epoch": 1.04,
"grad_norm": 0.9802232980728149,
"learning_rate": 4.0684931506849315e-5,
"loss": 0.3244,
"step": 3900
},
{
"epoch": 1.07,
"grad_norm": 0.7366902828216553,
"learning_rate": 4.041095890410959e-5,
"loss": 0.3237,
"step": 4000
},
{
"epoch": 1.09,
"grad_norm": 0.8844860196113586,
"learning_rate": 4.0136986301369864e-5,
"loss": 0.3296,
"step": 4100
},
{
"epoch": 1.12,
"grad_norm": 0.692650556564331,
"learning_rate": 3.9863013698630135e-5,
"loss": 0.3165,
"step": 4200
},
{
"epoch": 1.15,
"grad_norm": 0.8171700239181519,
"learning_rate": 3.958904109589041e-5,
"loss": 0.323,
"step": 4300
},
{
"epoch": 1.17,
"grad_norm": 0.9350169897079468,
"learning_rate": 3.9315068493150684e-5,
"loss": 0.3259,
"step": 4400
},
{
"epoch": 1.2,
"grad_norm": 0.9551327228546143,
"learning_rate": 3.904109589041096e-5,
"loss": 0.3252,
"step": 4500
},
{
"epoch": 1.23,
"grad_norm": 0.8646096587181091,
"learning_rate": 3.8767123287671233e-5,
"loss": 0.3267,
"step": 4600
},
{
"epoch": 1.25,
"grad_norm": 0.8012389540672302,
"learning_rate": 3.8493150684931505e-5,
"loss": 0.3149,
"step": 4700
},
{
"epoch": 1.28,
"grad_norm": 0.833848774433136,
"learning_rate": 3.821917808219178e-5,
"loss": 0.3164,
"step": 4800
},
{
"epoch": 1.31,
"grad_norm": 0.7836089730262756,
"learning_rate": 3.7945205479452054e-5,
"loss": 0.3206,
"step": 4900
},
{
"epoch": 1.33,
"grad_norm": 0.8694811463356018,
"learning_rate": 3.767123287671233e-5,
"loss": 0.3187,
"step": 5000
},
{
"epoch": 1.36,
"grad_norm": 0.8749567866325378,
"learning_rate": 3.739726027397261e-5,
"loss": 0.3165,
"step": 5100
},
{
"epoch": 1.39,
"grad_norm": 0.8689484596252441,
"learning_rate": 3.7123287671232874e-5,
"loss": 0.3154,
"step": 5200
},
{
"epoch": 1.41,
"grad_norm": 0.8809706568717957,
"learning_rate": 3.684931506849315e-5,
"loss": 0.3301,
"step": 5300
},
{
"epoch": 1.44,
"grad_norm": 0.8677769899368286,
"learning_rate": 3.657534246575342e-5,
"loss": 0.3184,
"step": 5400
},
{
"epoch": 1.47,
"grad_norm": 0.8212382793426514,
"learning_rate": 3.63013698630137e-5,
"loss": 0.3181,
"step": 5500
},
{
"epoch": 1.49,
"grad_norm": 0.8636347651481628,
"learning_rate": 3.602739726027398e-5,
"loss": 0.3138,
"step": 5600
},
{
"epoch": 1.52,
"grad_norm": 0.8136293292045593,
"learning_rate": 3.575342465753424e-5,
"loss": 0.3156,
"step": 5700
},
{
"epoch": 1.55,
"grad_norm": 0.7700251936912537,
"learning_rate": 3.547945205479452e-5,
"loss": 0.3179,
"step": 5800
},
{
"epoch": 1.57,
"grad_norm": 0.7282480597496033,
"learning_rate": 3.52054794520548e-5,
"loss": 0.3188,
"step": 5900
},
{
"epoch": 1.6,
"grad_norm": 0.7657186388969421,
"learning_rate": 3.493150684931507e-5,
"loss": 0.3137,
"step": 6000
},
{
"epoch": 1.63,
"grad_norm": 0.8558144569396973,
"learning_rate": 3.465753424657535e-5,
"loss": 0.3192,
"step": 6100
},
{
"epoch": 1.65,
"grad_norm": 0.7496147751808167,
"learning_rate": 3.438356164383562e-5,
"loss": 0.3175,
"step": 6200
},
{
"epoch": 1.68,
"grad_norm": 0.9365683794021606,
"learning_rate": 3.410958904109589e-5,
"loss": 0.3124,
"step": 6300
},
{
"epoch": 1.71,
"grad_norm": 0.8127835392951965,
"learning_rate": 3.383561643835617e-5,
"loss": 0.3056,
"step": 6400
},
{
"epoch": 1.73,
"grad_norm": 0.819684624671936,
"learning_rate": 3.356164383561644e-5,
"loss": 0.3144,
"step": 6500
},
{
"epoch": 1.76,
"grad_norm": 0.7603724598884583,
"learning_rate": 3.328767123287672e-5,
"loss": 0.315,
"step": 6600
},
{
"epoch": 1.79,
"grad_norm": 0.8054817318916321,
"learning_rate": 3.301369863013699e-5,
"loss": 0.3073,
"step": 6700
},
{
"epoch": 1.81,
"grad_norm": 0.758423924446106,
"learning_rate": 3.273972602739726e-5,
"loss": 0.312,
"step": 6800
},
{
"epoch": 1.84,
"grad_norm": 0.8245046138763428,
"learning_rate": 3.246575342465754e-5,
"loss": 0.3125,
"step": 6900
},
{
"epoch": 1.87,
"grad_norm": 0.7906696796417236,
"learning_rate": 3.219178082191781e-5,
"loss": 0.3009,
"step": 7000
},
{
"epoch": 1.89,
"grad_norm": 0.8566040992736816,
"learning_rate": 3.1917808219178086e-5,
"loss": 0.3043,
"step": 7100
},
{
"epoch": 1.92,
"grad_norm": 0.7341597080230713,
"learning_rate": 3.164383561643836e-5,
"loss": 0.309,
"step": 7200
},
{
"epoch": 1.95,
"grad_norm": 0.7561280131340027,
"learning_rate": 3.136986301369863e-5,
"loss": 0.3051,
"step": 7300
},
{
"epoch": 1.97,
"grad_norm": 0.7900431156158447,
"learning_rate": 3.1095890410958906e-5,
"loss": 0.3093,
"step": 7400
},
{
"epoch": 2.0,
"grad_norm": 0.880424976348877,
"learning_rate": 3.082191780821918e-5,
"loss": 0.3058,
"step": 7500
},
{
"epoch": 2.03,
"grad_norm": 0.8830358982086182,
"learning_rate": 3.0547945205479455e-5,
"loss": 0.2673,
"step": 7600
},
{
"epoch": 2.05,
"grad_norm": 0.6983394026756287,
"learning_rate": 3.0273972602739726e-5,
"loss": 0.2739,
"step": 7700
},
{
"epoch": 2.08,
"grad_norm": 0.8467246890068054,
"learning_rate": 3e-5,
"loss": 0.2694,
"step": 7800
},
{
"epoch": 2.11,
"grad_norm": 0.8425388932228088,
"learning_rate": 2.9726027397260275e-5,
"loss": 0.2698,
"step": 7900
},
{
"epoch": 2.13,
"grad_norm": 0.6956115365028381,
"learning_rate": 2.945205479452055e-5,
"loss": 0.2616,
"step": 8000
},
{
"epoch": 2.16,
"grad_norm": 0.9649244546890259,
"learning_rate": 2.9178082191780824e-5,
"loss": 0.2763,
"step": 8100
},
{
"epoch": 2.19,
"grad_norm": 0.7081593871116638,
"learning_rate": 2.8904109589041095e-5,
"loss": 0.2683,
"step": 8200
},
{
"epoch": 2.21,
"grad_norm": 0.9411781430244446,
"learning_rate": 2.863013698630137e-5,
"loss": 0.2621,
"step": 8300
},
{
"epoch": 2.24,
"grad_norm": 0.8201924562454224,
"learning_rate": 2.8356164383561644e-5,
"loss": 0.2701,
"step": 8400
},
{
"epoch": 2.27,
"grad_norm": 0.8518856167793274,
"learning_rate": 2.808219178082192e-5,
"loss": 0.272,
"step": 8500
},
{
"epoch": 2.29,
"grad_norm": 0.8004194498062134,
"learning_rate": 2.7808219178082197e-5,
"loss": 0.267,
"step": 8600
},
{
"epoch": 2.32,
"grad_norm": 0.9312605857849121,
"learning_rate": 2.7534246575342465e-5,
"loss": 0.2632,
"step": 8700
},
{
"epoch": 2.35,
"grad_norm": 0.8414776921272278,
"learning_rate": 2.726027397260274e-5,
"loss": 0.2681,
"step": 8800
},
{
"epoch": 2.37,
"grad_norm": 0.6925989985466003,
"learning_rate": 2.6986301369863014e-5,
"loss": 0.2668,
"step": 8900
},
{
"epoch": 2.4,
"grad_norm": 0.9184579849243164,
"learning_rate": 2.671232876712329e-5,
"loss": 0.2673,
"step": 9000
},
{
"epoch": 2.43,
"grad_norm": 1.1033433675765991,
"learning_rate": 2.6438356164383566e-5,
"loss": 0.2684,
"step": 9100
},
{
"epoch": 2.45,
"grad_norm": 0.9113504886627197,
"learning_rate": 2.6164383561643834e-5,
"loss": 0.2644,
"step": 9200
},
{
"epoch": 2.48,
"grad_norm": 0.7905146479606628,
"learning_rate": 2.589041095890411e-5,
"loss": 0.2668,
"step": 9300
},
{
"epoch": 2.51,
"grad_norm": 0.6717493534088135,
"learning_rate": 2.5616438356164386e-5,
"loss": 0.271,
"step": 9400
},
{
"epoch": 2.53,
"grad_norm": 0.8438414335250854,
"learning_rate": 2.534246575342466e-5,
"loss": 0.2706,
"step": 9500
},
{
"epoch": 2.56,
"grad_norm": 0.8165556192398071,
"learning_rate": 2.5068493150684935e-5,
"loss": 0.2603,
"step": 9600
},
{
"epoch": 2.59,
"grad_norm": 0.8030436038970947,
"learning_rate": 2.4794520547945206e-5,
"loss": 0.2587,
"step": 9700
},
{
"epoch": 2.61,
"grad_norm": 0.8518214225769043,
"learning_rate": 2.452054794520548e-5,
"loss": 0.2533,
"step": 9800
},
{
"epoch": 2.64,
"grad_norm": 0.9882023930549622,
"learning_rate": 2.4246575342465755e-5,
"loss": 0.2561,
"step": 9900
},
{
"epoch": 2.67,
"grad_norm": 0.8175749182701111,
"learning_rate": 2.3972602739726026e-5,
"loss": 0.2572,
"step": 10000
},
{
"epoch": 2.69,
"grad_norm": 0.897048830986023,
"learning_rate": 2.36986301369863e-5,
"loss": 0.2587,
"step": 10100
},
{
"epoch": 2.72,
"grad_norm": 0.8218054175376892,
"learning_rate": 2.342465753424658e-5,
"loss": 0.2654,
"step": 10200
},
{
"epoch": 2.75,
"grad_norm": 0.7128798961639404,
"learning_rate": 2.315068493150685e-5,
"loss": 0.2642,
"step": 10300
},
{
"epoch": 2.77,
"grad_norm": 0.7982375621795654,
"learning_rate": 2.2876712328767124e-5,
"loss": 0.2537,
"step": 10400
},
{
"epoch": 2.8,
"grad_norm": 0.790105938911438,
"learning_rate": 2.2602739726027396e-5,
"loss": 0.2713,
"step": 10500
},
{
"epoch": 2.83,
"grad_norm": 0.7734562158584595,
"learning_rate": 2.2328767123287673e-5,
"loss": 0.2616,
"step": 10600
},
{
"epoch": 2.85,
"grad_norm": 0.8464659452438354,
"learning_rate": 2.2054794520547948e-5,
"loss": 0.2584,
"step": 10700
},
{
"epoch": 2.88,
"grad_norm": 0.7386855483055115,
"learning_rate": 2.178082191780822e-5,
"loss": 0.257,
"step": 10800
},
{
"epoch": 2.91,
"grad_norm": 0.7122279405593872,
"learning_rate": 2.1506849315068494e-5,
"loss": 0.2667,
"step": 10900
},
{
"epoch": 2.93,
"grad_norm": 0.8505749106407166,
"learning_rate": 2.1232876712328768e-5,
"loss": 0.2661,
"step": 11000
},
{
"epoch": 2.96,
"grad_norm": 0.8915577530860901,
"learning_rate": 2.0958904109589043e-5,
"loss": 0.2567,
"step": 11100
},
{
"epoch": 2.99,
"grad_norm": 0.9431042671203613,
"learning_rate": 2.0684931506849317e-5,
"loss": 0.2578,
"step": 11200
},
{
"epoch": 3.01,
"grad_norm": 0.7943726181983948,
"learning_rate": 2.0410958904109588e-5,
"loss": 0.2393,
"step": 11300
},
{
"epoch": 3.04,
"grad_norm": 0.8244442939758301,
"learning_rate": 2.0136986301369866e-5,
"loss": 0.2175,
"step": 11400
},
{
"epoch": 3.07,
"grad_norm": 0.7802647948265076,
"learning_rate": 1.9863013698630137e-5,
"loss": 0.2161,
"step": 11500
},
{
"epoch": 3.09,
"grad_norm": 1.1162070035934448,
"learning_rate": 1.9589041095890412e-5,
"loss": 0.2211,
"step": 11600
},
{
"epoch": 3.12,
"grad_norm": 1.0273113250732422,
"learning_rate": 1.9315068493150686e-5,
"loss": 0.2253,
"step": 11700
},
{
"epoch": 3.15,
"grad_norm": 1.0477781295776367,
"learning_rate": 1.904109589041096e-5,
"loss": 0.2213,
"step": 11800
},
{
"epoch": 3.17,
"grad_norm": 0.9134103655815125,
"learning_rate": 1.8767123287671235e-5,
"loss": 0.2269,
"step": 11900
},
{
"epoch": 3.2,
"grad_norm": 0.8156262636184692,
"learning_rate": 1.8493150684931506e-5,
"loss": 0.2245,
"step": 12000
},
{
"epoch": 3.23,
"grad_norm": 0.9004743695259094,
"learning_rate": 1.821917808219178e-5,
"loss": 0.2254,
"step": 12100
},
{
"epoch": 3.25,
"grad_norm": 0.8386040925979614,
"learning_rate": 1.7945205479452055e-5,
"loss": 0.2292,
"step": 12200
},
{
"epoch": 3.28,
"grad_norm": 0.9777556657791138,
"learning_rate": 1.767123287671233e-5,
"loss": 0.2213,
"step": 12300
},
{
"epoch": 3.31,
"grad_norm": 0.7827901244163513,
"learning_rate": 1.7397260273972604e-5,
"loss": 0.2174,
"step": 12400
},
{
"epoch": 3.33,
"grad_norm": 0.7424948811531067,
"learning_rate": 1.7123287671232875e-5,
"loss": 0.2199,
"step": 12500
},
{
"epoch": 3.36,
"grad_norm": 0.8807641267776489,
"learning_rate": 1.684931506849315e-5,
"loss": 0.2204,
"step": 12600
},
{
"epoch": 3.39,
"grad_norm": 0.8479088544845581,
"learning_rate": 1.6575342465753428e-5,
"loss": 0.2241,
"step": 12700
},
{
"epoch": 3.41,
"grad_norm": 0.9211342334747314,
"learning_rate": 1.63013698630137e-5,
"loss": 0.2237,
"step": 12800
},
{
"epoch": 3.44,
"grad_norm": 0.8683446645736694,
"learning_rate": 1.6027397260273974e-5,
"loss": 0.2248,
"step": 12900
},
{
"epoch": 3.47,
"grad_norm": 0.8828756213188171,
"learning_rate": 1.5753424657534248e-5,
"loss": 0.233,
"step": 13000
},
{
"epoch": 3.49,
"grad_norm": 0.9421214461326599,
"learning_rate": 1.5479452054794523e-5,
"loss": 0.2294,
"step": 13100
},
{
"epoch": 3.52,
"grad_norm": 0.765132486820221,
"learning_rate": 1.5205479452054797e-5,
"loss": 0.2277,
"step": 13200
},
{
"epoch": 3.55,
"grad_norm": 0.9406650066375732,
"learning_rate": 1.4931506849315068e-5,
"loss": 0.217,
"step": 13300
},
{
"epoch": 3.57,
"grad_norm": 1.0174639225006104,
"learning_rate": 1.4657534246575344e-5,
"loss": 0.2265,
"step": 13400
},
{
"epoch": 3.6,
"grad_norm": 0.826392412185669,
"learning_rate": 1.4383561643835617e-5,
"loss": 0.222,
"step": 13500
},
{
"epoch": 3.63,
"grad_norm": 0.9821271300315857,
"learning_rate": 1.4109589041095892e-5,
"loss": 0.2186,
"step": 13600
},
{
"epoch": 3.65,
"grad_norm": 0.8172212839126587,
"learning_rate": 1.3835616438356164e-5,
"loss": 0.2238,
"step": 13700
},
{
"epoch": 3.68,
"grad_norm": 0.8128436207771301,
"learning_rate": 1.3561643835616439e-5,
"loss": 0.2168,
"step": 13800
},
{
"epoch": 3.71,
"grad_norm": 0.8061575293540955,
"learning_rate": 1.3287671232876714e-5,
"loss": 0.2244,
"step": 13900
},
{
"epoch": 3.73,
"grad_norm": 0.8976914882659912,
"learning_rate": 1.3013698630136986e-5,
"loss": 0.2212,
"step": 14000
},
{
"epoch": 3.76,
"grad_norm": 0.9973928332328796,
"learning_rate": 1.273972602739726e-5,
"loss": 0.2248,
"step": 14100
},
{
"epoch": 3.79,
"grad_norm": 0.8042004108428955,
"learning_rate": 1.2465753424657535e-5,
"loss": 0.2178,
"step": 14200
},
{
"epoch": 3.81,
"grad_norm": 0.8282990455627441,
"learning_rate": 1.2191780821917808e-5,
"loss": 0.2227,
"step": 14300
},
{
"epoch": 3.84,
"grad_norm": 0.6668768525123596,
"learning_rate": 1.1917808219178083e-5,
"loss": 0.2226,
"step": 14400
},
{
"epoch": 3.87,
"grad_norm": 0.7972692847251892,
"learning_rate": 1.1643835616438355e-5,
"loss": 0.2193,
"step": 14500
},
{
"epoch": 3.89,
"grad_norm": 0.7637550830841064,
"learning_rate": 1.1369863013698632e-5,
"loss": 0.2157,
"step": 14600
},
{
"epoch": 3.92,
"grad_norm": 0.8487162590026855,
"learning_rate": 1.1095890410958904e-5,
"loss": 0.2251,
"step": 14700
},
{
"epoch": 3.95,
"grad_norm": 0.8710606694221497,
"learning_rate": 1.0821917808219179e-5,
"loss": 0.2153,
"step": 14800
},
{
"epoch": 3.97,
"grad_norm": 0.8085966110229492,
"learning_rate": 1.0547945205479452e-5,
"loss": 0.2191,
"step": 14900
},
{
"epoch": 4.0,
"grad_norm": 0.94338059425354,
"learning_rate": 1.0273972602739726e-5,
"loss": 0.2184,
"step": 15000
},
{
"epoch": 4.03,
"grad_norm": 1.4945096969604492,
"learning_rate": 1e-5,
"loss": 0.1863,
"step": 15100
},
{
"epoch": 4.05,
"grad_norm": 0.9178032279014587,
"learning_rate": 9.726027397260275e-6,
"loss": 0.1854,
"step": 15200
},
{
"epoch": 4.08,
"grad_norm": 0.8616482615470886,
"learning_rate": 9.452054794520548e-6,
"loss": 0.1843,
"step": 15300
},
{
"epoch": 4.11,
"grad_norm": 0.9844592213630676,
"learning_rate": 9.178082191780823e-6,
"loss": 0.1909,
"step": 15400
},
{
"epoch": 4.13,
"grad_norm": 0.7312936186790466,
"learning_rate": 8.904109589041095e-6,
"loss": 0.1899,
"step": 15500
},
{
"epoch": 4.16,
"grad_norm": 0.9658412933349609,
"learning_rate": 8.630136986301372e-6,
"loss": 0.1878,
"step": 15600
},
{
"epoch": 4.19,
"grad_norm": 1.0498002767562866,
"learning_rate": 8.356164383561644e-6,
"loss": 0.1825,
"step": 15700
},
{
"epoch": 4.21,
"grad_norm": 0.7098029255867004,
"learning_rate": 8.082191780821919e-6,
"loss": 0.1864,
"step": 15800
},
{
"epoch": 4.24,
"grad_norm": 0.9946851134300232,
"learning_rate": 7.808219178082192e-6,
"loss": 0.1852,
"step": 15900
},
{
"epoch": 4.27,
"grad_norm": 0.9338549375534058,
"learning_rate": 7.5342465753424655e-6,
"loss": 0.1865,
"step": 16000
},
{
"epoch": 4.29,
"grad_norm": 0.8193784952163696,
"learning_rate": 7.260273972602739e-6,
"loss": 0.184,
"step": 16100
},
{
"epoch": 4.32,
"grad_norm": 0.9323195815086365,
"learning_rate": 6.9863013698630145e-6,
"loss": 0.1845,
"step": 16200
},
{
"epoch": 4.35,
"grad_norm": 0.9668224453926086,
"learning_rate": 6.712328767123288e-6,
"loss": 0.1911,
"step": 16300
},
{
"epoch": 4.37,
"grad_norm": 0.9941351413726807,
"learning_rate": 6.438356164383562e-6,
"loss": 0.1859,
"step": 16400
},
{
"epoch": 4.4,
"grad_norm": 0.9229924082756042,
"learning_rate": 6.1643835616438354e-6,
"loss": 0.1861,
"step": 16500
},
{
"epoch": 4.43,
"grad_norm": 0.8792287111282349,
"learning_rate": 5.89041095890411e-6,
"loss": 0.1903,
"step": 16600
},
{
"epoch": 4.45,
"grad_norm": 0.682725191116333,
"learning_rate": 5.616438356164384e-6,
"loss": 0.1822,
"step": 16700
},
{
"epoch": 4.48,
"grad_norm": 0.8012785315513611,
"learning_rate": 5.342465753424658e-6,
"loss": 0.1888,
"step": 16800
},
{
"epoch": 4.51,
"grad_norm": 0.7928184270858765,
"learning_rate": 5.068493150684932e-6,
"loss": 0.1869,
"step": 16900
},
{
"epoch": 4.53,
"grad_norm": 1.2073571681976318,
"learning_rate": 4.7945205479452054e-6,
"loss": 0.184,
"step": 17000
},
{
"epoch": 4.56,
"grad_norm": 0.763810396194458,
"learning_rate": 4.52054794520548e-6,
"loss": 0.1824,
"step": 17100
},
{
"epoch": 4.59,
"grad_norm": 0.8932220935821533,
"learning_rate": 4.246575342465754e-6,
"loss": 0.1898,
"step": 17200
},
{
"epoch": 4.61,
"grad_norm": 0.7250128984451294,
"learning_rate": 3.972602739726028e-6,
"loss": 0.1886,
"step": 17300
},
{
"epoch": 4.64,
"grad_norm": 1.0617702007293701,
"learning_rate": 3.6986301369863018e-6,
"loss": 0.1889,
"step": 17400
},
{
"epoch": 4.67,
"grad_norm": 0.983672022819519,
"learning_rate": 3.4246575342465754e-6,
"loss": 0.1871,
"step": 17500
},
{
"epoch": 4.69,
"grad_norm": 0.9392043352127075,
"learning_rate": 3.1506849315068495e-6,
"loss": 0.1869,
"step": 17600
},
{
"epoch": 4.72,
"grad_norm": 0.8135913014411926,
"learning_rate": 2.8767123287671236e-6,
"loss": 0.1861,
"step": 17700
},
{
"epoch": 4.75,
"grad_norm": 0.7956686615943909,
"learning_rate": 2.6027397260273973e-6,
"loss": 0.1864,
"step": 17800
},
{
"epoch": 4.77,
"grad_norm": 0.8956461548805237,
"learning_rate": 2.3287671232876713e-6,
"loss": 0.1889,
"step": 17900
},
{
"epoch": 4.8,
"grad_norm": 0.9515472054481506,
"learning_rate": 2.054794520547945e-6,
"loss": 0.1871,
"step": 18000
},
{
"epoch": 4.83,
"grad_norm": 0.8886680006980896,
"learning_rate": 1.7808219178082193e-6,
"loss": 0.187,
"step": 18100
},
{
"epoch": 4.85,
"grad_norm": 0.8525242805480957,
"learning_rate": 1.5068493150684932e-6,
"loss": 0.1832,
"step": 18200
},
{
"epoch": 4.88,
"grad_norm": 0.9522444009780884,
"learning_rate": 1.232876712328767e-6,
"loss": 0.186,
"step": 18300
},
{
"epoch": 4.91,
"grad_norm": 0.8611086010932922,
"learning_rate": 9.589041095890411e-7,
"loss": 0.1855,
"step": 18400
},
{
"epoch": 4.93,
"grad_norm": 0.9658819437026978,
"learning_rate": 6.849315068493151e-7,
"loss": 0.177,
"step": 18500
},
{
"epoch": 4.96,
"grad_norm": 0.9198510646820068,
"learning_rate": 4.1095890410958903e-7,
"loss": 0.178,
"step": 18600
},
{
"epoch": 4.99,
"grad_norm": 0.8326091766357422,
"learning_rate": 1.36986301369863e-7,
"loss": 0.1865,
"step": 18700
}
],
"logging_steps": 100,
"max_steps": 18750,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 6250,
"total_flos": 8.12664225792e16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
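
The log above can also be inspected programmatically. Below is a minimal illustrative sketch (not part of the checkpoint; it assumes Python with matplotlib installed and this file saved locally as trainer_state.json) that reads the log_history entries and plots training loss against the global step:

    # Illustrative sketch: load trainer_state.json and plot training loss vs. step.
    import json

    import matplotlib.pyplot as plt

    # Assumes the file is in the current working directory.
    with open("trainer_state.json") as f:
        state = json.load(f)

    # Each log_history entry here carries epoch, grad_norm, learning_rate, loss, and step.
    entries = [e for e in state["log_history"] if "loss" in e]
    steps = [e["step"] for e in entries]
    losses = [e["loss"] for e in entries]

    plt.plot(steps, losses)
    plt.xlabel("global step")
    plt.ylabel("training loss")
    plt.title("indobart-base-v2 fine-tuning loss")
    plt.show()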