{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 18750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 13.29955005645752, "learning_rate": 1e-5, "loss": 3.9042, "step": 100 }, { "epoch": 0.05, "grad_norm": 1.7849366664886475, "learning_rate": 2e-5, "loss": 0.6647, "step": 200 }, { "epoch": 0.08, "grad_norm": 1.459277868270874, "learning_rate": 3e-5, "loss": 0.5227, "step": 300 }, { "epoch": 0.11, "grad_norm": 1.3880829811096191, "learning_rate": 4e-5, "loss": 0.5149, "step": 400 }, { "epoch": 0.13, "grad_norm": 1.3338109254837036, "learning_rate": 5e-5, "loss": 0.4984, "step": 500 }, { "epoch": 0.16, "grad_norm": 1.065587043762207, "learning_rate": 4.972602739726028e-5, "loss": 0.4847, "step": 600 }, { "epoch": 0.19, "grad_norm": 1.0945565700531006, "learning_rate": 4.945205479452055e-5, "loss": 0.4615, "step": 700 }, { "epoch": 0.21, "grad_norm": 1.0960415601730347, "learning_rate": 4.917808219178082e-5, "loss": 0.4504, "step": 800 }, { "epoch": 0.24, "grad_norm": 1.0467826128005981, "learning_rate": 4.89041095890411e-5, "loss": 0.4415, "step": 900 }, { "epoch": 0.27, "grad_norm": 1.0690410137176514, "learning_rate": 4.863013698630137e-5, "loss": 0.4497, "step": 1000 }, { "epoch": 0.29, "grad_norm": 0.9305112957954407, "learning_rate": 4.835616438356165e-5, "loss": 0.4363, "step": 1100 }, { "epoch": 0.32, "grad_norm": 0.8500822186470032, "learning_rate": 4.808219178082192e-5, "loss": 0.4306, "step": 1200 }, { "epoch": 0.35, "grad_norm": 1.2137833833694458, "learning_rate": 4.780821917808219e-5, "loss": 0.423, "step": 1300 }, { "epoch": 0.37, "grad_norm": 0.9352328777313232, "learning_rate": 4.753424657534247e-5, "loss": 0.4161, "step": 1400 }, { "epoch": 0.4, "grad_norm": 1.1027398109436035, "learning_rate": 4.726027397260274e-5, "loss": 0.4083, "step": 1500 }, { "epoch": 0.43, "grad_norm": 0.924199104309082, "learning_rate": 4.698630136986302e-5, "loss": 0.4254, "step": 1600 }, { "epoch": 0.45, "grad_norm": 0.8912659883499146, "learning_rate": 4.671232876712329e-5, "loss": 0.3918, "step": 1700 }, { "epoch": 0.48, "grad_norm": 1.0025393962860107, "learning_rate": 4.643835616438356e-5, "loss": 0.4046, "step": 1800 }, { "epoch": 0.51, "grad_norm": 0.9362453818321228, "learning_rate": 4.616438356164384e-5, "loss": 0.3979, "step": 1900 }, { "epoch": 0.53, "grad_norm": 0.8841680884361267, "learning_rate": 4.589041095890411e-5, "loss": 0.3938, "step": 2000 }, { "epoch": 0.56, "grad_norm": 0.898572564125061, "learning_rate": 4.561643835616439e-5, "loss": 0.3974, "step": 2100 }, { "epoch": 0.59, "grad_norm": 0.9011989831924438, "learning_rate": 4.534246575342466e-5, "loss": 0.3955, "step": 2200 }, { "epoch": 0.61, "grad_norm": 0.913512110710144, "learning_rate": 4.506849315068493e-5, "loss": 0.3842, "step": 2300 }, { "epoch": 0.64, "grad_norm": 0.9077229499816895, "learning_rate": 4.479452054794521e-5, "loss": 0.3756, "step": 2400 }, { "epoch": 0.67, "grad_norm": 0.8107369542121887, "learning_rate": 4.452054794520548e-5, "loss": 0.3781, "step": 2500 }, { "epoch": 0.69, "grad_norm": 0.962982714176178, "learning_rate": 4.424657534246576e-5, "loss": 0.3728, "step": 2600 }, { "epoch": 0.72, "grad_norm": 0.9333651065826416, "learning_rate": 4.3972602739726035e-5, "loss": 0.3731, "step": 2700 }, { "epoch": 0.75, "grad_norm": 0.9969388246536255, "learning_rate": 4.36986301369863e-5, "loss": 0.3793, "step": 2800 }, { "epoch": 0.77, "grad_norm": 0.8959200978279114, "learning_rate": 4.342465753424658e-5, "loss": 0.3787, "step": 2900 }, { "epoch": 0.8, "grad_norm": 0.8185614943504333, "learning_rate": 4.3150684931506855e-5, "loss": 0.3626, "step": 3000 }, { "epoch": 0.83, "grad_norm": 1.1243007183074951, "learning_rate": 4.2876712328767126e-5, "loss": 0.3681, "step": 3100 }, { "epoch": 0.85, "grad_norm": 1.0251150131225586, "learning_rate": 4.2602739726027404e-5, "loss": 0.3609, "step": 3200 }, { "epoch": 0.88, "grad_norm": 0.8459119200706482, "learning_rate": 4.232876712328767e-5, "loss": 0.3608, "step": 3300 }, { "epoch": 0.91, "grad_norm": 0.9018300175666809, "learning_rate": 4.2054794520547946e-5, "loss": 0.3779, "step": 3400 }, { "epoch": 0.93, "grad_norm": 0.804707407951355, "learning_rate": 4.1780821917808224e-5, "loss": 0.3674, "step": 3500 }, { "epoch": 0.96, "grad_norm": 0.8781819343566895, "learning_rate": 4.1506849315068495e-5, "loss": 0.3635, "step": 3600 }, { "epoch": 0.99, "grad_norm": 0.9687257409095764, "learning_rate": 4.123287671232877e-5, "loss": 0.3617, "step": 3700 }, { "epoch": 1.01, "grad_norm": 0.7628118991851807, "learning_rate": 4.0958904109589044e-5, "loss": 0.3352, "step": 3800 }, { "epoch": 1.04, "grad_norm": 0.9802232980728149, "learning_rate": 4.0684931506849315e-5, "loss": 0.3244, "step": 3900 }, { "epoch": 1.07, "grad_norm": 0.7366902828216553, "learning_rate": 4.041095890410959e-5, "loss": 0.3237, "step": 4000 }, { "epoch": 1.09, "grad_norm": 0.8844860196113586, "learning_rate": 4.0136986301369864e-5, "loss": 0.3296, "step": 4100 }, { "epoch": 1.12, "grad_norm": 0.692650556564331, "learning_rate": 3.9863013698630135e-5, "loss": 0.3165, "step": 4200 }, { "epoch": 1.15, "grad_norm": 0.8171700239181519, "learning_rate": 3.958904109589041e-5, "loss": 0.323, "step": 4300 }, { "epoch": 1.17, "grad_norm": 0.9350169897079468, "learning_rate": 3.9315068493150684e-5, "loss": 0.3259, "step": 4400 }, { "epoch": 1.2, "grad_norm": 0.9551327228546143, "learning_rate": 3.904109589041096e-5, "loss": 0.3252, "step": 4500 }, { "epoch": 1.23, "grad_norm": 0.8646096587181091, "learning_rate": 3.8767123287671233e-5, "loss": 0.3267, "step": 4600 }, { "epoch": 1.25, "grad_norm": 0.8012389540672302, "learning_rate": 3.8493150684931505e-5, "loss": 0.3149, "step": 4700 }, { "epoch": 1.28, "grad_norm": 0.833848774433136, "learning_rate": 3.821917808219178e-5, "loss": 0.3164, "step": 4800 }, { "epoch": 1.31, "grad_norm": 0.7836089730262756, "learning_rate": 3.7945205479452054e-5, "loss": 0.3206, "step": 4900 }, { "epoch": 1.33, "grad_norm": 0.8694811463356018, "learning_rate": 3.767123287671233e-5, "loss": 0.3187, "step": 5000 }, { "epoch": 1.36, "grad_norm": 0.8749567866325378, "learning_rate": 3.739726027397261e-5, "loss": 0.3165, "step": 5100 }, { "epoch": 1.39, "grad_norm": 0.8689484596252441, "learning_rate": 3.7123287671232874e-5, "loss": 0.3154, "step": 5200 }, { "epoch": 1.41, "grad_norm": 0.8809706568717957, "learning_rate": 3.684931506849315e-5, "loss": 0.3301, "step": 5300 }, { "epoch": 1.44, "grad_norm": 0.8677769899368286, "learning_rate": 3.657534246575342e-5, "loss": 0.3184, "step": 5400 }, { "epoch": 1.47, "grad_norm": 0.8212382793426514, "learning_rate": 3.63013698630137e-5, "loss": 0.3181, "step": 5500 }, { "epoch": 1.49, "grad_norm": 0.8636347651481628, "learning_rate": 3.602739726027398e-5, "loss": 0.3138, "step": 5600 }, { "epoch": 1.52, "grad_norm": 0.8136293292045593, "learning_rate": 3.575342465753424e-5, "loss": 0.3156, "step": 5700 }, { "epoch": 1.55, "grad_norm": 0.7700251936912537, "learning_rate": 3.547945205479452e-5, "loss": 0.3179, "step": 5800 }, { "epoch": 1.57, "grad_norm": 0.7282480597496033, "learning_rate": 3.52054794520548e-5, "loss": 0.3188, "step": 5900 }, { "epoch": 1.6, "grad_norm": 0.7657186388969421, "learning_rate": 3.493150684931507e-5, "loss": 0.3137, "step": 6000 }, { "epoch": 1.63, "grad_norm": 0.8558144569396973, "learning_rate": 3.465753424657535e-5, "loss": 0.3192, "step": 6100 }, { "epoch": 1.65, "grad_norm": 0.7496147751808167, "learning_rate": 3.438356164383562e-5, "loss": 0.3175, "step": 6200 }, { "epoch": 1.68, "grad_norm": 0.9365683794021606, "learning_rate": 3.410958904109589e-5, "loss": 0.3124, "step": 6300 }, { "epoch": 1.71, "grad_norm": 0.8127835392951965, "learning_rate": 3.383561643835617e-5, "loss": 0.3056, "step": 6400 }, { "epoch": 1.73, "grad_norm": 0.819684624671936, "learning_rate": 3.356164383561644e-5, "loss": 0.3144, "step": 6500 }, { "epoch": 1.76, "grad_norm": 0.7603724598884583, "learning_rate": 3.328767123287672e-5, "loss": 0.315, "step": 6600 }, { "epoch": 1.79, "grad_norm": 0.8054817318916321, "learning_rate": 3.301369863013699e-5, "loss": 0.3073, "step": 6700 }, { "epoch": 1.81, "grad_norm": 0.758423924446106, "learning_rate": 3.273972602739726e-5, "loss": 0.312, "step": 6800 }, { "epoch": 1.84, "grad_norm": 0.8245046138763428, "learning_rate": 3.246575342465754e-5, "loss": 0.3125, "step": 6900 }, { "epoch": 1.87, "grad_norm": 0.7906696796417236, "learning_rate": 3.219178082191781e-5, "loss": 0.3009, "step": 7000 }, { "epoch": 1.89, "grad_norm": 0.8566040992736816, "learning_rate": 3.1917808219178086e-5, "loss": 0.3043, "step": 7100 }, { "epoch": 1.92, "grad_norm": 0.7341597080230713, "learning_rate": 3.164383561643836e-5, "loss": 0.309, "step": 7200 }, { "epoch": 1.95, "grad_norm": 0.7561280131340027, "learning_rate": 3.136986301369863e-5, "loss": 0.3051, "step": 7300 }, { "epoch": 1.97, "grad_norm": 0.7900431156158447, "learning_rate": 3.1095890410958906e-5, "loss": 0.3093, "step": 7400 }, { "epoch": 2.0, "grad_norm": 0.880424976348877, "learning_rate": 3.082191780821918e-5, "loss": 0.3058, "step": 7500 }, { "epoch": 2.03, "grad_norm": 0.8830358982086182, "learning_rate": 3.0547945205479455e-5, "loss": 0.2673, "step": 7600 }, { "epoch": 2.05, "grad_norm": 0.6983394026756287, "learning_rate": 3.0273972602739726e-5, "loss": 0.2739, "step": 7700 }, { "epoch": 2.08, "grad_norm": 0.8467246890068054, "learning_rate": 3e-5, "loss": 0.2694, "step": 7800 }, { "epoch": 2.11, "grad_norm": 0.8425388932228088, "learning_rate": 2.9726027397260275e-5, "loss": 0.2698, "step": 7900 }, { "epoch": 2.13, "grad_norm": 0.6956115365028381, "learning_rate": 2.945205479452055e-5, "loss": 0.2616, "step": 8000 }, { "epoch": 2.16, "grad_norm": 0.9649244546890259, "learning_rate": 2.9178082191780824e-5, "loss": 0.2763, "step": 8100 }, { "epoch": 2.19, "grad_norm": 0.7081593871116638, "learning_rate": 2.8904109589041095e-5, "loss": 0.2683, "step": 8200 }, { "epoch": 2.21, "grad_norm": 0.9411781430244446, "learning_rate": 2.863013698630137e-5, "loss": 0.2621, "step": 8300 }, { "epoch": 2.24, "grad_norm": 0.8201924562454224, "learning_rate": 2.8356164383561644e-5, "loss": 0.2701, "step": 8400 }, { "epoch": 2.27, "grad_norm": 0.8518856167793274, "learning_rate": 2.808219178082192e-5, "loss": 0.272, "step": 8500 }, { "epoch": 2.29, "grad_norm": 0.8004194498062134, "learning_rate": 2.7808219178082197e-5, "loss": 0.267, "step": 8600 }, { "epoch": 2.32, "grad_norm": 0.9312605857849121, "learning_rate": 2.7534246575342465e-5, "loss": 0.2632, "step": 8700 }, { "epoch": 2.35, "grad_norm": 0.8414776921272278, "learning_rate": 2.726027397260274e-5, "loss": 0.2681, "step": 8800 }, { "epoch": 2.37, "grad_norm": 0.6925989985466003, "learning_rate": 2.6986301369863014e-5, "loss": 0.2668, "step": 8900 }, { "epoch": 2.4, "grad_norm": 0.9184579849243164, "learning_rate": 2.671232876712329e-5, "loss": 0.2673, "step": 9000 }, { "epoch": 2.43, "grad_norm": 1.1033433675765991, "learning_rate": 2.6438356164383566e-5, "loss": 0.2684, "step": 9100 }, { "epoch": 2.45, "grad_norm": 0.9113504886627197, "learning_rate": 2.6164383561643834e-5, "loss": 0.2644, "step": 9200 }, { "epoch": 2.48, "grad_norm": 0.7905146479606628, "learning_rate": 2.589041095890411e-5, "loss": 0.2668, "step": 9300 }, { "epoch": 2.51, "grad_norm": 0.6717493534088135, "learning_rate": 2.5616438356164386e-5, "loss": 0.271, "step": 9400 }, { "epoch": 2.53, "grad_norm": 0.8438414335250854, "learning_rate": 2.534246575342466e-5, "loss": 0.2706, "step": 9500 }, { "epoch": 2.56, "grad_norm": 0.8165556192398071, "learning_rate": 2.5068493150684935e-5, "loss": 0.2603, "step": 9600 }, { "epoch": 2.59, "grad_norm": 0.8030436038970947, "learning_rate": 2.4794520547945206e-5, "loss": 0.2587, "step": 9700 }, { "epoch": 2.61, "grad_norm": 0.8518214225769043, "learning_rate": 2.452054794520548e-5, "loss": 0.2533, "step": 9800 }, { "epoch": 2.64, "grad_norm": 0.9882023930549622, "learning_rate": 2.4246575342465755e-5, "loss": 0.2561, "step": 9900 }, { "epoch": 2.67, "grad_norm": 0.8175749182701111, "learning_rate": 2.3972602739726026e-5, "loss": 0.2572, "step": 10000 }, { "epoch": 2.69, "grad_norm": 0.897048830986023, "learning_rate": 2.36986301369863e-5, "loss": 0.2587, "step": 10100 }, { "epoch": 2.72, "grad_norm": 0.8218054175376892, "learning_rate": 2.342465753424658e-5, "loss": 0.2654, "step": 10200 }, { "epoch": 2.75, "grad_norm": 0.7128798961639404, "learning_rate": 2.315068493150685e-5, "loss": 0.2642, "step": 10300 }, { "epoch": 2.77, "grad_norm": 0.7982375621795654, "learning_rate": 2.2876712328767124e-5, "loss": 0.2537, "step": 10400 }, { "epoch": 2.8, "grad_norm": 0.790105938911438, "learning_rate": 2.2602739726027396e-5, "loss": 0.2713, "step": 10500 }, { "epoch": 2.83, "grad_norm": 0.7734562158584595, "learning_rate": 2.2328767123287673e-5, "loss": 0.2616, "step": 10600 }, { "epoch": 2.85, "grad_norm": 0.8464659452438354, "learning_rate": 2.2054794520547948e-5, "loss": 0.2584, "step": 10700 }, { "epoch": 2.88, "grad_norm": 0.7386855483055115, "learning_rate": 2.178082191780822e-5, "loss": 0.257, "step": 10800 }, { "epoch": 2.91, "grad_norm": 0.7122279405593872, "learning_rate": 2.1506849315068494e-5, "loss": 0.2667, "step": 10900 }, { "epoch": 2.93, "grad_norm": 0.8505749106407166, "learning_rate": 2.1232876712328768e-5, "loss": 0.2661, "step": 11000 }, { "epoch": 2.96, "grad_norm": 0.8915577530860901, "learning_rate": 2.0958904109589043e-5, "loss": 0.2567, "step": 11100 }, { "epoch": 2.99, "grad_norm": 0.9431042671203613, "learning_rate": 2.0684931506849317e-5, "loss": 0.2578, "step": 11200 }, { "epoch": 3.01, "grad_norm": 0.7943726181983948, "learning_rate": 2.0410958904109588e-5, "loss": 0.2393, "step": 11300 }, { "epoch": 3.04, "grad_norm": 0.8244442939758301, "learning_rate": 2.0136986301369866e-5, "loss": 0.2175, "step": 11400 }, { "epoch": 3.07, "grad_norm": 0.7802647948265076, "learning_rate": 1.9863013698630137e-5, "loss": 0.2161, "step": 11500 }, { "epoch": 3.09, "grad_norm": 1.1162070035934448, "learning_rate": 1.9589041095890412e-5, "loss": 0.2211, "step": 11600 }, { "epoch": 3.12, "grad_norm": 1.0273113250732422, "learning_rate": 1.9315068493150686e-5, "loss": 0.2253, "step": 11700 }, { "epoch": 3.15, "grad_norm": 1.0477781295776367, "learning_rate": 1.904109589041096e-5, "loss": 0.2213, "step": 11800 }, { "epoch": 3.17, "grad_norm": 0.9134103655815125, "learning_rate": 1.8767123287671235e-5, "loss": 0.2269, "step": 11900 }, { "epoch": 3.2, "grad_norm": 0.8156262636184692, "learning_rate": 1.8493150684931506e-5, "loss": 0.2245, "step": 12000 }, { "epoch": 3.23, "grad_norm": 0.9004743695259094, "learning_rate": 1.821917808219178e-5, "loss": 0.2254, "step": 12100 }, { "epoch": 3.25, "grad_norm": 0.8386040925979614, "learning_rate": 1.7945205479452055e-5, "loss": 0.2292, "step": 12200 }, { "epoch": 3.28, "grad_norm": 0.9777556657791138, "learning_rate": 1.767123287671233e-5, "loss": 0.2213, "step": 12300 }, { "epoch": 3.31, "grad_norm": 0.7827901244163513, "learning_rate": 1.7397260273972604e-5, "loss": 0.2174, "step": 12400 }, { "epoch": 3.33, "grad_norm": 0.7424948811531067, "learning_rate": 1.7123287671232875e-5, "loss": 0.2199, "step": 12500 }, { "epoch": 3.36, "grad_norm": 0.8807641267776489, "learning_rate": 1.684931506849315e-5, "loss": 0.2204, "step": 12600 }, { "epoch": 3.39, "grad_norm": 0.8479088544845581, "learning_rate": 1.6575342465753428e-5, "loss": 0.2241, "step": 12700 }, { "epoch": 3.41, "grad_norm": 0.9211342334747314, "learning_rate": 1.63013698630137e-5, "loss": 0.2237, "step": 12800 }, { "epoch": 3.44, "grad_norm": 0.8683446645736694, "learning_rate": 1.6027397260273974e-5, "loss": 0.2248, "step": 12900 }, { "epoch": 3.47, "grad_norm": 0.8828756213188171, "learning_rate": 1.5753424657534248e-5, "loss": 0.233, "step": 13000 }, { "epoch": 3.49, "grad_norm": 0.9421214461326599, "learning_rate": 1.5479452054794523e-5, "loss": 0.2294, "step": 13100 }, { "epoch": 3.52, "grad_norm": 0.765132486820221, "learning_rate": 1.5205479452054797e-5, "loss": 0.2277, "step": 13200 }, { "epoch": 3.55, "grad_norm": 0.9406650066375732, "learning_rate": 1.4931506849315068e-5, "loss": 0.217, "step": 13300 }, { "epoch": 3.57, "grad_norm": 1.0174639225006104, "learning_rate": 1.4657534246575344e-5, "loss": 0.2265, "step": 13400 }, { "epoch": 3.6, "grad_norm": 0.826392412185669, "learning_rate": 1.4383561643835617e-5, "loss": 0.222, "step": 13500 }, { "epoch": 3.63, "grad_norm": 0.9821271300315857, "learning_rate": 1.4109589041095892e-5, "loss": 0.2186, "step": 13600 }, { "epoch": 3.65, "grad_norm": 0.8172212839126587, "learning_rate": 1.3835616438356164e-5, "loss": 0.2238, "step": 13700 }, { "epoch": 3.68, "grad_norm": 0.8128436207771301, "learning_rate": 1.3561643835616439e-5, "loss": 0.2168, "step": 13800 }, { "epoch": 3.71, "grad_norm": 0.8061575293540955, "learning_rate": 1.3287671232876714e-5, "loss": 0.2244, "step": 13900 }, { "epoch": 3.73, "grad_norm": 0.8976914882659912, "learning_rate": 1.3013698630136986e-5, "loss": 0.2212, "step": 14000 }, { "epoch": 3.76, "grad_norm": 0.9973928332328796, "learning_rate": 1.273972602739726e-5, "loss": 0.2248, "step": 14100 }, { "epoch": 3.79, "grad_norm": 0.8042004108428955, "learning_rate": 1.2465753424657535e-5, "loss": 0.2178, "step": 14200 }, { "epoch": 3.81, "grad_norm": 0.8282990455627441, "learning_rate": 1.2191780821917808e-5, "loss": 0.2227, "step": 14300 }, { "epoch": 3.84, "grad_norm": 0.6668768525123596, "learning_rate": 1.1917808219178083e-5, "loss": 0.2226, "step": 14400 }, { "epoch": 3.87, "grad_norm": 0.7972692847251892, "learning_rate": 1.1643835616438355e-5, "loss": 0.2193, "step": 14500 }, { "epoch": 3.89, "grad_norm": 0.7637550830841064, "learning_rate": 1.1369863013698632e-5, "loss": 0.2157, "step": 14600 }, { "epoch": 3.92, "grad_norm": 0.8487162590026855, "learning_rate": 1.1095890410958904e-5, "loss": 0.2251, "step": 14700 }, { "epoch": 3.95, "grad_norm": 0.8710606694221497, "learning_rate": 1.0821917808219179e-5, "loss": 0.2153, "step": 14800 }, { "epoch": 3.97, "grad_norm": 0.8085966110229492, "learning_rate": 1.0547945205479452e-5, "loss": 0.2191, "step": 14900 }, { "epoch": 4.0, "grad_norm": 0.94338059425354, "learning_rate": 1.0273972602739726e-5, "loss": 0.2184, "step": 15000 }, { "epoch": 4.03, "grad_norm": 1.4945096969604492, "learning_rate": 1e-5, "loss": 0.1863, "step": 15100 }, { "epoch": 4.05, "grad_norm": 0.9178032279014587, "learning_rate": 9.726027397260275e-6, "loss": 0.1854, "step": 15200 }, { "epoch": 4.08, "grad_norm": 0.8616482615470886, "learning_rate": 9.452054794520548e-6, "loss": 0.1843, "step": 15300 }, { "epoch": 4.11, "grad_norm": 0.9844592213630676, "learning_rate": 9.178082191780823e-6, "loss": 0.1909, "step": 15400 }, { "epoch": 4.13, "grad_norm": 0.7312936186790466, "learning_rate": 8.904109589041095e-6, "loss": 0.1899, "step": 15500 }, { "epoch": 4.16, "grad_norm": 0.9658412933349609, "learning_rate": 8.630136986301372e-6, "loss": 0.1878, "step": 15600 }, { "epoch": 4.19, "grad_norm": 1.0498002767562866, "learning_rate": 8.356164383561644e-6, "loss": 0.1825, "step": 15700 }, { "epoch": 4.21, "grad_norm": 0.7098029255867004, "learning_rate": 8.082191780821919e-6, "loss": 0.1864, "step": 15800 }, { "epoch": 4.24, "grad_norm": 0.9946851134300232, "learning_rate": 7.808219178082192e-6, "loss": 0.1852, "step": 15900 }, { "epoch": 4.27, "grad_norm": 0.9338549375534058, "learning_rate": 7.5342465753424655e-6, "loss": 0.1865, "step": 16000 }, { "epoch": 4.29, "grad_norm": 0.8193784952163696, "learning_rate": 7.260273972602739e-6, "loss": 0.184, "step": 16100 }, { "epoch": 4.32, "grad_norm": 0.9323195815086365, "learning_rate": 6.9863013698630145e-6, "loss": 0.1845, "step": 16200 }, { "epoch": 4.35, "grad_norm": 0.9668224453926086, "learning_rate": 6.712328767123288e-6, "loss": 0.1911, "step": 16300 }, { "epoch": 4.37, "grad_norm": 0.9941351413726807, "learning_rate": 6.438356164383562e-6, "loss": 0.1859, "step": 16400 }, { "epoch": 4.4, "grad_norm": 0.9229924082756042, "learning_rate": 6.1643835616438354e-6, "loss": 0.1861, "step": 16500 }, { "epoch": 4.43, "grad_norm": 0.8792287111282349, "learning_rate": 5.89041095890411e-6, "loss": 0.1903, "step": 16600 }, { "epoch": 4.45, "grad_norm": 0.682725191116333, "learning_rate": 5.616438356164384e-6, "loss": 0.1822, "step": 16700 }, { "epoch": 4.48, "grad_norm": 0.8012785315513611, "learning_rate": 5.342465753424658e-6, "loss": 0.1888, "step": 16800 }, { "epoch": 4.51, "grad_norm": 0.7928184270858765, "learning_rate": 5.068493150684932e-6, "loss": 0.1869, "step": 16900 }, { "epoch": 4.53, "grad_norm": 1.2073571681976318, "learning_rate": 4.7945205479452054e-6, "loss": 0.184, "step": 17000 }, { "epoch": 4.56, "grad_norm": 0.763810396194458, "learning_rate": 4.52054794520548e-6, "loss": 0.1824, "step": 17100 }, { "epoch": 4.59, "grad_norm": 0.8932220935821533, "learning_rate": 4.246575342465754e-6, "loss": 0.1898, "step": 17200 }, { "epoch": 4.61, "grad_norm": 0.7250128984451294, "learning_rate": 3.972602739726028e-6, "loss": 0.1886, "step": 17300 }, { "epoch": 4.64, "grad_norm": 1.0617702007293701, "learning_rate": 3.6986301369863018e-6, "loss": 0.1889, "step": 17400 }, { "epoch": 4.67, "grad_norm": 0.983672022819519, "learning_rate": 3.4246575342465754e-6, "loss": 0.1871, "step": 17500 }, { "epoch": 4.69, "grad_norm": 0.9392043352127075, "learning_rate": 3.1506849315068495e-6, "loss": 0.1869, "step": 17600 }, { "epoch": 4.72, "grad_norm": 0.8135913014411926, "learning_rate": 2.8767123287671236e-6, "loss": 0.1861, "step": 17700 }, { "epoch": 4.75, "grad_norm": 0.7956686615943909, "learning_rate": 2.6027397260273973e-6, "loss": 0.1864, "step": 17800 }, { "epoch": 4.77, "grad_norm": 0.8956461548805237, "learning_rate": 2.3287671232876713e-6, "loss": 0.1889, "step": 17900 }, { "epoch": 4.8, "grad_norm": 0.9515472054481506, "learning_rate": 2.054794520547945e-6, "loss": 0.1871, "step": 18000 }, { "epoch": 4.83, "grad_norm": 0.8886680006980896, "learning_rate": 1.7808219178082193e-6, "loss": 0.187, "step": 18100 }, { "epoch": 4.85, "grad_norm": 0.8525242805480957, "learning_rate": 1.5068493150684932e-6, "loss": 0.1832, "step": 18200 }, { "epoch": 4.88, "grad_norm": 0.9522444009780884, "learning_rate": 1.232876712328767e-6, "loss": 0.186, "step": 18300 }, { "epoch": 4.91, "grad_norm": 0.8611086010932922, "learning_rate": 9.589041095890411e-7, "loss": 0.1855, "step": 18400 }, { "epoch": 4.93, "grad_norm": 0.9658819437026978, "learning_rate": 6.849315068493151e-7, "loss": 0.177, "step": 18500 }, { "epoch": 4.96, "grad_norm": 0.9198510646820068, "learning_rate": 4.1095890410958903e-7, "loss": 0.178, "step": 18600 }, { "epoch": 4.99, "grad_norm": 0.8326091766357422, "learning_rate": 1.36986301369863e-7, "loss": 0.1865, "step": 18700 } ], "logging_steps": 100, "max_steps": 18750, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 6250, "total_flos": 8.12664225792e16, "train_batch_size": 8, "trial_name": null, "trial_params": null }