{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 3748, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.026689797824781477, "grad_norm": 2.147552967071533, "learning_rate": 2.6133333333333334e-06, "loss": 0.7325, "step": 50 }, { "epoch": 0.053379595649562954, "grad_norm": 1.0603501796722412, "learning_rate": 5.28e-06, "loss": 0.3367, "step": 100 }, { "epoch": 0.08006939347434443, "grad_norm": 0.6418544054031372, "learning_rate": 7.946666666666666e-06, "loss": 0.3204, "step": 150 }, { "epoch": 0.10675919129912591, "grad_norm": 0.5255997776985168, "learning_rate": 1.0613333333333334e-05, "loss": 0.3304, "step": 200 }, { "epoch": 0.13344898912390737, "grad_norm": 0.3151794970035553, "learning_rate": 1.3280000000000002e-05, "loss": 0.2894, "step": 250 }, { "epoch": 0.16013878694868885, "grad_norm": 0.8053786754608154, "learning_rate": 1.5946666666666668e-05, "loss": 0.2991, "step": 300 }, { "epoch": 0.18682858477347034, "grad_norm": 0.02903924137353897, "learning_rate": 1.8613333333333334e-05, "loss": 0.3086, "step": 350 }, { "epoch": 0.21351838259825182, "grad_norm": 0.7904114723205566, "learning_rate": 1.9997501717954778e-05, "loss": 0.3291, "step": 400 }, { "epoch": 0.2402081804230333, "grad_norm": 0.41448771953582764, "learning_rate": 1.9976257383527736e-05, "loss": 0.3051, "step": 450 }, { "epoch": 0.26689797824781475, "grad_norm": 0.3972539007663727, "learning_rate": 1.9933381071415265e-05, "loss": 0.3149, "step": 500 }, { "epoch": 0.26689797824781475, "eval_loss": 0.4319009780883789, "eval_runtime": 118.7587, "eval_samples_per_second": 29.185, "eval_steps_per_second": 29.185, "step": 500 }, { "epoch": 0.29358777607259623, "grad_norm": 0.9960196018218994, "learning_rate": 1.9868965752296757e-05, "loss": 0.2993, "step": 550 }, { "epoch": 0.3202775738973777, "grad_norm": 0.4068482518196106, "learning_rate": 1.978315110087108e-05, "loss": 0.289, "step": 600 }, { "epoch": 0.3469673717221592, "grad_norm": 0.16865970194339752, "learning_rate": 1.967612319299347e-05, "loss": 0.3031, "step": 650 }, { "epoch": 0.37365716954694067, "grad_norm": 0.5249218344688416, "learning_rate": 1.954811410219871e-05, "loss": 0.296, "step": 700 }, { "epoch": 0.40034696737172215, "grad_norm": 0.2836828827857971, "learning_rate": 1.9399401396485418e-05, "loss": 0.2986, "step": 750 }, { "epoch": 0.42703676519650363, "grad_norm": 0.024336909875273705, "learning_rate": 1.923030753645258e-05, "loss": 0.2963, "step": 800 }, { "epoch": 0.4537265630212851, "grad_norm": 0.15610457956790924, "learning_rate": 1.9041199176093403e-05, "loss": 0.2939, "step": 850 }, { "epoch": 0.4804163608460666, "grad_norm": 0.3119260370731354, "learning_rate": 1.8832486367762608e-05, "loss": 0.3027, "step": 900 }, { "epoch": 0.5071061586708481, "grad_norm": 1.0931050777435303, "learning_rate": 1.8604621673041056e-05, "loss": 0.3102, "step": 950 }, { "epoch": 0.5337959564956295, "grad_norm": 0.16979913413524628, "learning_rate": 1.8358099181425628e-05, "loss": 0.2789, "step": 1000 }, { "epoch": 0.5337959564956295, "eval_loss": 0.42964276671409607, "eval_runtime": 118.8013, "eval_samples_per_second": 29.175, "eval_steps_per_second": 29.175, "step": 1000 }, { "epoch": 0.560485754320411, "grad_norm": 0.5423979163169861, "learning_rate": 1.809345343897229e-05, "loss": 0.2991, "step": 1050 }, { "epoch": 0.5871755521451925, "grad_norm": 0.9803158640861511, "learning_rate": 1.7811258289215265e-05, "loss": 0.2987, "step": 1100 }, { "epoch": 0.613865349969974, "grad_norm": 0.25261151790618896, "learning_rate": 1.7512125628875722e-05, "loss": 0.3019, "step": 1150 }, { "epoch": 0.6405551477947554, "grad_norm": 0.34850814938545227, "learning_rate": 1.7196704081057955e-05, "loss": 0.2888, "step": 1200 }, { "epoch": 0.667244945619537, "grad_norm": 0.32236334681510925, "learning_rate": 1.6865677588810112e-05, "loss": 0.2895, "step": 1250 }, { "epoch": 0.6939347434443184, "grad_norm": 0.32494327425956726, "learning_rate": 1.6519763932099e-05, "loss": 0.2983, "step": 1300 }, { "epoch": 0.7206245412690999, "grad_norm": 0.022694284096360207, "learning_rate": 1.615971317141477e-05, "loss": 0.2883, "step": 1350 }, { "epoch": 0.7473143390938813, "grad_norm": 0.02763156034052372, "learning_rate": 1.578630602138029e-05, "loss": 0.3111, "step": 1400 }, { "epoch": 0.7740041369186629, "grad_norm": 0.4600750207901001, "learning_rate": 1.54003521578917e-05, "loss": 0.3019, "step": 1450 }, { "epoch": 0.8006939347434443, "grad_norm": 0.024699728935956955, "learning_rate": 1.5002688462460931e-05, "loss": 0.3074, "step": 1500 }, { "epoch": 0.8006939347434443, "eval_loss": 0.42814743518829346, "eval_runtime": 118.8349, "eval_samples_per_second": 29.167, "eval_steps_per_second": 29.167, "step": 1500 }, { "epoch": 0.8273837325682258, "grad_norm": 0.43051406741142273, "learning_rate": 1.459417720756705e-05, "loss": 0.2929, "step": 1550 }, { "epoch": 0.8540735303930073, "grad_norm": 0.03149699047207832, "learning_rate": 1.4175704186951178e-05, "loss": 0.2858, "step": 1600 }, { "epoch": 0.8807633282177888, "grad_norm": 0.30078360438346863, "learning_rate": 1.3748176794909173e-05, "loss": 0.296, "step": 1650 }, { "epoch": 0.9074531260425702, "grad_norm": 0.2556113302707672, "learning_rate": 1.3312522058746883e-05, "loss": 0.2844, "step": 1700 }, { "epoch": 0.9341429238673518, "grad_norm": 0.09224811941385269, "learning_rate": 1.2869684628664158e-05, "loss": 0.2997, "step": 1750 }, { "epoch": 0.9608327216921332, "grad_norm": 0.4570249319076538, "learning_rate": 1.2420624729426419e-05, "loss": 0.286, "step": 1800 }, { "epoch": 0.9875225195169146, "grad_norm": 0.013208149001002312, "learning_rate": 1.1966316078265114e-05, "loss": 0.2889, "step": 1850 }, { "epoch": 1.0138786948688863, "grad_norm": 0.049777038395404816, "learning_rate": 1.150774377352188e-05, "loss": 0.2915, "step": 1900 }, { "epoch": 1.0405684926936678, "grad_norm": 0.019075842574238777, "learning_rate": 1.1045902158614493e-05, "loss": 0.2997, "step": 1950 }, { "epoch": 1.0672582905184493, "grad_norm": 0.3245074152946472, "learning_rate": 1.0581792665956311e-05, "loss": 0.2819, "step": 2000 }, { "epoch": 1.0672582905184493, "eval_loss": 0.4269392490386963, "eval_runtime": 118.8174, "eval_samples_per_second": 29.171, "eval_steps_per_second": 29.171, "step": 2000 }, { "epoch": 1.0939480883432309, "grad_norm": 0.1012788936495781, "learning_rate": 1.0116421645504322e-05, "loss": 0.3026, "step": 2050 }, { "epoch": 1.1206378861680122, "grad_norm": 0.005864244420081377, "learning_rate": 9.650798182644238e-06, "loss": 0.2996, "step": 2100 }, { "epoch": 1.1473276839927937, "grad_norm": 0.20553366839885712, "learning_rate": 9.185931910144259e-06, "loss": 0.2947, "step": 2150 }, { "epoch": 1.1740174818175753, "grad_norm": 0.1598707139492035, "learning_rate": 8.722830818921908e-06, "loss": 0.3082, "step": 2200 }, { "epoch": 1.2007072796423568, "grad_norm": 0.013660128228366375, "learning_rate": 8.262499072370962e-06, "loss": 0.2808, "step": 2250 }, { "epoch": 1.227397077467138, "grad_norm": 0.2775779068470001, "learning_rate": 7.805934828987778e-06, "loss": 0.2956, "step": 2300 }, { "epoch": 1.2540868752919196, "grad_norm": 0.11179913580417633, "learning_rate": 7.354128078018343e-06, "loss": 0.2766, "step": 2350 }, { "epoch": 1.2807766731167012, "grad_norm": 0.013183626346290112, "learning_rate": 6.908058492819043e-06, "loss": 0.3051, "step": 2400 }, { "epoch": 1.3074664709414825, "grad_norm": 0.030972259119153023, "learning_rate": 6.468693306585873e-06, "loss": 0.2966, "step": 2450 }, { "epoch": 1.334156268766264, "grad_norm": 0.004325371701270342, "learning_rate": 6.036985215058232e-06, "loss": 0.2993, "step": 2500 }, { "epoch": 1.334156268766264, "eval_loss": 0.4272440969944, "eval_runtime": 118.9153, "eval_samples_per_second": 29.147, "eval_steps_per_second": 29.147, "step": 2500 }, { "epoch": 1.3608460665910456, "grad_norm": 0.005641893949359655, "learning_rate": 5.613870310744911e-06, "loss": 0.2827, "step": 2550 }, { "epoch": 1.387535864415827, "grad_norm": 0.06420216709375381, "learning_rate": 5.20026605315167e-06, "loss": 0.2675, "step": 2600 }, { "epoch": 1.4142256622406086, "grad_norm": 0.7545241117477417, "learning_rate": 4.797069279411617e-06, "loss": 0.3023, "step": 2650 }, { "epoch": 1.44091546006539, "grad_norm": 0.01520050223916769, "learning_rate": 4.405154259631967e-06, "loss": 0.2743, "step": 2700 }, { "epoch": 1.4676052578901715, "grad_norm": 0.11380592733621597, "learning_rate": 4.0253708011739915e-06, "loss": 0.2878, "step": 2750 }, { "epoch": 1.494295055714953, "grad_norm": 0.013577022589743137, "learning_rate": 3.6585424059766296e-06, "loss": 0.2902, "step": 2800 }, { "epoch": 1.5209848535397343, "grad_norm": 0.020401863381266594, "learning_rate": 3.3054644849193495e-06, "loss": 0.2884, "step": 2850 }, { "epoch": 1.5476746513645159, "grad_norm": 0.006767068989574909, "learning_rate": 2.966902633096178e-06, "loss": 0.2952, "step": 2900 }, { "epoch": 1.5743644491892974, "grad_norm": 0.01109298225492239, "learning_rate": 2.643590969740637e-06, "loss": 0.2839, "step": 2950 }, { "epoch": 1.601054247014079, "grad_norm": 0.014958917163312435, "learning_rate": 2.33623054640124e-06, "loss": 0.2909, "step": 3000 }, { "epoch": 1.601054247014079, "eval_loss": 0.42682579159736633, "eval_runtime": 118.7962, "eval_samples_per_second": 29.176, "eval_steps_per_second": 29.176, "step": 3000 }, { "epoch": 1.6277440448388605, "grad_norm": 0.26821330189704895, "learning_rate": 2.0454878268191925e-06, "loss": 0.2818, "step": 3050 }, { "epoch": 1.6544338426636418, "grad_norm": 0.21184305846691132, "learning_rate": 1.7719932418044105e-06, "loss": 0.2839, "step": 3100 }, { "epoch": 1.6811236404884233, "grad_norm": 0.1029989942908287, "learning_rate": 1.516339822243398e-06, "loss": 0.2828, "step": 3150 }, { "epoch": 1.7078134383132046, "grad_norm": 0.03526095300912857, "learning_rate": 1.2790819132030974e-06, "loss": 0.2851, "step": 3200 }, { "epoch": 1.7345032361379862, "grad_norm": 0.004737787880003452, "learning_rate": 1.0607339719190002e-06, "loss": 0.2797, "step": 3250 }, { "epoch": 1.7611930339627677, "grad_norm": 0.010323897004127502, "learning_rate": 8.617694522738518e-07, "loss": 0.274, "step": 3300 }, { "epoch": 1.7878828317875493, "grad_norm": 0.6822851896286011, "learning_rate": 6.826197781858324e-07, "loss": 0.2727, "step": 3350 }, { "epoch": 1.8145726296123308, "grad_norm": 0.007676210254430771, "learning_rate": 5.236734081322281e-07, "loss": 0.2932, "step": 3400 }, { "epoch": 1.8412624274371123, "grad_norm": 0.45130208134651184, "learning_rate": 3.852749928370536e-07, "loss": 0.2885, "step": 3450 }, { "epoch": 1.8679522252618936, "grad_norm": 0.007974912412464619, "learning_rate": 2.677246279490309e-07, "loss": 0.2996, "step": 3500 }, { "epoch": 1.8679522252618936, "eval_loss": 0.42648929357528687, "eval_runtime": 118.9628, "eval_samples_per_second": 29.135, "eval_steps_per_second": 29.135, "step": 3500 }, { "epoch": 1.894642023086675, "grad_norm": 0.20535898208618164, "learning_rate": 1.7127720333040442e-07, "loss": 0.2885, "step": 3550 }, { "epoch": 1.9213318209114565, "grad_norm": 0.005906387697905302, "learning_rate": 9.614185036752155e-08, "loss": 0.2867, "step": 3600 }, { "epoch": 1.948021618736238, "grad_norm": 0.1817421317100525, "learning_rate": 4.248148850162892e-08, "loss": 0.273, "step": 3650 }, { "epoch": 1.9747114165610196, "grad_norm": 0.4754043221473694, "learning_rate": 1.041247196316264e-08, "loss": 0.3001, "step": 3700 }, { "epoch": 2.0, "step": 3748, "total_flos": 3.7463946988098355e+17, "train_loss": 0.3005621947244748, "train_runtime": 4284.8367, "train_samples_per_second": 6.995, "train_steps_per_second": 0.875 } ], "logging_steps": 50, "max_steps": 3748, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.7463946988098355e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }