| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 3748, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.026689797824781477, |
| "grad_norm": 2.147552967071533, |
| "learning_rate": 2.6133333333333334e-06, |
| "loss": 0.7325, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.053379595649562954, |
| "grad_norm": 1.0603501796722412, |
| "learning_rate": 5.28e-06, |
| "loss": 0.3367, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.08006939347434443, |
| "grad_norm": 0.6418544054031372, |
| "learning_rate": 7.946666666666666e-06, |
| "loss": 0.3204, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.10675919129912591, |
| "grad_norm": 0.5255997776985168, |
| "learning_rate": 1.0613333333333334e-05, |
| "loss": 0.3304, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.13344898912390737, |
| "grad_norm": 0.3151794970035553, |
| "learning_rate": 1.3280000000000002e-05, |
| "loss": 0.2894, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.16013878694868885, |
| "grad_norm": 0.8053786754608154, |
| "learning_rate": 1.5946666666666668e-05, |
| "loss": 0.2991, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.18682858477347034, |
| "grad_norm": 0.02903924137353897, |
| "learning_rate": 1.8613333333333334e-05, |
| "loss": 0.3086, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.21351838259825182, |
| "grad_norm": 0.7904114723205566, |
| "learning_rate": 1.9997501717954778e-05, |
| "loss": 0.3291, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.2402081804230333, |
| "grad_norm": 0.41448771953582764, |
| "learning_rate": 1.9976257383527736e-05, |
| "loss": 0.3051, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.26689797824781475, |
| "grad_norm": 0.3972539007663727, |
| "learning_rate": 1.9933381071415265e-05, |
| "loss": 0.3149, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.26689797824781475, |
| "eval_loss": 0.4319009780883789, |
| "eval_runtime": 118.7587, |
| "eval_samples_per_second": 29.185, |
| "eval_steps_per_second": 29.185, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.29358777607259623, |
| "grad_norm": 0.9960196018218994, |
| "learning_rate": 1.9868965752296757e-05, |
| "loss": 0.2993, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.3202775738973777, |
| "grad_norm": 0.4068482518196106, |
| "learning_rate": 1.978315110087108e-05, |
| "loss": 0.289, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.3469673717221592, |
| "grad_norm": 0.16865970194339752, |
| "learning_rate": 1.967612319299347e-05, |
| "loss": 0.3031, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.37365716954694067, |
| "grad_norm": 0.5249218344688416, |
| "learning_rate": 1.954811410219871e-05, |
| "loss": 0.296, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.40034696737172215, |
| "grad_norm": 0.2836828827857971, |
| "learning_rate": 1.9399401396485418e-05, |
| "loss": 0.2986, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.42703676519650363, |
| "grad_norm": 0.024336909875273705, |
| "learning_rate": 1.923030753645258e-05, |
| "loss": 0.2963, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.4537265630212851, |
| "grad_norm": 0.15610457956790924, |
| "learning_rate": 1.9041199176093403e-05, |
| "loss": 0.2939, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.4804163608460666, |
| "grad_norm": 0.3119260370731354, |
| "learning_rate": 1.8832486367762608e-05, |
| "loss": 0.3027, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.5071061586708481, |
| "grad_norm": 1.0931050777435303, |
| "learning_rate": 1.8604621673041056e-05, |
| "loss": 0.3102, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.5337959564956295, |
| "grad_norm": 0.16979913413524628, |
| "learning_rate": 1.8358099181425628e-05, |
| "loss": 0.2789, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.5337959564956295, |
| "eval_loss": 0.42964276671409607, |
| "eval_runtime": 118.8013, |
| "eval_samples_per_second": 29.175, |
| "eval_steps_per_second": 29.175, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.560485754320411, |
| "grad_norm": 0.5423979163169861, |
| "learning_rate": 1.809345343897229e-05, |
| "loss": 0.2991, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.5871755521451925, |
| "grad_norm": 0.9803158640861511, |
| "learning_rate": 1.7811258289215265e-05, |
| "loss": 0.2987, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.613865349969974, |
| "grad_norm": 0.25261151790618896, |
| "learning_rate": 1.7512125628875722e-05, |
| "loss": 0.3019, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.6405551477947554, |
| "grad_norm": 0.34850814938545227, |
| "learning_rate": 1.7196704081057955e-05, |
| "loss": 0.2888, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.667244945619537, |
| "grad_norm": 0.32236334681510925, |
| "learning_rate": 1.6865677588810112e-05, |
| "loss": 0.2895, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.6939347434443184, |
| "grad_norm": 0.32494327425956726, |
| "learning_rate": 1.6519763932099e-05, |
| "loss": 0.2983, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.7206245412690999, |
| "grad_norm": 0.022694284096360207, |
| "learning_rate": 1.615971317141477e-05, |
| "loss": 0.2883, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.7473143390938813, |
| "grad_norm": 0.02763156034052372, |
| "learning_rate": 1.578630602138029e-05, |
| "loss": 0.3111, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.7740041369186629, |
| "grad_norm": 0.4600750207901001, |
| "learning_rate": 1.54003521578917e-05, |
| "loss": 0.3019, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.8006939347434443, |
| "grad_norm": 0.024699728935956955, |
| "learning_rate": 1.5002688462460931e-05, |
| "loss": 0.3074, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.8006939347434443, |
| "eval_loss": 0.42814743518829346, |
| "eval_runtime": 118.8349, |
| "eval_samples_per_second": 29.167, |
| "eval_steps_per_second": 29.167, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.8273837325682258, |
| "grad_norm": 0.43051406741142273, |
| "learning_rate": 1.459417720756705e-05, |
| "loss": 0.2929, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.8540735303930073, |
| "grad_norm": 0.03149699047207832, |
| "learning_rate": 1.4175704186951178e-05, |
| "loss": 0.2858, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.8807633282177888, |
| "grad_norm": 0.30078360438346863, |
| "learning_rate": 1.3748176794909173e-05, |
| "loss": 0.296, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.9074531260425702, |
| "grad_norm": 0.2556113302707672, |
| "learning_rate": 1.3312522058746883e-05, |
| "loss": 0.2844, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.9341429238673518, |
| "grad_norm": 0.09224811941385269, |
| "learning_rate": 1.2869684628664158e-05, |
| "loss": 0.2997, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.9608327216921332, |
| "grad_norm": 0.4570249319076538, |
| "learning_rate": 1.2420624729426419e-05, |
| "loss": 0.286, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.9875225195169146, |
| "grad_norm": 0.013208149001002312, |
| "learning_rate": 1.1966316078265114e-05, |
| "loss": 0.2889, |
| "step": 1850 |
| }, |
| { |
| "epoch": 1.0138786948688863, |
| "grad_norm": 0.049777038395404816, |
| "learning_rate": 1.150774377352188e-05, |
| "loss": 0.2915, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.0405684926936678, |
| "grad_norm": 0.019075842574238777, |
| "learning_rate": 1.1045902158614493e-05, |
| "loss": 0.2997, |
| "step": 1950 |
| }, |
| { |
| "epoch": 1.0672582905184493, |
| "grad_norm": 0.3245074152946472, |
| "learning_rate": 1.0581792665956311e-05, |
| "loss": 0.2819, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.0672582905184493, |
| "eval_loss": 0.4269392490386963, |
| "eval_runtime": 118.8174, |
| "eval_samples_per_second": 29.171, |
| "eval_steps_per_second": 29.171, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.0939480883432309, |
| "grad_norm": 0.1012788936495781, |
| "learning_rate": 1.0116421645504322e-05, |
| "loss": 0.3026, |
| "step": 2050 |
| }, |
| { |
| "epoch": 1.1206378861680122, |
| "grad_norm": 0.005864244420081377, |
| "learning_rate": 9.650798182644238e-06, |
| "loss": 0.2996, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.1473276839927937, |
| "grad_norm": 0.20553366839885712, |
| "learning_rate": 9.185931910144259e-06, |
| "loss": 0.2947, |
| "step": 2150 |
| }, |
| { |
| "epoch": 1.1740174818175753, |
| "grad_norm": 0.1598707139492035, |
| "learning_rate": 8.722830818921908e-06, |
| "loss": 0.3082, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.2007072796423568, |
| "grad_norm": 0.013660128228366375, |
| "learning_rate": 8.262499072370962e-06, |
| "loss": 0.2808, |
| "step": 2250 |
| }, |
| { |
| "epoch": 1.227397077467138, |
| "grad_norm": 0.2775779068470001, |
| "learning_rate": 7.805934828987778e-06, |
| "loss": 0.2956, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.2540868752919196, |
| "grad_norm": 0.11179913580417633, |
| "learning_rate": 7.354128078018343e-06, |
| "loss": 0.2766, |
| "step": 2350 |
| }, |
| { |
| "epoch": 1.2807766731167012, |
| "grad_norm": 0.013183626346290112, |
| "learning_rate": 6.908058492819043e-06, |
| "loss": 0.3051, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.3074664709414825, |
| "grad_norm": 0.030972259119153023, |
| "learning_rate": 6.468693306585873e-06, |
| "loss": 0.2966, |
| "step": 2450 |
| }, |
| { |
| "epoch": 1.334156268766264, |
| "grad_norm": 0.004325371701270342, |
| "learning_rate": 6.036985215058232e-06, |
| "loss": 0.2993, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.334156268766264, |
| "eval_loss": 0.4272440969944, |
| "eval_runtime": 118.9153, |
| "eval_samples_per_second": 29.147, |
| "eval_steps_per_second": 29.147, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.3608460665910456, |
| "grad_norm": 0.005641893949359655, |
| "learning_rate": 5.613870310744911e-06, |
| "loss": 0.2827, |
| "step": 2550 |
| }, |
| { |
| "epoch": 1.387535864415827, |
| "grad_norm": 0.06420216709375381, |
| "learning_rate": 5.20026605315167e-06, |
| "loss": 0.2675, |
| "step": 2600 |
| }, |
| { |
| "epoch": 1.4142256622406086, |
| "grad_norm": 0.7545241117477417, |
| "learning_rate": 4.797069279411617e-06, |
| "loss": 0.3023, |
| "step": 2650 |
| }, |
| { |
| "epoch": 1.44091546006539, |
| "grad_norm": 0.01520050223916769, |
| "learning_rate": 4.405154259631967e-06, |
| "loss": 0.2743, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.4676052578901715, |
| "grad_norm": 0.11380592733621597, |
| "learning_rate": 4.0253708011739915e-06, |
| "loss": 0.2878, |
| "step": 2750 |
| }, |
| { |
| "epoch": 1.494295055714953, |
| "grad_norm": 0.013577022589743137, |
| "learning_rate": 3.6585424059766296e-06, |
| "loss": 0.2902, |
| "step": 2800 |
| }, |
| { |
| "epoch": 1.5209848535397343, |
| "grad_norm": 0.020401863381266594, |
| "learning_rate": 3.3054644849193495e-06, |
| "loss": 0.2884, |
| "step": 2850 |
| }, |
| { |
| "epoch": 1.5476746513645159, |
| "grad_norm": 0.006767068989574909, |
| "learning_rate": 2.966902633096178e-06, |
| "loss": 0.2952, |
| "step": 2900 |
| }, |
| { |
| "epoch": 1.5743644491892974, |
| "grad_norm": 0.01109298225492239, |
| "learning_rate": 2.643590969740637e-06, |
| "loss": 0.2839, |
| "step": 2950 |
| }, |
| { |
| "epoch": 1.601054247014079, |
| "grad_norm": 0.014958917163312435, |
| "learning_rate": 2.33623054640124e-06, |
| "loss": 0.2909, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.601054247014079, |
| "eval_loss": 0.42682579159736633, |
| "eval_runtime": 118.7962, |
| "eval_samples_per_second": 29.176, |
| "eval_steps_per_second": 29.176, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.6277440448388605, |
| "grad_norm": 0.26821330189704895, |
| "learning_rate": 2.0454878268191925e-06, |
| "loss": 0.2818, |
| "step": 3050 |
| }, |
| { |
| "epoch": 1.6544338426636418, |
| "grad_norm": 0.21184305846691132, |
| "learning_rate": 1.7719932418044105e-06, |
| "loss": 0.2839, |
| "step": 3100 |
| }, |
| { |
| "epoch": 1.6811236404884233, |
| "grad_norm": 0.1029989942908287, |
| "learning_rate": 1.516339822243398e-06, |
| "loss": 0.2828, |
| "step": 3150 |
| }, |
| { |
| "epoch": 1.7078134383132046, |
| "grad_norm": 0.03526095300912857, |
| "learning_rate": 1.2790819132030974e-06, |
| "loss": 0.2851, |
| "step": 3200 |
| }, |
| { |
| "epoch": 1.7345032361379862, |
| "grad_norm": 0.004737787880003452, |
| "learning_rate": 1.0607339719190002e-06, |
| "loss": 0.2797, |
| "step": 3250 |
| }, |
| { |
| "epoch": 1.7611930339627677, |
| "grad_norm": 0.010323897004127502, |
| "learning_rate": 8.617694522738518e-07, |
| "loss": 0.274, |
| "step": 3300 |
| }, |
| { |
| "epoch": 1.7878828317875493, |
| "grad_norm": 0.6822851896286011, |
| "learning_rate": 6.826197781858324e-07, |
| "loss": 0.2727, |
| "step": 3350 |
| }, |
| { |
| "epoch": 1.8145726296123308, |
| "grad_norm": 0.007676210254430771, |
| "learning_rate": 5.236734081322281e-07, |
| "loss": 0.2932, |
| "step": 3400 |
| }, |
| { |
| "epoch": 1.8412624274371123, |
| "grad_norm": 0.45130208134651184, |
| "learning_rate": 3.852749928370536e-07, |
| "loss": 0.2885, |
| "step": 3450 |
| }, |
| { |
| "epoch": 1.8679522252618936, |
| "grad_norm": 0.007974912412464619, |
| "learning_rate": 2.677246279490309e-07, |
| "loss": 0.2996, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.8679522252618936, |
| "eval_loss": 0.42648929357528687, |
| "eval_runtime": 118.9628, |
| "eval_samples_per_second": 29.135, |
| "eval_steps_per_second": 29.135, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.894642023086675, |
| "grad_norm": 0.20535898208618164, |
| "learning_rate": 1.7127720333040442e-07, |
| "loss": 0.2885, |
| "step": 3550 |
| }, |
| { |
| "epoch": 1.9213318209114565, |
| "grad_norm": 0.005906387697905302, |
| "learning_rate": 9.614185036752155e-08, |
| "loss": 0.2867, |
| "step": 3600 |
| }, |
| { |
| "epoch": 1.948021618736238, |
| "grad_norm": 0.1817421317100525, |
| "learning_rate": 4.248148850162892e-08, |
| "loss": 0.273, |
| "step": 3650 |
| }, |
| { |
| "epoch": 1.9747114165610196, |
| "grad_norm": 0.4754043221473694, |
| "learning_rate": 1.041247196316264e-08, |
| "loss": 0.3001, |
| "step": 3700 |
| }, |
| { |
| "epoch": 2.0, |
| "step": 3748, |
| "total_flos": 3.7463946988098355e+17, |
| "train_loss": 0.3005621947244748, |
| "train_runtime": 4284.8367, |
| "train_samples_per_second": 6.995, |
| "train_steps_per_second": 0.875 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 3748, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.7463946988098355e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|