| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 50, |
| "global_step": 744, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.026899798251513115, |
| "grad_norm": 29.746873221105215, |
| "learning_rate": 1.894736842105263e-06, |
| "loss": 1.8652, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.05379959650302623, |
| "grad_norm": 2.6569803505324483, |
| "learning_rate": 4e-06, |
| "loss": 0.9876, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.08069939475453934, |
| "grad_norm": 1.866327801527443, |
| "learning_rate": 6.105263157894737e-06, |
| "loss": 0.7748, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.10759919300605246, |
| "grad_norm": 1.5350711741145153, |
| "learning_rate": 7.999960397771768e-06, |
| "loss": 0.7036, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.13449899125756556, |
| "grad_norm": 1.4235434017069168, |
| "learning_rate": 7.995209079154332e-06, |
| "loss": 0.6701, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.13449899125756556, |
| "eval_loss": 0.6571411490440369, |
| "eval_runtime": 20.3665, |
| "eval_samples_per_second": 23.863, |
| "eval_steps_per_second": 0.786, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.16139878950907868, |
| "grad_norm": 1.5470132855221335, |
| "learning_rate": 7.982548093693699e-06, |
| "loss": 0.6427, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.1882985877605918, |
| "grad_norm": 1.4003108383520437, |
| "learning_rate": 7.962002507456483e-06, |
| "loss": 0.6309, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.21519838601210492, |
| "grad_norm": 1.411199665593455, |
| "learning_rate": 7.933612996347003e-06, |
| "loss": 0.6348, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.242098184263618, |
| "grad_norm": 1.3778290968625495, |
| "learning_rate": 7.897435765577615e-06, |
| "loss": 0.6165, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.26899798251513113, |
| "grad_norm": 1.4467043447342405, |
| "learning_rate": 7.853542438394323e-06, |
| "loss": 0.5984, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.26899798251513113, |
| "eval_loss": 0.6147489547729492, |
| "eval_runtime": 20.4764, |
| "eval_samples_per_second": 23.735, |
| "eval_steps_per_second": 0.781, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.29589778076664425, |
| "grad_norm": 1.4148215321149555, |
| "learning_rate": 7.802019914277922e-06, |
| "loss": 0.6115, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.32279757901815737, |
| "grad_norm": 1.566581827874729, |
| "learning_rate": 7.742970196901463e-06, |
| "loss": 0.6062, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.3496973772696705, |
| "grad_norm": 1.3254884536596998, |
| "learning_rate": 7.676510192184609e-06, |
| "loss": 0.6006, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.3765971755211836, |
| "grad_norm": 1.447517875694622, |
| "learning_rate": 7.602771476844694e-06, |
| "loss": 0.5898, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.4034969737726967, |
| "grad_norm": 1.3445376199324792, |
| "learning_rate": 7.5219000379027296e-06, |
| "loss": 0.5934, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.4034969737726967, |
| "eval_loss": 0.5962715148925781, |
| "eval_runtime": 20.1716, |
| "eval_samples_per_second": 24.093, |
| "eval_steps_per_second": 0.793, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.43039677202420984, |
| "grad_norm": 1.3246907469950613, |
| "learning_rate": 7.434055983660057e-06, |
| "loss": 0.5993, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.45729657027572296, |
| "grad_norm": 1.3467258206153316, |
| "learning_rate": 7.339413226717854e-06, |
| "loss": 0.5915, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.484196368527236, |
| "grad_norm": 1.458985018698563, |
| "learning_rate": 7.23815913966707e-06, |
| "loss": 0.5881, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.5110961667787491, |
| "grad_norm": 1.2485290337068158, |
| "learning_rate": 7.130494184130416e-06, |
| "loss": 0.5887, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.5379959650302623, |
| "grad_norm": 1.3124694415167568, |
| "learning_rate": 7.016631513890864e-06, |
| "loss": 0.5855, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.5379959650302623, |
| "eval_loss": 0.5851770043373108, |
| "eval_runtime": 20.3049, |
| "eval_samples_per_second": 23.935, |
| "eval_steps_per_second": 0.788, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.5648957632817754, |
| "grad_norm": 1.4701321084658303, |
| "learning_rate": 6.896796552892348e-06, |
| "loss": 0.5833, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.5917955615332885, |
| "grad_norm": 1.4050016198469426, |
| "learning_rate": 6.771226548948162e-06, |
| "loss": 0.5812, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.6186953597848016, |
| "grad_norm": 1.4295194453088238, |
| "learning_rate": 6.64017010404058e-06, |
| "loss": 0.5822, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.6455951580363147, |
| "grad_norm": 7.967159671444082, |
| "learning_rate": 6.503886682141661e-06, |
| "loss": 0.573, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.6724949562878278, |
| "grad_norm": 1.2442332016904551, |
| "learning_rate": 6.3626460955295895e-06, |
| "loss": 0.5837, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.6724949562878278, |
| "eval_loss": 0.5775083303451538, |
| "eval_runtime": 20.6713, |
| "eval_samples_per_second": 23.511, |
| "eval_steps_per_second": 0.774, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.699394754539341, |
| "grad_norm": 1.3850365164976395, |
| "learning_rate": 6.2167279706175765e-06, |
| "loss": 0.5726, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.7262945527908541, |
| "grad_norm": 1.2304207907101277, |
| "learning_rate": 6.066421194352859e-06, |
| "loss": 0.5565, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.7531943510423672, |
| "grad_norm": 1.2714930735972703, |
| "learning_rate": 5.912023342281789e-06, |
| "loss": 0.5641, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.7800941492938803, |
| "grad_norm": 1.460856311527336, |
| "learning_rate": 5.753840089413357e-06, |
| "loss": 0.566, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.8069939475453934, |
| "grad_norm": 1.5514244928702674, |
| "learning_rate": 5.592184605047483e-06, |
| "loss": 0.557, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.8069939475453934, |
| "eval_loss": 0.5692653656005859, |
| "eval_runtime": 20.3766, |
| "eval_samples_per_second": 23.851, |
| "eval_steps_per_second": 0.785, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.8338937457969066, |
| "grad_norm": 1.360092781672721, |
| "learning_rate": 5.427376932766216e-06, |
| "loss": 0.5657, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.8607935440484197, |
| "grad_norm": 1.5946830149551399, |
| "learning_rate": 5.259743356815289e-06, |
| "loss": 0.574, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.8876933422999328, |
| "grad_norm": 1.2977931263056453, |
| "learning_rate": 5.089615756130505e-06, |
| "loss": 0.5549, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.9145931405514459, |
| "grad_norm": 1.7677476514254604, |
| "learning_rate": 4.917330947287818e-06, |
| "loss": 0.5546, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.9414929388029589, |
| "grad_norm": 1.365159498913211, |
| "learning_rate": 4.743230017677918e-06, |
| "loss": 0.5525, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.9414929388029589, |
| "eval_loss": 0.5623395442962646, |
| "eval_runtime": 20.4098, |
| "eval_samples_per_second": 23.812, |
| "eval_steps_per_second": 0.784, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.968392737054472, |
| "grad_norm": 1.4020192853941864, |
| "learning_rate": 4.567657650225538e-06, |
| "loss": 0.5491, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.9952925353059852, |
| "grad_norm": 1.2810961882383074, |
| "learning_rate": 4.390961440990333e-06, |
| "loss": 0.5617, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.0215198386012105, |
| "grad_norm": 1.4422549713674222, |
| "learning_rate": 4.213491211000394e-06, |
| "loss": 0.4705, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.0484196368527237, |
| "grad_norm": 1.339607087215748, |
| "learning_rate": 4.035598313680784e-06, |
| "loss": 0.472, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.0753194351042368, |
| "grad_norm": 1.3004229760969053, |
| "learning_rate": 3.8576349392482585e-06, |
| "loss": 0.4579, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.0753194351042368, |
| "eval_loss": 0.5656763315200806, |
| "eval_runtime": 20.3701, |
| "eval_samples_per_second": 23.858, |
| "eval_steps_per_second": 0.785, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.10221923335575, |
| "grad_norm": 1.2705614543341326, |
| "learning_rate": 3.67995341744931e-06, |
| "loss": 0.4475, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.129119031607263, |
| "grad_norm": 1.3779121525385867, |
| "learning_rate": 3.5029055200219857e-06, |
| "loss": 0.4596, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.1560188298587761, |
| "grad_norm": 1.3453138554439108, |
| "learning_rate": 3.326841764262423e-06, |
| "loss": 0.4427, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.1829186281102892, |
| "grad_norm": 1.3661390821897004, |
| "learning_rate": 3.1521107190749343e-06, |
| "loss": 0.4531, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.2098184263618024, |
| "grad_norm": 1.283068671688067, |
| "learning_rate": 2.9790583148794834e-06, |
| "loss": 0.4454, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.2098184263618024, |
| "eval_loss": 0.5635669231414795, |
| "eval_runtime": 20.4014, |
| "eval_samples_per_second": 23.822, |
| "eval_steps_per_second": 0.784, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.2367182246133155, |
| "grad_norm": 1.387077223080061, |
| "learning_rate": 2.808027158742806e-06, |
| "loss": 0.4504, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.2636180228648284, |
| "grad_norm": 1.3934948532441245, |
| "learning_rate": 2.6393558560890605e-06, |
| "loss": 0.4372, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.2905178211163415, |
| "grad_norm": 1.3508965770850627, |
| "learning_rate": 2.4733783403328845e-06, |
| "loss": 0.4459, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.3174176193678546, |
| "grad_norm": 1.3563554813682979, |
| "learning_rate": 2.3104232117620433e-06, |
| "loss": 0.4511, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.3443174176193677, |
| "grad_norm": 1.2997155659311739, |
| "learning_rate": 2.150813086978535e-06, |
| "loss": 0.443, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.3443174176193677, |
| "eval_loss": 0.5615652799606323, |
| "eval_runtime": 20.1025, |
| "eval_samples_per_second": 24.176, |
| "eval_steps_per_second": 0.796, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.3712172158708809, |
| "grad_norm": 1.2836148832407157, |
| "learning_rate": 1.9948639601861227e-06, |
| "loss": 0.4511, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.398117014122394, |
| "grad_norm": 1.3107604919378975, |
| "learning_rate": 1.8428845775888169e-06, |
| "loss": 0.4412, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.425016812373907, |
| "grad_norm": 1.317635662287699, |
| "learning_rate": 1.6951758261388555e-06, |
| "loss": 0.4506, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.4519166106254202, |
| "grad_norm": 1.2801225745819735, |
| "learning_rate": 1.5520301378443373e-06, |
| "loss": 0.4421, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.4788164088769333, |
| "grad_norm": 1.3520095118040596, |
| "learning_rate": 1.4137309108158554e-06, |
| "loss": 0.4443, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.4788164088769333, |
| "eval_loss": 0.5601685643196106, |
| "eval_runtime": 20.2436, |
| "eval_samples_per_second": 24.008, |
| "eval_steps_per_second": 0.79, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.5057162071284464, |
| "grad_norm": 1.3661516182508175, |
| "learning_rate": 1.2805519481983216e-06, |
| "loss": 0.4397, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.5326160053799596, |
| "grad_norm": 1.2619493983661074, |
| "learning_rate": 1.1527569160988053e-06, |
| "loss": 0.4517, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.5595158036314727, |
| "grad_norm": 1.3136850406090343, |
| "learning_rate": 1.0305988215835468e-06, |
| "loss": 0.4459, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.5864156018829858, |
| "grad_norm": 1.3675820589439671, |
| "learning_rate": 9.143195117776081e-07, |
| "loss": 0.4367, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.613315400134499, |
| "grad_norm": 1.2781582138814331, |
| "learning_rate": 8.041491950588457e-07, |
| "loss": 0.4422, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.613315400134499, |
| "eval_loss": 0.5570077896118164, |
| "eval_runtime": 20.3295, |
| "eval_samples_per_second": 23.906, |
| "eval_steps_per_second": 0.787, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.640215198386012, |
| "grad_norm": 1.272071588739883, |
| "learning_rate": 7.003059852941429e-07, |
| "loss": 0.4483, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.6671149966375252, |
| "grad_norm": 1.4222402008140032, |
| "learning_rate": 6.029954700201938e-07, |
| "loss": 0.427, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.6940147948890383, |
| "grad_norm": 1.2265154550516224, |
| "learning_rate": 5.124103034237804e-07, |
| "loss": 0.4319, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.7209145931405514, |
| "grad_norm": 1.2705738648920262, |
| "learning_rate": 4.2872982492732256e-07, |
| "loss": 0.4433, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.7478143913920645, |
| "grad_norm": 1.3331198275976817, |
| "learning_rate": 3.521197041348576e-07, |
| "loss": 0.4462, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.7478143913920645, |
| "eval_loss": 0.5559241771697998, |
| "eval_runtime": 20.3664, |
| "eval_samples_per_second": 23.863, |
| "eval_steps_per_second": 0.786, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.7747141896435776, |
| "grad_norm": 1.2371719113362185, |
| "learning_rate": 2.827316128413475e-07, |
| "loss": 0.4447, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.8016139878950908, |
| "grad_norm": 1.244858103875357, |
| "learning_rate": 2.2070292475468677e-07, |
| "loss": 0.4358, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.8285137861466039, |
| "grad_norm": 1.2807189158760035, |
| "learning_rate": 1.6615644352488923e-07, |
| "loss": 0.4503, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.855413584398117, |
| "grad_norm": 1.3405147314829868, |
| "learning_rate": 1.1920015961889785e-07, |
| "loss": 0.4375, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.88231338264963, |
| "grad_norm": 1.3156533030443653, |
| "learning_rate": 7.992703652236122e-08, |
| "loss": 0.4404, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.88231338264963, |
| "eval_loss": 0.554787814617157, |
| "eval_runtime": 20.3816, |
| "eval_samples_per_second": 23.845, |
| "eval_steps_per_second": 0.785, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.9092131809011432, |
| "grad_norm": 1.3136336467093748, |
| "learning_rate": 4.8414826691641985e-08, |
| "loss": 0.4459, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.9361129791526563, |
| "grad_norm": 1.2848602231721438, |
| "learning_rate": 2.4725917620438408e-08, |
| "loss": 0.4293, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.9630127774041695, |
| "grad_norm": 1.284534380656877, |
| "learning_rate": 8.907208325779069e-09, |
| "loss": 0.4366, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.9899125756556826, |
| "grad_norm": 1.3165574333322239, |
| "learning_rate": 9.900164979099735e-10, |
| "loss": 0.4422, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.0, |
| "step": 744, |
| "total_flos": 77049352159232.0, |
| "train_loss": 0.5424991769175376, |
| "train_runtime": 9734.9137, |
| "train_samples_per_second": 4.887, |
| "train_steps_per_second": 0.076 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 744, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 77049352159232.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|