{ "best_metric": 1.7116761207580566, "best_model_checkpoint": "/content/drive/MyDrive/W210 Capstone - Lyric Generation with Melody/loaf/models/lyrlen/bart/bart-finetuned-lyrlen-512/checkpoint-22500", "epoch": 3.0, "eval_steps": 500, "global_step": 36000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "grad_norm": 2.445735216140747, "learning_rate": 4.8958333333333335e-05, "loss": 2.221, "step": 500 }, { "epoch": 0.04, "eval_loss": 1.966707706451416, "eval_runtime": 523.1555, "eval_samples_per_second": 5.734, "eval_steps_per_second": 2.867, "step": 500 }, { "epoch": 0.08, "grad_norm": 2.755873680114746, "learning_rate": 4.791666666666667e-05, "loss": 2.0336, "step": 1000 }, { "epoch": 0.08, "eval_loss": 1.8762251138687134, "eval_runtime": 521.8304, "eval_samples_per_second": 5.749, "eval_steps_per_second": 2.874, "step": 1000 }, { "epoch": 0.12, "grad_norm": 1.6574139595031738, "learning_rate": 4.6875e-05, "loss": 1.9563, "step": 1500 }, { "epoch": 0.12, "eval_loss": 1.8565189838409424, "eval_runtime": 523.2148, "eval_samples_per_second": 5.734, "eval_steps_per_second": 2.867, "step": 1500 }, { "epoch": 0.17, "grad_norm": 2.1005289554595947, "learning_rate": 4.5833333333333334e-05, "loss": 1.9555, "step": 2000 }, { "epoch": 0.17, "eval_loss": 1.8391904830932617, "eval_runtime": 523.6661, "eval_samples_per_second": 5.729, "eval_steps_per_second": 2.864, "step": 2000 }, { "epoch": 0.21, "grad_norm": 2.272672414779663, "learning_rate": 4.4791666666666673e-05, "loss": 1.9072, "step": 2500 }, { "epoch": 0.21, "eval_loss": 1.8213719129562378, "eval_runtime": 522.7152, "eval_samples_per_second": 5.739, "eval_steps_per_second": 2.87, "step": 2500 }, { "epoch": 0.25, "grad_norm": 2.835890769958496, "learning_rate": 4.375e-05, "loss": 1.8796, "step": 3000 }, { "epoch": 0.25, "eval_loss": 1.8246040344238281, "eval_runtime": 523.1839, "eval_samples_per_second": 5.734, "eval_steps_per_second": 2.867, "step": 3000 }, { "epoch": 0.29, "grad_norm": 3.6644091606140137, "learning_rate": 4.270833333333333e-05, "loss": 1.8955, "step": 3500 }, { "epoch": 0.29, "eval_loss": 1.8050193786621094, "eval_runtime": 523.9576, "eval_samples_per_second": 5.726, "eval_steps_per_second": 2.863, "step": 3500 }, { "epoch": 0.33, "grad_norm": 3.492926836013794, "learning_rate": 4.166666666666667e-05, "loss": 1.8254, "step": 4000 }, { "epoch": 0.33, "eval_loss": 1.806920051574707, "eval_runtime": 522.6132, "eval_samples_per_second": 5.74, "eval_steps_per_second": 2.87, "step": 4000 }, { "epoch": 0.38, "grad_norm": 2.235563278198242, "learning_rate": 4.0625000000000005e-05, "loss": 1.8518, "step": 4500 }, { "epoch": 0.38, "eval_loss": 1.7872947454452515, "eval_runtime": 521.8198, "eval_samples_per_second": 5.749, "eval_steps_per_second": 2.875, "step": 4500 }, { "epoch": 0.42, "grad_norm": 3.2742645740509033, "learning_rate": 3.958333333333333e-05, "loss": 1.8471, "step": 5000 }, { "epoch": 0.42, "eval_loss": 1.7879706621170044, "eval_runtime": 522.5919, "eval_samples_per_second": 5.741, "eval_steps_per_second": 2.87, "step": 5000 }, { "epoch": 0.46, "grad_norm": 2.861785411834717, "learning_rate": 3.854166666666667e-05, "loss": 1.8536, "step": 5500 }, { "epoch": 0.46, "eval_loss": 1.773565649986267, "eval_runtime": 522.5748, "eval_samples_per_second": 5.741, "eval_steps_per_second": 2.87, "step": 5500 }, { "epoch": 0.5, "grad_norm": 2.342087745666504, "learning_rate": 3.7500000000000003e-05, "loss": 1.8075, "step": 6000 }, { "epoch": 0.5, "eval_loss": 1.7772246599197388, "eval_runtime": 521.8614, "eval_samples_per_second": 5.749, "eval_steps_per_second": 2.874, "step": 6000 }, { "epoch": 0.54, "grad_norm": 3.7963709831237793, "learning_rate": 3.6458333333333336e-05, "loss": 1.8143, "step": 6500 }, { "epoch": 0.54, "eval_loss": 1.7724062204360962, "eval_runtime": 522.2168, "eval_samples_per_second": 5.745, "eval_steps_per_second": 2.872, "step": 6500 }, { "epoch": 0.58, "grad_norm": 2.7261409759521484, "learning_rate": 3.541666666666667e-05, "loss": 1.8383, "step": 7000 }, { "epoch": 0.58, "eval_loss": 1.7670141458511353, "eval_runtime": 523.5327, "eval_samples_per_second": 5.73, "eval_steps_per_second": 2.865, "step": 7000 }, { "epoch": 0.62, "grad_norm": 2.521083354949951, "learning_rate": 3.4375e-05, "loss": 1.746, "step": 7500 }, { "epoch": 0.62, "eval_loss": 1.774078607559204, "eval_runtime": 522.8622, "eval_samples_per_second": 5.738, "eval_steps_per_second": 2.869, "step": 7500 }, { "epoch": 0.67, "grad_norm": 2.275449752807617, "learning_rate": 3.3333333333333335e-05, "loss": 1.7844, "step": 8000 }, { "epoch": 0.67, "eval_loss": 1.7608264684677124, "eval_runtime": 524.0843, "eval_samples_per_second": 5.724, "eval_steps_per_second": 2.862, "step": 8000 }, { "epoch": 0.71, "grad_norm": 3.3412251472473145, "learning_rate": 3.229166666666667e-05, "loss": 1.7761, "step": 8500 }, { "epoch": 0.71, "eval_loss": 1.7679654359817505, "eval_runtime": 523.8138, "eval_samples_per_second": 5.727, "eval_steps_per_second": 2.864, "step": 8500 }, { "epoch": 0.75, "grad_norm": 2.000788688659668, "learning_rate": 3.125e-05, "loss": 1.7367, "step": 9000 }, { "epoch": 0.75, "eval_loss": 1.7554887533187866, "eval_runtime": 522.9431, "eval_samples_per_second": 5.737, "eval_steps_per_second": 2.868, "step": 9000 }, { "epoch": 0.79, "grad_norm": 2.744258403778076, "learning_rate": 3.0208333333333334e-05, "loss": 1.7656, "step": 9500 }, { "epoch": 0.79, "eval_loss": 1.7508151531219482, "eval_runtime": 522.4025, "eval_samples_per_second": 5.743, "eval_steps_per_second": 2.871, "step": 9500 }, { "epoch": 0.83, "grad_norm": 2.7946290969848633, "learning_rate": 2.916666666666667e-05, "loss": 1.7467, "step": 10000 }, { "epoch": 0.83, "eval_loss": 1.7557835578918457, "eval_runtime": 523.7331, "eval_samples_per_second": 5.728, "eval_steps_per_second": 2.864, "step": 10000 }, { "epoch": 0.88, "grad_norm": 2.4864296913146973, "learning_rate": 2.8125000000000003e-05, "loss": 1.7744, "step": 10500 }, { "epoch": 0.88, "eval_loss": 1.744908094406128, "eval_runtime": 523.6435, "eval_samples_per_second": 5.729, "eval_steps_per_second": 2.865, "step": 10500 }, { "epoch": 0.92, "grad_norm": 2.4293007850646973, "learning_rate": 2.7083333333333332e-05, "loss": 1.7513, "step": 11000 }, { "epoch": 0.92, "eval_loss": 1.7462434768676758, "eval_runtime": 523.5176, "eval_samples_per_second": 5.73, "eval_steps_per_second": 2.865, "step": 11000 }, { "epoch": 0.96, "grad_norm": 2.8948864936828613, "learning_rate": 2.604166666666667e-05, "loss": 1.7482, "step": 11500 }, { "epoch": 0.96, "eval_loss": 1.757570505142212, "eval_runtime": 523.9557, "eval_samples_per_second": 5.726, "eval_steps_per_second": 2.863, "step": 11500 }, { "epoch": 1.0, "grad_norm": 3.327749490737915, "learning_rate": 2.5e-05, "loss": 1.724, "step": 12000 }, { "epoch": 1.0, "eval_loss": 1.752493143081665, "eval_runtime": 523.6941, "eval_samples_per_second": 5.729, "eval_steps_per_second": 2.864, "step": 12000 }, { "epoch": 1.04, "grad_norm": 2.6060996055603027, "learning_rate": 2.3958333333333334e-05, "loss": 1.7043, "step": 12500 }, { "epoch": 1.04, "eval_loss": 1.7745699882507324, "eval_runtime": 524.2816, "eval_samples_per_second": 5.722, "eval_steps_per_second": 2.861, "step": 12500 }, { "epoch": 1.08, "grad_norm": 3.1417739391326904, "learning_rate": 2.2916666666666667e-05, "loss": 1.6869, "step": 13000 }, { "epoch": 1.08, "eval_loss": 1.7530784606933594, "eval_runtime": 524.8963, "eval_samples_per_second": 5.715, "eval_steps_per_second": 2.858, "step": 13000 }, { "epoch": 1.12, "grad_norm": 2.0913174152374268, "learning_rate": 2.1875e-05, "loss": 1.7405, "step": 13500 }, { "epoch": 1.12, "eval_loss": 1.7472872734069824, "eval_runtime": 523.5132, "eval_samples_per_second": 5.731, "eval_steps_per_second": 2.865, "step": 13500 }, { "epoch": 1.17, "grad_norm": 2.501850128173828, "learning_rate": 2.0833333333333336e-05, "loss": 1.7343, "step": 14000 }, { "epoch": 1.17, "eval_loss": 1.73961341381073, "eval_runtime": 523.4238, "eval_samples_per_second": 5.731, "eval_steps_per_second": 2.866, "step": 14000 }, { "epoch": 1.21, "grad_norm": 2.3180103302001953, "learning_rate": 1.9791666666666665e-05, "loss": 1.649, "step": 14500 }, { "epoch": 1.21, "eval_loss": 1.738362431526184, "eval_runtime": 522.9782, "eval_samples_per_second": 5.736, "eval_steps_per_second": 2.868, "step": 14500 }, { "epoch": 1.25, "grad_norm": 2.6544899940490723, "learning_rate": 1.8750000000000002e-05, "loss": 1.7208, "step": 15000 }, { "epoch": 1.25, "eval_loss": 1.7367714643478394, "eval_runtime": 522.0439, "eval_samples_per_second": 5.747, "eval_steps_per_second": 2.873, "step": 15000 }, { "epoch": 1.29, "grad_norm": 2.5193896293640137, "learning_rate": 1.7708333333333335e-05, "loss": 1.6931, "step": 15500 }, { "epoch": 1.29, "eval_loss": 1.7404463291168213, "eval_runtime": 523.0385, "eval_samples_per_second": 5.736, "eval_steps_per_second": 2.868, "step": 15500 }, { "epoch": 1.33, "grad_norm": 2.3002140522003174, "learning_rate": 1.6666666666666667e-05, "loss": 1.5941, "step": 16000 }, { "epoch": 1.33, "eval_loss": 1.8222692012786865, "eval_runtime": 521.374, "eval_samples_per_second": 5.754, "eval_steps_per_second": 2.877, "step": 16000 }, { "epoch": 1.38, "grad_norm": 3.3250951766967773, "learning_rate": 1.5625e-05, "loss": 1.6651, "step": 16500 }, { "epoch": 1.38, "eval_loss": 1.728702187538147, "eval_runtime": 522.2011, "eval_samples_per_second": 5.745, "eval_steps_per_second": 2.872, "step": 16500 }, { "epoch": 1.42, "grad_norm": 2.0962421894073486, "learning_rate": 1.4583333333333335e-05, "loss": 1.6649, "step": 17000 }, { "epoch": 1.42, "eval_loss": 1.741267204284668, "eval_runtime": 522.024, "eval_samples_per_second": 5.747, "eval_steps_per_second": 2.873, "step": 17000 }, { "epoch": 1.46, "grad_norm": 2.672060012817383, "learning_rate": 1.3541666666666666e-05, "loss": 1.7108, "step": 17500 }, { "epoch": 1.46, "eval_loss": 1.7304407358169556, "eval_runtime": 521.9513, "eval_samples_per_second": 5.748, "eval_steps_per_second": 2.874, "step": 17500 }, { "epoch": 1.5, "grad_norm": 2.358701229095459, "learning_rate": 1.25e-05, "loss": 1.713, "step": 18000 }, { "epoch": 1.5, "eval_loss": 1.7263309955596924, "eval_runtime": 521.5686, "eval_samples_per_second": 5.752, "eval_steps_per_second": 2.876, "step": 18000 }, { "epoch": 1.54, "grad_norm": 3.01208233833313, "learning_rate": 1.1458333333333333e-05, "loss": 1.6866, "step": 18500 }, { "epoch": 1.54, "eval_loss": 1.7139294147491455, "eval_runtime": 521.9607, "eval_samples_per_second": 5.748, "eval_steps_per_second": 2.874, "step": 18500 }, { "epoch": 1.58, "grad_norm": 1.647988200187683, "learning_rate": 1.0416666666666668e-05, "loss": 1.6461, "step": 19000 }, { "epoch": 1.58, "eval_loss": 1.7220714092254639, "eval_runtime": 523.0668, "eval_samples_per_second": 5.735, "eval_steps_per_second": 2.868, "step": 19000 }, { "epoch": 1.62, "grad_norm": 2.259550094604492, "learning_rate": 9.375000000000001e-06, "loss": 1.6886, "step": 19500 }, { "epoch": 1.62, "eval_loss": 1.7159427404403687, "eval_runtime": 522.0391, "eval_samples_per_second": 5.747, "eval_steps_per_second": 2.873, "step": 19500 }, { "epoch": 1.67, "grad_norm": 2.249917507171631, "learning_rate": 8.333333333333334e-06, "loss": 1.6511, "step": 20000 }, { "epoch": 1.67, "eval_loss": 1.7302324771881104, "eval_runtime": 521.3364, "eval_samples_per_second": 5.754, "eval_steps_per_second": 2.877, "step": 20000 }, { "epoch": 1.71, "grad_norm": 1.9024745225906372, "learning_rate": 7.2916666666666674e-06, "loss": 1.6626, "step": 20500 }, { "epoch": 1.71, "eval_loss": 1.7181695699691772, "eval_runtime": 489.9754, "eval_samples_per_second": 6.123, "eval_steps_per_second": 3.061, "step": 20500 }, { "epoch": 1.75, "grad_norm": 2.484917640686035, "learning_rate": 6.25e-06, "loss": 1.7052, "step": 21000 }, { "epoch": 1.75, "eval_loss": 1.716320514678955, "eval_runtime": 489.0607, "eval_samples_per_second": 6.134, "eval_steps_per_second": 3.067, "step": 21000 }, { "epoch": 1.79, "grad_norm": 2.4450478553771973, "learning_rate": 5.208333333333334e-06, "loss": 1.6831, "step": 21500 }, { "epoch": 1.79, "eval_loss": 1.7168132066726685, "eval_runtime": 489.1379, "eval_samples_per_second": 6.133, "eval_steps_per_second": 3.067, "step": 21500 }, { "epoch": 1.83, "grad_norm": 2.011652946472168, "learning_rate": 4.166666666666667e-06, "loss": 1.6057, "step": 22000 }, { "epoch": 1.83, "eval_loss": 1.715084195137024, "eval_runtime": 489.2379, "eval_samples_per_second": 6.132, "eval_steps_per_second": 3.066, "step": 22000 }, { "epoch": 1.88, "grad_norm": 1.7278447151184082, "learning_rate": 3.125e-06, "loss": 1.6761, "step": 22500 }, { "epoch": 1.88, "eval_loss": 1.7116761207580566, "eval_runtime": 488.632, "eval_samples_per_second": 6.14, "eval_steps_per_second": 3.07, "step": 22500 }, { "epoch": 1.92, "grad_norm": 1.4899113178253174, "learning_rate": 2.0833333333333334e-06, "loss": 1.6668, "step": 23000 }, { "epoch": 1.92, "eval_loss": 1.7163901329040527, "eval_runtime": 488.958, "eval_samples_per_second": 6.135, "eval_steps_per_second": 3.068, "step": 23000 }, { "epoch": 1.96, "grad_norm": 2.4776082038879395, "learning_rate": 1.0416666666666667e-06, "loss": 1.612, "step": 23500 }, { "epoch": 1.96, "eval_loss": 1.712184190750122, "eval_runtime": 490.7154, "eval_samples_per_second": 6.114, "eval_steps_per_second": 3.057, "step": 23500 }, { "epoch": 2.0, "grad_norm": 1.9447611570358276, "learning_rate": 0.0, "loss": 1.6617, "step": 24000 }, { "epoch": 2.0, "eval_loss": 1.7131377458572388, "eval_runtime": 488.7943, "eval_samples_per_second": 6.138, "eval_steps_per_second": 3.069, "step": 24000 }, { "epoch": 2.04, "grad_norm": 1.3564224243164062, "learning_rate": 1.597222222222222e-05, "loss": 1.641, "step": 24500 }, { "epoch": 2.04, "eval_loss": 1.727720856666565, "eval_runtime": 492.4028, "eval_samples_per_second": 6.093, "eval_steps_per_second": 3.046, "step": 24500 }, { "epoch": 2.08, "grad_norm": 2.5462839603424072, "learning_rate": 1.527777777777778e-05, "loss": 1.6595, "step": 25000 }, { "epoch": 2.08, "eval_loss": 1.7289303541183472, "eval_runtime": 491.6091, "eval_samples_per_second": 6.102, "eval_steps_per_second": 3.051, "step": 25000 }, { "epoch": 2.12, "grad_norm": 3.4221107959747314, "learning_rate": 1.4583333333333335e-05, "loss": 1.6723, "step": 25500 }, { "epoch": 2.12, "eval_loss": 1.7191786766052246, "eval_runtime": 491.508, "eval_samples_per_second": 6.104, "eval_steps_per_second": 3.052, "step": 25500 }, { "epoch": 2.17, "grad_norm": 2.4440248012542725, "learning_rate": 1.388888888888889e-05, "loss": 1.6347, "step": 26000 }, { "epoch": 2.17, "eval_loss": 1.725876808166504, "eval_runtime": 490.9308, "eval_samples_per_second": 6.111, "eval_steps_per_second": 3.055, "step": 26000 }, { "epoch": 2.21, "grad_norm": 2.328885316848755, "learning_rate": 1.3194444444444446e-05, "loss": 1.6684, "step": 26500 }, { "epoch": 2.21, "eval_loss": 1.7210530042648315, "eval_runtime": 490.3203, "eval_samples_per_second": 6.118, "eval_steps_per_second": 3.059, "step": 26500 }, { "epoch": 2.25, "grad_norm": 2.388496160507202, "learning_rate": 1.25e-05, "loss": 1.6098, "step": 27000 }, { "epoch": 2.25, "eval_loss": 1.7316367626190186, "eval_runtime": 489.0261, "eval_samples_per_second": 6.135, "eval_steps_per_second": 3.067, "step": 27000 }, { "epoch": 2.29, "grad_norm": 2.6930313110351562, "learning_rate": 1.1805555555555555e-05, "loss": 1.6025, "step": 27500 }, { "epoch": 2.29, "eval_loss": 1.7213103771209717, "eval_runtime": 490.0, "eval_samples_per_second": 6.122, "eval_steps_per_second": 3.061, "step": 27500 }, { "epoch": 2.33, "grad_norm": 2.5809788703918457, "learning_rate": 1.1111111111111112e-05, "loss": 1.5567, "step": 28000 }, { "epoch": 2.33, "eval_loss": 1.7238024473190308, "eval_runtime": 492.142, "eval_samples_per_second": 6.096, "eval_steps_per_second": 3.048, "step": 28000 }, { "epoch": 2.38, "grad_norm": 3.040238618850708, "learning_rate": 1.0416666666666668e-05, "loss": 1.6564, "step": 28500 }, { "epoch": 2.38, "eval_loss": 1.7184983491897583, "eval_runtime": 498.5346, "eval_samples_per_second": 6.018, "eval_steps_per_second": 3.009, "step": 28500 }, { "epoch": 2.42, "grad_norm": 1.9776560068130493, "learning_rate": 9.722222222222223e-06, "loss": 1.7078, "step": 29000 }, { "epoch": 2.42, "eval_loss": 1.7392594814300537, "eval_runtime": 495.7995, "eval_samples_per_second": 6.051, "eval_steps_per_second": 3.025, "step": 29000 }, { "epoch": 2.46, "grad_norm": 2.065131664276123, "learning_rate": 9.027777777777777e-06, "loss": 1.6308, "step": 29500 }, { "epoch": 2.46, "eval_loss": 1.7234176397323608, "eval_runtime": 495.0669, "eval_samples_per_second": 6.06, "eval_steps_per_second": 3.03, "step": 29500 }, { "epoch": 2.5, "grad_norm": 2.066938877105713, "learning_rate": 8.333333333333334e-06, "loss": 1.6402, "step": 30000 }, { "epoch": 2.5, "eval_loss": 1.7319450378417969, "eval_runtime": 495.0885, "eval_samples_per_second": 6.06, "eval_steps_per_second": 3.03, "step": 30000 }, { "epoch": 2.54, "grad_norm": 3.136577844619751, "learning_rate": 7.63888888888889e-06, "loss": 1.6333, "step": 30500 }, { "epoch": 2.54, "eval_loss": 1.7196767330169678, "eval_runtime": 494.8297, "eval_samples_per_second": 6.063, "eval_steps_per_second": 3.031, "step": 30500 }, { "epoch": 2.58, "grad_norm": 2.5112669467926025, "learning_rate": 6.944444444444445e-06, "loss": 1.6249, "step": 31000 }, { "epoch": 2.58, "eval_loss": 1.7298192977905273, "eval_runtime": 495.4931, "eval_samples_per_second": 6.055, "eval_steps_per_second": 3.027, "step": 31000 }, { "epoch": 2.62, "grad_norm": 2.3685288429260254, "learning_rate": 6.25e-06, "loss": 1.6366, "step": 31500 }, { "epoch": 2.62, "eval_loss": 1.7235329151153564, "eval_runtime": 496.2286, "eval_samples_per_second": 6.046, "eval_steps_per_second": 3.023, "step": 31500 }, { "epoch": 2.67, "grad_norm": 3.0769245624542236, "learning_rate": 5.555555555555556e-06, "loss": 1.6245, "step": 32000 }, { "epoch": 2.67, "eval_loss": 1.7288986444473267, "eval_runtime": 495.8639, "eval_samples_per_second": 6.05, "eval_steps_per_second": 3.025, "step": 32000 }, { "epoch": 2.71, "grad_norm": 2.3203799724578857, "learning_rate": 4.861111111111111e-06, "loss": 1.6044, "step": 32500 }, { "epoch": 2.71, "eval_loss": 1.716009497642517, "eval_runtime": 496.0984, "eval_samples_per_second": 6.047, "eval_steps_per_second": 3.024, "step": 32500 }, { "epoch": 2.75, "grad_norm": 2.5120131969451904, "learning_rate": 4.166666666666667e-06, "loss": 1.6095, "step": 33000 }, { "epoch": 2.75, "eval_loss": 1.7171862125396729, "eval_runtime": 495.1558, "eval_samples_per_second": 6.059, "eval_steps_per_second": 3.029, "step": 33000 }, { "epoch": 2.79, "grad_norm": 2.938133955001831, "learning_rate": 3.4722222222222224e-06, "loss": 1.6621, "step": 33500 }, { "epoch": 2.79, "eval_loss": 1.7209596633911133, "eval_runtime": 495.594, "eval_samples_per_second": 6.053, "eval_steps_per_second": 3.027, "step": 33500 }, { "epoch": 2.83, "grad_norm": 1.8592183589935303, "learning_rate": 2.777777777777778e-06, "loss": 1.6883, "step": 34000 }, { "epoch": 2.83, "eval_loss": 1.716880202293396, "eval_runtime": 495.2039, "eval_samples_per_second": 6.058, "eval_steps_per_second": 3.029, "step": 34000 }, { "epoch": 2.88, "grad_norm": 2.0370161533355713, "learning_rate": 2.0833333333333334e-06, "loss": 1.6449, "step": 34500 }, { "epoch": 2.88, "eval_loss": 1.715524435043335, "eval_runtime": 494.6462, "eval_samples_per_second": 6.065, "eval_steps_per_second": 3.032, "step": 34500 }, { "epoch": 2.92, "grad_norm": 2.4444518089294434, "learning_rate": 1.388888888888889e-06, "loss": 1.6439, "step": 35000 }, { "epoch": 2.92, "eval_loss": 1.7200895547866821, "eval_runtime": 494.3013, "eval_samples_per_second": 6.069, "eval_steps_per_second": 3.035, "step": 35000 }, { "epoch": 2.96, "grad_norm": 3.6948931217193604, "learning_rate": 6.944444444444445e-07, "loss": 1.6358, "step": 35500 }, { "epoch": 2.96, "eval_loss": 1.7187682390213013, "eval_runtime": 495.0903, "eval_samples_per_second": 6.06, "eval_steps_per_second": 3.03, "step": 35500 }, { "epoch": 3.0, "grad_norm": 2.0012359619140625, "learning_rate": 0.0, "loss": 1.6033, "step": 36000 }, { "epoch": 3.0, "eval_loss": 1.7206045389175415, "eval_runtime": 494.6912, "eval_samples_per_second": 6.064, "eval_steps_per_second": 3.032, "step": 36000 } ], "logging_steps": 500, "max_steps": 36000, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 7.8015765676032e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }