{ "best_metric": 0.5004217028617859, "best_model_checkpoint": "../../experiments_checkpoints/MAdAiLab/google_t5/t5_base_ledgar/checkpoint-2800", "epoch": 3.0, "eval_steps": 100, "global_step": 2814, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 2.0540711879730225, "learning_rate": 0.0004955579246624023, "loss": 4.1527, "step": 25 }, { "epoch": 0.05, "grad_norm": 2.241663694381714, "learning_rate": 0.0004911158493248046, "loss": 2.8808, "step": 50 }, { "epoch": 0.08, "grad_norm": 2.093815803527832, "learning_rate": 0.00048667377398720687, "loss": 1.9075, "step": 75 }, { "epoch": 0.11, "grad_norm": 2.396036148071289, "learning_rate": 0.0004822316986496091, "loss": 1.443, "step": 100 }, { "epoch": 0.11, "eval_accuracy": 0.7291, "eval_f1_macro": 0.5312263291596806, "eval_f1_micro": 0.7291, "eval_loss": 1.113275408744812, "eval_runtime": 24.8696, "eval_samples_per_second": 402.097, "eval_steps_per_second": 6.313, "step": 100 }, { "epoch": 0.13, "grad_norm": 2.6226117610931396, "learning_rate": 0.0004777896233120114, "loss": 1.1778, "step": 125 }, { "epoch": 0.16, "grad_norm": 2.527249574661255, "learning_rate": 0.00047334754797441367, "loss": 1.0394, "step": 150 }, { "epoch": 0.19, "grad_norm": 2.435964584350586, "learning_rate": 0.00046890547263681595, "loss": 0.9535, "step": 175 }, { "epoch": 0.21, "grad_norm": 2.0345797538757324, "learning_rate": 0.00046446339729921824, "loss": 0.8813, "step": 200 }, { "epoch": 0.21, "eval_accuracy": 0.7712, "eval_f1_macro": 0.6296223926135491, "eval_f1_micro": 0.7712, "eval_loss": 0.8404093980789185, "eval_runtime": 24.3258, "eval_samples_per_second": 411.086, "eval_steps_per_second": 6.454, "step": 200 }, { "epoch": 0.24, "grad_norm": 4.247000217437744, "learning_rate": 0.0004600213219616205, "loss": 0.8062, "step": 225 }, { "epoch": 0.27, "grad_norm": 1.9799127578735352, "learning_rate": 0.00045557924662402275, "loss": 0.8654, "step": 250 }, { "epoch": 0.29, "grad_norm": 2.0902786254882812, "learning_rate": 0.000451137171286425, "loss": 0.8073, "step": 275 }, { "epoch": 0.32, "grad_norm": 1.7158573865890503, "learning_rate": 0.00044669509594882727, "loss": 0.761, "step": 300 }, { "epoch": 0.32, "eval_accuracy": 0.8021, "eval_f1_macro": 0.6788544961571827, "eval_f1_micro": 0.8021, "eval_loss": 0.738567590713501, "eval_runtime": 24.2955, "eval_samples_per_second": 411.599, "eval_steps_per_second": 6.462, "step": 300 }, { "epoch": 0.35, "grad_norm": 2.2180001735687256, "learning_rate": 0.00044225302061122956, "loss": 0.7375, "step": 325 }, { "epoch": 0.37, "grad_norm": 2.029712677001953, "learning_rate": 0.00043781094527363184, "loss": 0.7601, "step": 350 }, { "epoch": 0.4, "grad_norm": 2.0835516452789307, "learning_rate": 0.0004333688699360341, "loss": 0.6685, "step": 375 }, { "epoch": 0.43, "grad_norm": 2.2809343338012695, "learning_rate": 0.0004289267945984364, "loss": 0.7358, "step": 400 }, { "epoch": 0.43, "eval_accuracy": 0.805, "eval_f1_macro": 0.6786916719278343, "eval_f1_micro": 0.805, "eval_loss": 0.731271505355835, "eval_runtime": 24.2168, "eval_samples_per_second": 412.937, "eval_steps_per_second": 6.483, "step": 400 }, { "epoch": 0.45, "grad_norm": 2.0364644527435303, "learning_rate": 0.00042448471926083864, "loss": 0.7414, "step": 425 }, { "epoch": 0.48, "grad_norm": 2.7770190238952637, "learning_rate": 0.00042004264392324093, "loss": 0.8142, "step": 450 }, { "epoch": 0.51, "grad_norm": 9.388680458068848, "learning_rate": 0.0004156005685856432, "loss": 0.7282, "step": 475 }, { "epoch": 0.53, "grad_norm": 2.113584041595459, "learning_rate": 0.0004111584932480455, "loss": 0.7624, "step": 500 }, { "epoch": 0.53, "eval_accuracy": 0.8164, "eval_f1_macro": 0.7134072796381032, "eval_f1_micro": 0.8164, "eval_loss": 0.6560911536216736, "eval_runtime": 24.2135, "eval_samples_per_second": 412.993, "eval_steps_per_second": 6.484, "step": 500 }, { "epoch": 0.56, "grad_norm": 1.490932822227478, "learning_rate": 0.0004067164179104478, "loss": 0.7123, "step": 525 }, { "epoch": 0.59, "grad_norm": 1.1674153804779053, "learning_rate": 0.00040227434257285007, "loss": 0.692, "step": 550 }, { "epoch": 0.61, "grad_norm": 2.5101075172424316, "learning_rate": 0.00039783226723525235, "loss": 0.6392, "step": 575 }, { "epoch": 0.64, "grad_norm": 1.797776460647583, "learning_rate": 0.0003933901918976546, "loss": 0.7067, "step": 600 }, { "epoch": 0.64, "eval_accuracy": 0.821, "eval_f1_macro": 0.7273350797174626, "eval_f1_micro": 0.821, "eval_loss": 0.6418954133987427, "eval_runtime": 24.1496, "eval_samples_per_second": 414.086, "eval_steps_per_second": 6.501, "step": 600 }, { "epoch": 0.67, "grad_norm": 1.9408563375473022, "learning_rate": 0.00038894811656005687, "loss": 0.6762, "step": 625 }, { "epoch": 0.69, "grad_norm": 1.9674413204193115, "learning_rate": 0.00038450604122245916, "loss": 0.6784, "step": 650 }, { "epoch": 0.72, "grad_norm": 2.5168087482452393, "learning_rate": 0.00038006396588486144, "loss": 0.7106, "step": 675 }, { "epoch": 0.75, "grad_norm": 1.7439873218536377, "learning_rate": 0.0003756218905472637, "loss": 0.6298, "step": 700 }, { "epoch": 0.75, "eval_accuracy": 0.8254, "eval_f1_macro": 0.7229632924236448, "eval_f1_micro": 0.8254, "eval_loss": 0.6412187218666077, "eval_runtime": 24.0401, "eval_samples_per_second": 415.971, "eval_steps_per_second": 6.531, "step": 700 }, { "epoch": 0.77, "grad_norm": 1.6693323850631714, "learning_rate": 0.00037117981520966596, "loss": 0.6267, "step": 725 }, { "epoch": 0.8, "grad_norm": 2.042109251022339, "learning_rate": 0.00036673773987206824, "loss": 0.7153, "step": 750 }, { "epoch": 0.83, "grad_norm": 1.5407096147537231, "learning_rate": 0.0003622956645344705, "loss": 0.6641, "step": 775 }, { "epoch": 0.85, "grad_norm": 1.5067718029022217, "learning_rate": 0.00035785358919687276, "loss": 0.6544, "step": 800 }, { "epoch": 0.85, "eval_accuracy": 0.8217, "eval_f1_macro": 0.7223377361999187, "eval_f1_micro": 0.8217, "eval_loss": 0.6277242302894592, "eval_runtime": 24.1366, "eval_samples_per_second": 414.309, "eval_steps_per_second": 6.505, "step": 800 }, { "epoch": 0.88, "grad_norm": 2.1698966026306152, "learning_rate": 0.00035341151385927504, "loss": 0.7108, "step": 825 }, { "epoch": 0.91, "grad_norm": 1.2698220014572144, "learning_rate": 0.00034896943852167733, "loss": 0.5897, "step": 850 }, { "epoch": 0.93, "grad_norm": 2.0006988048553467, "learning_rate": 0.0003445273631840796, "loss": 0.6386, "step": 875 }, { "epoch": 0.96, "grad_norm": 1.7718091011047363, "learning_rate": 0.0003400852878464819, "loss": 0.5781, "step": 900 }, { "epoch": 0.96, "eval_accuracy": 0.8305, "eval_f1_macro": 0.7420217283556048, "eval_f1_micro": 0.8305, "eval_loss": 0.6054204702377319, "eval_runtime": 24.1901, "eval_samples_per_second": 413.393, "eval_steps_per_second": 6.49, "step": 900 }, { "epoch": 0.99, "grad_norm": 1.9609644412994385, "learning_rate": 0.00033564321250888413, "loss": 0.6415, "step": 925 }, { "epoch": 1.01, "grad_norm": 1.4652588367462158, "learning_rate": 0.0003312011371712864, "loss": 0.5699, "step": 950 }, { "epoch": 1.04, "grad_norm": 1.6652113199234009, "learning_rate": 0.0003267590618336887, "loss": 0.486, "step": 975 }, { "epoch": 1.07, "grad_norm": 1.3951687812805176, "learning_rate": 0.000322316986496091, "loss": 0.4674, "step": 1000 }, { "epoch": 1.07, "eval_accuracy": 0.8346, "eval_f1_macro": 0.737133370653897, "eval_f1_micro": 0.8346, "eval_loss": 0.6210275292396545, "eval_runtime": 24.1347, "eval_samples_per_second": 414.342, "eval_steps_per_second": 6.505, "step": 1000 }, { "epoch": 1.09, "grad_norm": 1.52338445186615, "learning_rate": 0.00031787491115849327, "loss": 0.4979, "step": 1025 }, { "epoch": 1.12, "grad_norm": 1.9127336740493774, "learning_rate": 0.00031343283582089556, "loss": 0.4658, "step": 1050 }, { "epoch": 1.15, "grad_norm": 1.5914901494979858, "learning_rate": 0.00030899076048329784, "loss": 0.5625, "step": 1075 }, { "epoch": 1.17, "grad_norm": 2.0196564197540283, "learning_rate": 0.0003045486851457001, "loss": 0.4929, "step": 1100 }, { "epoch": 1.17, "eval_accuracy": 0.8387, "eval_f1_macro": 0.7423130077227065, "eval_f1_micro": 0.8387, "eval_loss": 0.5875550508499146, "eval_runtime": 24.1428, "eval_samples_per_second": 414.202, "eval_steps_per_second": 6.503, "step": 1100 }, { "epoch": 1.2, "grad_norm": 1.3181229829788208, "learning_rate": 0.00030010660980810236, "loss": 0.4999, "step": 1125 }, { "epoch": 1.23, "grad_norm": 1.321296215057373, "learning_rate": 0.00029566453447050464, "loss": 0.4456, "step": 1150 }, { "epoch": 1.25, "grad_norm": 1.4967468976974487, "learning_rate": 0.0002912224591329069, "loss": 0.443, "step": 1175 }, { "epoch": 1.28, "grad_norm": 1.622883915901184, "learning_rate": 0.00028678038379530916, "loss": 0.566, "step": 1200 }, { "epoch": 1.28, "eval_accuracy": 0.8475, "eval_f1_macro": 0.7633448680546241, "eval_f1_micro": 0.8475, "eval_loss": 0.5779463648796082, "eval_runtime": 24.1848, "eval_samples_per_second": 413.483, "eval_steps_per_second": 6.492, "step": 1200 }, { "epoch": 1.31, "grad_norm": 2.2820820808410645, "learning_rate": 0.00028233830845771145, "loss": 0.5382, "step": 1225 }, { "epoch": 1.33, "grad_norm": 1.5108492374420166, "learning_rate": 0.00027789623312011373, "loss": 0.5314, "step": 1250 }, { "epoch": 1.36, "grad_norm": 1.2868916988372803, "learning_rate": 0.00027345415778251596, "loss": 0.4951, "step": 1275 }, { "epoch": 1.39, "grad_norm": 2.0262043476104736, "learning_rate": 0.00026901208244491825, "loss": 0.4577, "step": 1300 }, { "epoch": 1.39, "eval_accuracy": 0.8435, "eval_f1_macro": 0.7507843808294652, "eval_f1_micro": 0.8435, "eval_loss": 0.5771787762641907, "eval_runtime": 24.1138, "eval_samples_per_second": 414.7, "eval_steps_per_second": 6.511, "step": 1300 }, { "epoch": 1.41, "grad_norm": 1.7890022993087769, "learning_rate": 0.00026457000710732053, "loss": 0.5097, "step": 1325 }, { "epoch": 1.44, "grad_norm": 1.5777374505996704, "learning_rate": 0.0002601279317697228, "loss": 0.4894, "step": 1350 }, { "epoch": 1.47, "grad_norm": 1.6835675239562988, "learning_rate": 0.0002556858564321251, "loss": 0.4487, "step": 1375 }, { "epoch": 1.49, "grad_norm": 1.7419664859771729, "learning_rate": 0.0002512437810945274, "loss": 0.4233, "step": 1400 }, { "epoch": 1.49, "eval_accuracy": 0.8476, "eval_f1_macro": 0.7624728239220356, "eval_f1_micro": 0.8476, "eval_loss": 0.5580869913101196, "eval_runtime": 24.1623, "eval_samples_per_second": 413.868, "eval_steps_per_second": 6.498, "step": 1400 }, { "epoch": 1.52, "grad_norm": 1.6330546140670776, "learning_rate": 0.0002468017057569296, "loss": 0.4985, "step": 1425 }, { "epoch": 1.55, "grad_norm": 1.5560388565063477, "learning_rate": 0.00024235963041933193, "loss": 0.4902, "step": 1450 }, { "epoch": 1.57, "grad_norm": 1.1412893533706665, "learning_rate": 0.0002379175550817342, "loss": 0.4811, "step": 1475 }, { "epoch": 1.6, "grad_norm": 1.4423662424087524, "learning_rate": 0.00023347547974413648, "loss": 0.4567, "step": 1500 }, { "epoch": 1.6, "eval_accuracy": 0.8462, "eval_f1_macro": 0.7575976290013776, "eval_f1_micro": 0.8462, "eval_loss": 0.5687991976737976, "eval_runtime": 24.159, "eval_samples_per_second": 413.925, "eval_steps_per_second": 6.499, "step": 1500 }, { "epoch": 1.63, "grad_norm": 1.9873683452606201, "learning_rate": 0.00022903340440653876, "loss": 0.4455, "step": 1525 }, { "epoch": 1.65, "grad_norm": 1.4635204076766968, "learning_rate": 0.000224591329068941, "loss": 0.4457, "step": 1550 }, { "epoch": 1.68, "grad_norm": 1.56647789478302, "learning_rate": 0.00022014925373134328, "loss": 0.4223, "step": 1575 }, { "epoch": 1.71, "grad_norm": 1.8017354011535645, "learning_rate": 0.00021570717839374556, "loss": 0.483, "step": 1600 }, { "epoch": 1.71, "eval_accuracy": 0.8478, "eval_f1_macro": 0.7608717953670078, "eval_f1_micro": 0.8478, "eval_loss": 0.5547010898590088, "eval_runtime": 24.1023, "eval_samples_per_second": 414.897, "eval_steps_per_second": 6.514, "step": 1600 }, { "epoch": 1.73, "grad_norm": 1.768385887145996, "learning_rate": 0.00021126510305614785, "loss": 0.4593, "step": 1625 }, { "epoch": 1.76, "grad_norm": 1.8453024625778198, "learning_rate": 0.0002068230277185501, "loss": 0.4378, "step": 1650 }, { "epoch": 1.79, "grad_norm": 1.5103825330734253, "learning_rate": 0.0002023809523809524, "loss": 0.455, "step": 1675 }, { "epoch": 1.81, "grad_norm": 1.8187497854232788, "learning_rate": 0.00019793887704335468, "loss": 0.4649, "step": 1700 }, { "epoch": 1.81, "eval_accuracy": 0.851, "eval_f1_macro": 0.7680217546192462, "eval_f1_micro": 0.851, "eval_loss": 0.5395861864089966, "eval_runtime": 24.1531, "eval_samples_per_second": 414.025, "eval_steps_per_second": 6.5, "step": 1700 }, { "epoch": 1.84, "grad_norm": 2.1151626110076904, "learning_rate": 0.00019349680170575694, "loss": 0.5057, "step": 1725 }, { "epoch": 1.87, "grad_norm": 1.4344931840896606, "learning_rate": 0.00018905472636815922, "loss": 0.4275, "step": 1750 }, { "epoch": 1.89, "grad_norm": 3.410400152206421, "learning_rate": 0.00018461265103056148, "loss": 0.4973, "step": 1775 }, { "epoch": 1.92, "grad_norm": 2.3107552528381348, "learning_rate": 0.00018017057569296374, "loss": 0.4288, "step": 1800 }, { "epoch": 1.92, "eval_accuracy": 0.8577, "eval_f1_macro": 0.7759139835888773, "eval_f1_micro": 0.8577, "eval_loss": 0.5235319137573242, "eval_runtime": 24.1535, "eval_samples_per_second": 414.018, "eval_steps_per_second": 6.5, "step": 1800 }, { "epoch": 1.95, "grad_norm": 1.2763726711273193, "learning_rate": 0.00017572850035536602, "loss": 0.5008, "step": 1825 }, { "epoch": 1.97, "grad_norm": 1.5869358777999878, "learning_rate": 0.0001712864250177683, "loss": 0.4581, "step": 1850 }, { "epoch": 2.0, "grad_norm": 1.450799822807312, "learning_rate": 0.0001668443496801706, "loss": 0.4103, "step": 1875 }, { "epoch": 2.03, "grad_norm": 1.3206963539123535, "learning_rate": 0.00016240227434257285, "loss": 0.3445, "step": 1900 }, { "epoch": 2.03, "eval_accuracy": 0.8603, "eval_f1_macro": 0.7790782413257752, "eval_f1_micro": 0.8603, "eval_loss": 0.520423948764801, "eval_runtime": 24.0544, "eval_samples_per_second": 415.724, "eval_steps_per_second": 6.527, "step": 1900 }, { "epoch": 2.05, "grad_norm": 1.3110324144363403, "learning_rate": 0.00015796019900497514, "loss": 0.3278, "step": 1925 }, { "epoch": 2.08, "grad_norm": 1.4223313331604004, "learning_rate": 0.00015351812366737742, "loss": 0.3376, "step": 1950 }, { "epoch": 2.11, "grad_norm": 1.6518670320510864, "learning_rate": 0.00014907604832977968, "loss": 0.3519, "step": 1975 }, { "epoch": 2.13, "grad_norm": 1.5389341115951538, "learning_rate": 0.00014463397299218194, "loss": 0.3014, "step": 2000 }, { "epoch": 2.13, "eval_accuracy": 0.8607, "eval_f1_macro": 0.786161944348828, "eval_f1_micro": 0.8607, "eval_loss": 0.5268532037734985, "eval_runtime": 24.1182, "eval_samples_per_second": 414.624, "eval_steps_per_second": 6.51, "step": 2000 }, { "epoch": 2.16, "grad_norm": 1.810062289237976, "learning_rate": 0.00014019189765458422, "loss": 0.3184, "step": 2025 }, { "epoch": 2.19, "grad_norm": 1.4731919765472412, "learning_rate": 0.00013574982231698648, "loss": 0.3457, "step": 2050 }, { "epoch": 2.21, "grad_norm": 1.8230247497558594, "learning_rate": 0.00013130774697938877, "loss": 0.3107, "step": 2075 }, { "epoch": 2.24, "grad_norm": 1.9620997905731201, "learning_rate": 0.00012686567164179105, "loss": 0.3301, "step": 2100 }, { "epoch": 2.24, "eval_accuracy": 0.8591, "eval_f1_macro": 0.7826263587368261, "eval_f1_micro": 0.8591, "eval_loss": 0.5233541131019592, "eval_runtime": 24.1529, "eval_samples_per_second": 414.03, "eval_steps_per_second": 6.5, "step": 2100 }, { "epoch": 2.27, "grad_norm": 1.9270436763763428, "learning_rate": 0.0001224235963041933, "loss": 0.3273, "step": 2125 }, { "epoch": 2.29, "grad_norm": 1.2540065050125122, "learning_rate": 0.0001179815209665956, "loss": 0.2682, "step": 2150 }, { "epoch": 2.32, "grad_norm": 0.6836864948272705, "learning_rate": 0.00011353944562899787, "loss": 0.2787, "step": 2175 }, { "epoch": 2.35, "grad_norm": 1.7758402824401855, "learning_rate": 0.00010909737029140014, "loss": 0.3069, "step": 2200 }, { "epoch": 2.35, "eval_accuracy": 0.8624, "eval_f1_macro": 0.785058711126236, "eval_f1_micro": 0.8624, "eval_loss": 0.5265922546386719, "eval_runtime": 24.117, "eval_samples_per_second": 414.644, "eval_steps_per_second": 6.51, "step": 2200 }, { "epoch": 2.37, "grad_norm": 1.3381538391113281, "learning_rate": 0.00010465529495380242, "loss": 0.2866, "step": 2225 }, { "epoch": 2.4, "grad_norm": 1.759792685508728, "learning_rate": 0.0001002132196162047, "loss": 0.3031, "step": 2250 }, { "epoch": 2.43, "grad_norm": 1.2064933776855469, "learning_rate": 9.577114427860697e-05, "loss": 0.3182, "step": 2275 }, { "epoch": 2.45, "grad_norm": 1.0604907274246216, "learning_rate": 9.132906894100924e-05, "loss": 0.3095, "step": 2300 }, { "epoch": 2.45, "eval_accuracy": 0.8629, "eval_f1_macro": 0.7846414495209991, "eval_f1_micro": 0.8629, "eval_loss": 0.5154865980148315, "eval_runtime": 24.1319, "eval_samples_per_second": 414.389, "eval_steps_per_second": 6.506, "step": 2300 }, { "epoch": 2.48, "grad_norm": 1.9324097633361816, "learning_rate": 8.688699360341151e-05, "loss": 0.3321, "step": 2325 }, { "epoch": 2.51, "grad_norm": 1.9070733785629272, "learning_rate": 8.24449182658138e-05, "loss": 0.3165, "step": 2350 }, { "epoch": 2.53, "grad_norm": 1.3031100034713745, "learning_rate": 7.800284292821607e-05, "loss": 0.2939, "step": 2375 }, { "epoch": 2.56, "grad_norm": 1.479374647140503, "learning_rate": 7.356076759061834e-05, "loss": 0.3164, "step": 2400 }, { "epoch": 2.56, "eval_accuracy": 0.8646, "eval_f1_macro": 0.7909189563960074, "eval_f1_micro": 0.8646, "eval_loss": 0.5106394290924072, "eval_runtime": 24.1589, "eval_samples_per_second": 413.927, "eval_steps_per_second": 6.499, "step": 2400 }, { "epoch": 2.59, "grad_norm": 1.6215256452560425, "learning_rate": 6.911869225302061e-05, "loss": 0.3524, "step": 2425 }, { "epoch": 2.61, "grad_norm": 1.5538333654403687, "learning_rate": 6.467661691542288e-05, "loss": 0.3354, "step": 2450 }, { "epoch": 2.64, "grad_norm": 1.515882134437561, "learning_rate": 6.023454157782516e-05, "loss": 0.2992, "step": 2475 }, { "epoch": 2.67, "grad_norm": 2.216226100921631, "learning_rate": 5.579246624022743e-05, "loss": 0.2914, "step": 2500 }, { "epoch": 2.67, "eval_accuracy": 0.8647, "eval_f1_macro": 0.7934095832580423, "eval_f1_micro": 0.8647, "eval_loss": 0.5055064558982849, "eval_runtime": 24.1523, "eval_samples_per_second": 414.039, "eval_steps_per_second": 6.5, "step": 2500 }, { "epoch": 2.69, "grad_norm": 2.8203630447387695, "learning_rate": 5.135039090262971e-05, "loss": 0.321, "step": 2525 }, { "epoch": 2.72, "grad_norm": 2.173407793045044, "learning_rate": 4.690831556503199e-05, "loss": 0.3, "step": 2550 }, { "epoch": 2.75, "grad_norm": 1.960857629776001, "learning_rate": 4.2466240227434255e-05, "loss": 0.3181, "step": 2575 }, { "epoch": 2.77, "grad_norm": 1.7531359195709229, "learning_rate": 3.802416488983653e-05, "loss": 0.2946, "step": 2600 }, { "epoch": 2.77, "eval_accuracy": 0.8643, "eval_f1_macro": 0.7917093314024903, "eval_f1_micro": 0.8643, "eval_loss": 0.502694308757782, "eval_runtime": 24.1319, "eval_samples_per_second": 414.389, "eval_steps_per_second": 6.506, "step": 2600 }, { "epoch": 2.8, "grad_norm": 1.2817922830581665, "learning_rate": 3.3582089552238805e-05, "loss": 0.2963, "step": 2625 }, { "epoch": 2.83, "grad_norm": 2.3058297634124756, "learning_rate": 2.9140014214641083e-05, "loss": 0.33, "step": 2650 }, { "epoch": 2.85, "grad_norm": 1.1777921915054321, "learning_rate": 2.4697938877043355e-05, "loss": 0.3122, "step": 2675 }, { "epoch": 2.88, "grad_norm": 1.6172949075698853, "learning_rate": 2.025586353944563e-05, "loss": 0.3012, "step": 2700 }, { "epoch": 2.88, "eval_accuracy": 0.8671, "eval_f1_macro": 0.7953004112768677, "eval_f1_micro": 0.8671, "eval_loss": 0.500918984413147, "eval_runtime": 24.1653, "eval_samples_per_second": 413.817, "eval_steps_per_second": 6.497, "step": 2700 }, { "epoch": 2.91, "grad_norm": 1.0941059589385986, "learning_rate": 1.5813788201847902e-05, "loss": 0.2869, "step": 2725 }, { "epoch": 2.93, "grad_norm": 1.700844168663025, "learning_rate": 1.1371712864250177e-05, "loss": 0.2968, "step": 2750 }, { "epoch": 2.96, "grad_norm": 1.8403632640838623, "learning_rate": 6.929637526652452e-06, "loss": 0.272, "step": 2775 }, { "epoch": 2.99, "grad_norm": 1.8172844648361206, "learning_rate": 2.4875621890547264e-06, "loss": 0.3181, "step": 2800 }, { "epoch": 2.99, "eval_accuracy": 0.8664, "eval_f1_macro": 0.7947908970820687, "eval_f1_micro": 0.8664, "eval_loss": 0.5004217028617859, "eval_runtime": 24.1769, "eval_samples_per_second": 413.617, "eval_steps_per_second": 6.494, "step": 2800 }, { "epoch": 3.0, "step": 2814, "total_flos": 2.751004323033907e+16, "train_loss": 0.5752294131348806, "train_runtime": 2000.5798, "train_samples_per_second": 89.974, "train_steps_per_second": 1.407 } ], "logging_steps": 25, "max_steps": 2814, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 2.751004323033907e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }