{ "best_metric": 7.342555999755859, "best_model_checkpoint": "/data1/attanasiog/babylm/roberta-tiny-8l-10M/checkpoint-700", "epoch": 17.698779704560053, "global_step": 850, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.21, "learning_rate": 8e-05, "loss": 10.2998, "step": 10 }, { "epoch": 0.41, "learning_rate": 0.00016, "loss": 8.9979, "step": 20 }, { "epoch": 0.62, "learning_rate": 0.00024, "loss": 7.8015, "step": 30 }, { "epoch": 0.82, "learning_rate": 0.00032, "loss": 7.3376, "step": 40 }, { "epoch": 1.04, "learning_rate": 0.0004, "loss": 7.8102, "step": 50 }, { "epoch": 1.04, "eval_accuracy": 0.05136765891155645, "eval_loss": 7.374657154083252, "eval_runtime": 180.8913, "eval_samples_per_second": 132.98, "eval_steps_per_second": 4.157, "step": 50 }, { "epoch": 1.25, "learning_rate": 0.000399995625676045, "loss": 7.3419, "step": 60 }, { "epoch": 1.45, "learning_rate": 0.0003999825028955268, "loss": 8.1652, "step": 70 }, { "epoch": 1.66, "learning_rate": 0.0003999606322324786, "loss": 8.4182, "step": 80 }, { "epoch": 1.86, "learning_rate": 0.0003999300146435939, "loss": 7.3249, "step": 90 }, { "epoch": 2.08, "learning_rate": 0.00039989065146818525, "loss": 7.805, "step": 100 }, { "epoch": 2.08, "eval_accuracy": 0.051684268514440884, "eval_loss": 7.369903087615967, "eval_runtime": 181.0798, "eval_samples_per_second": 132.842, "eval_steps_per_second": 4.153, "step": 100 }, { "epoch": 2.29, "learning_rate": 0.0003998425444281255, "loss": 7.3101, "step": 110 }, { "epoch": 2.49, "learning_rate": 0.00039978569562777234, "loss": 7.3232, "step": 120 }, { "epoch": 2.7, "learning_rate": 0.0003997201075538765, "loss": 7.3073, "step": 130 }, { "epoch": 2.9, "learning_rate": 0.0003996457830754729, "loss": 7.3236, "step": 140 }, { "epoch": 3.12, "learning_rate": 0.00039956272544375493, "loss": 7.7907, "step": 150 }, { "epoch": 3.12, "eval_accuracy": 0.05174263561361906, "eval_loss": 7.35952091217041, "eval_runtime": 180.7769, "eval_samples_per_second": 133.065, "eval_steps_per_second": 4.16, "step": 150 }, { "epoch": 3.33, "learning_rate": 0.00039947093829193245, "loss": 7.2981, "step": 160 }, { "epoch": 3.53, "learning_rate": 0.00039937042563507283, "loss": 7.3259, "step": 170 }, { "epoch": 3.74, "learning_rate": 0.00039926119186992537, "loss": 7.3352, "step": 180 }, { "epoch": 3.95, "learning_rate": 0.0003991432417747288, "loss": 7.3069, "step": 190 }, { "epoch": 4.16, "learning_rate": 0.0003990165805090023, "loss": 7.7838, "step": 200 }, { "epoch": 4.16, "eval_accuracy": 0.05138188801155976, "eval_loss": 7.361721992492676, "eval_runtime": 180.6907, "eval_samples_per_second": 133.128, "eval_steps_per_second": 4.162, "step": 200 }, { "epoch": 4.37, "learning_rate": 0.00039888121361332003, "loss": 7.3066, "step": 210 }, { "epoch": 4.58, "learning_rate": 0.0003987371470090686, "loss": 7.3237, "step": 220 }, { "epoch": 4.78, "learning_rate": 0.00039858438699818784, "loss": 7.3209, "step": 230 }, { "epoch": 4.99, "learning_rate": 0.0003984229402628956, "loss": 7.3024, "step": 240 }, { "epoch": 5.21, "learning_rate": 0.00039825281386539503, "loss": 7.7706, "step": 250 }, { "epoch": 5.21, "eval_accuracy": 0.05140231728427503, "eval_loss": 7.358623504638672, "eval_runtime": 180.7786, "eval_samples_per_second": 133.063, "eval_steps_per_second": 4.16, "step": 250 }, { "epoch": 5.41, "learning_rate": 0.000398074015247566, "loss": 7.3135, "step": 260 }, { "epoch": 5.62, "learning_rate": 0.0003978865522306392, "loss": 7.3003, "step": 270 }, { "epoch": 5.82, "learning_rate": 0.0003976904330148543, "loss": 7.3159, "step": 280 }, { "epoch": 6.04, "learning_rate": 0.00039748566617910113, "loss": 7.7967, "step": 290 }, { "epoch": 6.25, "learning_rate": 0.0003972722606805445, "loss": 7.2933, "step": 300 }, { "epoch": 6.25, "eval_accuracy": 0.05126180317018771, "eval_loss": 7.356584548950195, "eval_runtime": 180.7497, "eval_samples_per_second": 133.085, "eval_steps_per_second": 4.16, "step": 300 }, { "epoch": 6.45, "learning_rate": 0.00039705022585423216, "loss": 7.3163, "step": 310 }, { "epoch": 6.66, "learning_rate": 0.0003968195714126868, "loss": 7.2904, "step": 320 }, { "epoch": 6.86, "learning_rate": 0.00039658030744548075, "loss": 7.3045, "step": 330 }, { "epoch": 7.08, "learning_rate": 0.0003963324444187952, "loss": 7.7849, "step": 340 }, { "epoch": 7.29, "learning_rate": 0.0003960759931749619, "loss": 7.2932, "step": 350 }, { "epoch": 7.29, "eval_accuracy": 0.05161072401384023, "eval_loss": 7.3526611328125, "eval_runtime": 180.6553, "eval_samples_per_second": 133.154, "eval_steps_per_second": 4.163, "step": 350 }, { "epoch": 7.49, "learning_rate": 0.00039581096493198893, "loss": 7.3057, "step": 360 }, { "epoch": 7.7, "learning_rate": 0.0003955373712830703, "loss": 7.3002, "step": 370 }, { "epoch": 7.9, "learning_rate": 0.00039525522419607854, "loss": 7.3029, "step": 380 }, { "epoch": 8.12, "learning_rate": 0.0003949645360130412, "loss": 7.7765, "step": 390 }, { "epoch": 8.33, "learning_rate": 0.0003946653194496012, "loss": 7.2986, "step": 400 }, { "epoch": 8.33, "eval_accuracy": 0.051572554180051966, "eval_loss": 7.356107234954834, "eval_runtime": 180.5938, "eval_samples_per_second": 133.199, "eval_steps_per_second": 4.164, "step": 400 }, { "epoch": 8.53, "learning_rate": 0.00039435758759446025, "loss": 7.3093, "step": 410 }, { "epoch": 8.74, "learning_rate": 0.00039404135390880664, "loss": 7.294, "step": 420 }, { "epoch": 8.95, "learning_rate": 0.0003937166322257262, "loss": 7.3083, "step": 430 }, { "epoch": 9.16, "learning_rate": 0.00039338343674959745, "loss": 7.7912, "step": 440 }, { "epoch": 9.37, "learning_rate": 0.00039304178205546976, "loss": 7.289, "step": 450 }, { "epoch": 9.37, "eval_accuracy": 0.05145224079666028, "eval_loss": 7.34950590133667, "eval_runtime": 180.7201, "eval_samples_per_second": 133.106, "eval_steps_per_second": 4.161, "step": 450 }, { "epoch": 9.58, "learning_rate": 0.00039269168308842634, "loss": 7.3004, "step": 460 }, { "epoch": 9.78, "learning_rate": 0.00039233315516293006, "loss": 7.2938, "step": 470 }, { "epoch": 9.99, "learning_rate": 0.00039196621396215403, "loss": 7.2897, "step": 480 }, { "epoch": 10.21, "learning_rate": 0.000391590875537295, "loss": 7.7652, "step": 490 }, { "epoch": 10.41, "learning_rate": 0.00039120715630687155, "loss": 7.2879, "step": 500 }, { "epoch": 10.41, "eval_accuracy": 0.05138556381472711, "eval_loss": 7.3455071449279785, "eval_runtime": 180.6339, "eval_samples_per_second": 133.17, "eval_steps_per_second": 4.163, "step": 500 }, { "epoch": 10.62, "learning_rate": 0.000390815073056006, "loss": 7.2942, "step": 510 }, { "epoch": 10.82, "learning_rate": 0.00039041464293568983, "loss": 7.306, "step": 520 }, { "epoch": 11.04, "learning_rate": 0.00039000588346203374, "loss": 7.7754, "step": 530 }, { "epoch": 11.25, "learning_rate": 0.0003895888125155014, "loss": 7.2912, "step": 540 }, { "epoch": 11.45, "learning_rate": 0.00038916344834012695, "loss": 7.276, "step": 550 }, { "epoch": 11.45, "eval_accuracy": 0.05130612004196204, "eval_loss": 7.347738265991211, "eval_runtime": 180.7636, "eval_samples_per_second": 133.074, "eval_steps_per_second": 4.16, "step": 550 }, { "epoch": 11.66, "learning_rate": 0.00038872980954271757, "loss": 7.3135, "step": 560 }, { "epoch": 11.86, "learning_rate": 0.00038828791509203895, "loss": 7.2859, "step": 570 }, { "epoch": 12.08, "learning_rate": 0.00038783778431798597, "loss": 7.7845, "step": 580 }, { "epoch": 12.29, "learning_rate": 0.0003873794369107369, "loss": 7.2966, "step": 590 }, { "epoch": 12.49, "learning_rate": 0.0003869128929198922, "loss": 7.3072, "step": 600 }, { "epoch": 12.49, "eval_accuracy": 0.051627819878485845, "eval_loss": 7.344621658325195, "eval_runtime": 180.6519, "eval_samples_per_second": 133.157, "eval_steps_per_second": 4.163, "step": 600 }, { "epoch": 12.7, "learning_rate": 0.0003864381727535973, "loss": 7.3026, "step": 610 }, { "epoch": 12.9, "learning_rate": 0.00038595529717765027, "loss": 7.2966, "step": 620 }, { "epoch": 13.12, "learning_rate": 0.0003854642873145931, "loss": 7.7848, "step": 630 }, { "epoch": 13.33, "learning_rate": 0.00038496516464278776, "loss": 7.2964, "step": 640 }, { "epoch": 13.53, "learning_rate": 0.00038445795099547697, "loss": 7.2978, "step": 650 }, { "epoch": 13.53, "eval_accuracy": 0.05143096217098587, "eval_loss": 7.346319198608398, "eval_runtime": 180.763, "eval_samples_per_second": 133.075, "eval_steps_per_second": 4.16, "step": 650 }, { "epoch": 13.74, "learning_rate": 0.0003839426685598287, "loss": 7.2919, "step": 660 }, { "epoch": 13.95, "learning_rate": 0.000383419339875966, "loss": 7.3006, "step": 670 }, { "epoch": 14.16, "learning_rate": 0.00038288798783598087, "loss": 7.7738, "step": 680 }, { "epoch": 14.37, "learning_rate": 0.0003823486356829329, "loss": 7.2839, "step": 690 }, { "epoch": 14.58, "learning_rate": 0.0003818013070098325, "loss": 7.2857, "step": 700 }, { "epoch": 14.58, "eval_accuracy": 0.05146984844436126, "eval_loss": 7.342555999755859, "eval_runtime": 180.8063, "eval_samples_per_second": 133.043, "eval_steps_per_second": 4.159, "step": 700 }, { "epoch": 14.78, "learning_rate": 0.0003812460257586089, "loss": 7.2949, "step": 710 }, { "epoch": 14.99, "learning_rate": 0.000380682816219063, "loss": 7.3249, "step": 720 }, { "epoch": 15.21, "learning_rate": 0.00038011170302780446, "loss": 7.7486, "step": 730 }, { "epoch": 15.41, "learning_rate": 0.00037953271116717444, "loss": 7.2879, "step": 740 }, { "epoch": 15.62, "learning_rate": 0.0003789458659641527, "loss": 7.2868, "step": 750 }, { "epoch": 15.62, "eval_accuracy": 0.05147383671825258, "eval_loss": 7.343778610229492, "eval_runtime": 180.8254, "eval_samples_per_second": 133.029, "eval_steps_per_second": 4.159, "step": 750 }, { "epoch": 15.82, "learning_rate": 0.0003783511930892495, "loss": 7.2986, "step": 760 }, { "epoch": 16.04, "learning_rate": 0.00037774871855538275, "loss": 7.7788, "step": 770 }, { "epoch": 16.25, "learning_rate": 0.00037713846871674045, "loss": 7.2858, "step": 780 }, { "epoch": 16.45, "learning_rate": 0.0003765204702676274, "loss": 7.2937, "step": 790 }, { "epoch": 16.66, "learning_rate": 0.0003758947502412978, "loss": 7.2973, "step": 800 }, { "epoch": 16.66, "eval_accuracy": 0.051658592501666364, "eval_loss": 7.344185829162598, "eval_runtime": 180.7375, "eval_samples_per_second": 133.094, "eval_steps_per_second": 4.161, "step": 800 }, { "epoch": 16.86, "learning_rate": 0.0003752613360087727, "loss": 7.3043, "step": 810 }, { "epoch": 17.08, "learning_rate": 0.00037462025527764265, "loss": 7.7616, "step": 820 }, { "epoch": 17.29, "learning_rate": 0.00037397153609085553, "loss": 7.2869, "step": 830 }, { "epoch": 17.49, "learning_rate": 0.0003733152068254901, "loss": 7.2798, "step": 840 }, { "epoch": 17.7, "learning_rate": 0.00037265129619151483, "loss": 7.2988, "step": 850 }, { "epoch": 17.7, "eval_accuracy": 0.051239018394020945, "eval_loss": 7.343734264373779, "eval_runtime": 180.5675, "eval_samples_per_second": 133.219, "eval_steps_per_second": 4.165, "step": 850 }, { "epoch": 17.7, "step": 850, "total_flos": 1.1524171581514752e+17, "train_loss": 7.482659651812385, "train_runtime": 11122.8848, "train_samples_per_second": 223.953, "train_steps_per_second": 0.432 } ], "max_steps": 4800, "num_train_epochs": 100, "total_flos": 1.1524171581514752e+17, "trial_name": null, "trial_params": null }