{ "best_metric": 1.0969369411468506, "best_model_checkpoint": "./vit-base-melSpecImagesCREMA/checkpoint-400", "epoch": 10.0, "eval_steps": 100, "global_step": 1310, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "grad_norm": 1.503008246421814, "learning_rate": 0.0001984732824427481, "loss": 1.7751, "step": 10 }, { "epoch": 0.15, "grad_norm": 1.1629976034164429, "learning_rate": 0.0001969465648854962, "loss": 1.6558, "step": 20 }, { "epoch": 0.23, "grad_norm": 1.205520749092102, "learning_rate": 0.00019541984732824428, "loss": 1.6062, "step": 30 }, { "epoch": 0.31, "grad_norm": 1.4008492231369019, "learning_rate": 0.00019389312977099237, "loss": 1.6114, "step": 40 }, { "epoch": 0.38, "grad_norm": 1.3961032629013062, "learning_rate": 0.00019236641221374049, "loss": 1.5886, "step": 50 }, { "epoch": 0.46, "grad_norm": 1.31687331199646, "learning_rate": 0.00019083969465648857, "loss": 1.544, "step": 60 }, { "epoch": 0.53, "grad_norm": 1.3430259227752686, "learning_rate": 0.00018931297709923666, "loss": 1.493, "step": 70 }, { "epoch": 0.61, "grad_norm": 2.1518867015838623, "learning_rate": 0.00018778625954198475, "loss": 1.5494, "step": 80 }, { "epoch": 0.69, "grad_norm": 1.066909909248352, "learning_rate": 0.00018625954198473284, "loss": 1.4324, "step": 90 }, { "epoch": 0.76, "grad_norm": 1.1103651523590088, "learning_rate": 0.00018473282442748093, "loss": 1.5606, "step": 100 }, { "epoch": 0.76, "eval_accuracy": 0.40786948176583493, "eval_loss": 1.4423701763153076, "eval_runtime": 10.9463, "eval_samples_per_second": 95.192, "eval_steps_per_second": 11.967, "step": 100 }, { "epoch": 0.84, "grad_norm": 1.20200777053833, "learning_rate": 0.00018320610687022902, "loss": 1.429, "step": 110 }, { "epoch": 0.92, "grad_norm": 1.1876991987228394, "learning_rate": 0.0001816793893129771, "loss": 1.4265, "step": 120 }, { "epoch": 0.99, "grad_norm": 1.1246036291122437, "learning_rate": 0.00018015267175572518, "loss": 1.4526, "step": 130 }, { "epoch": 1.07, "grad_norm": 1.8578764200210571, "learning_rate": 0.0001786259541984733, "loss": 1.349, "step": 140 }, { "epoch": 1.15, "grad_norm": 1.8486779928207397, "learning_rate": 0.00017709923664122138, "loss": 1.3172, "step": 150 }, { "epoch": 1.22, "grad_norm": 1.775232195854187, "learning_rate": 0.00017557251908396947, "loss": 1.2735, "step": 160 }, { "epoch": 1.3, "grad_norm": 1.7306294441223145, "learning_rate": 0.00017404580152671756, "loss": 1.3665, "step": 170 }, { "epoch": 1.37, "grad_norm": 1.8813070058822632, "learning_rate": 0.00017251908396946565, "loss": 1.322, "step": 180 }, { "epoch": 1.45, "grad_norm": 1.4844669103622437, "learning_rate": 0.00017099236641221374, "loss": 1.324, "step": 190 }, { "epoch": 1.53, "grad_norm": 2.489121913909912, "learning_rate": 0.00016946564885496183, "loss": 1.2841, "step": 200 }, { "epoch": 1.53, "eval_accuracy": 0.3694817658349328, "eval_loss": 1.498081088066101, "eval_runtime": 11.2582, "eval_samples_per_second": 92.555, "eval_steps_per_second": 11.636, "step": 200 }, { "epoch": 1.6, "grad_norm": 1.9091380834579468, "learning_rate": 0.00016793893129770992, "loss": 1.2919, "step": 210 }, { "epoch": 1.68, "grad_norm": 1.8453233242034912, "learning_rate": 0.00016641221374045804, "loss": 1.2998, "step": 220 }, { "epoch": 1.76, "grad_norm": 2.699350595474243, "learning_rate": 0.00016488549618320613, "loss": 1.3153, "step": 230 }, { "epoch": 1.83, "grad_norm": 2.409592628479004, "learning_rate": 0.00016335877862595422, "loss": 1.1943, "step": 240 }, { "epoch": 1.91, "grad_norm": 1.59845769405365, "learning_rate": 0.0001618320610687023, "loss": 1.2302, "step": 250 }, { "epoch": 1.98, "grad_norm": 2.2508938312530518, "learning_rate": 0.0001603053435114504, "loss": 1.4177, "step": 260 }, { "epoch": 2.06, "grad_norm": 1.6891452074050903, "learning_rate": 0.00015877862595419848, "loss": 1.0828, "step": 270 }, { "epoch": 2.14, "grad_norm": 3.317949056625366, "learning_rate": 0.00015725190839694657, "loss": 1.1312, "step": 280 }, { "epoch": 2.21, "grad_norm": 3.0603318214416504, "learning_rate": 0.00015572519083969466, "loss": 1.0713, "step": 290 }, { "epoch": 2.29, "grad_norm": 2.0383269786834717, "learning_rate": 0.00015419847328244275, "loss": 1.0159, "step": 300 }, { "epoch": 2.29, "eval_accuracy": 0.5518234165067178, "eval_loss": 1.1692744493484497, "eval_runtime": 11.2045, "eval_samples_per_second": 92.998, "eval_steps_per_second": 11.692, "step": 300 }, { "epoch": 2.37, "grad_norm": 1.9367152452468872, "learning_rate": 0.00015267175572519084, "loss": 0.9986, "step": 310 }, { "epoch": 2.44, "grad_norm": 2.3003132343292236, "learning_rate": 0.00015114503816793893, "loss": 1.0508, "step": 320 }, { "epoch": 2.52, "grad_norm": 3.84384822845459, "learning_rate": 0.00014961832061068702, "loss": 1.0623, "step": 330 }, { "epoch": 2.6, "grad_norm": 1.7760237455368042, "learning_rate": 0.0001480916030534351, "loss": 1.0425, "step": 340 }, { "epoch": 2.67, "grad_norm": 2.6222035884857178, "learning_rate": 0.0001465648854961832, "loss": 1.0366, "step": 350 }, { "epoch": 2.75, "grad_norm": 2.4612748622894287, "learning_rate": 0.0001450381679389313, "loss": 1.0451, "step": 360 }, { "epoch": 2.82, "grad_norm": 2.3277204036712646, "learning_rate": 0.00014351145038167938, "loss": 1.1176, "step": 370 }, { "epoch": 2.9, "grad_norm": 2.187209129333496, "learning_rate": 0.00014198473282442747, "loss": 0.9908, "step": 380 }, { "epoch": 2.98, "grad_norm": 3.229308843612671, "learning_rate": 0.00014045801526717556, "loss": 1.0181, "step": 390 }, { "epoch": 3.05, "grad_norm": 1.847447156906128, "learning_rate": 0.00013893129770992368, "loss": 0.9868, "step": 400 }, { "epoch": 3.05, "eval_accuracy": 0.5930902111324377, "eval_loss": 1.0969369411468506, "eval_runtime": 11.1277, "eval_samples_per_second": 93.64, "eval_steps_per_second": 11.772, "step": 400 }, { "epoch": 3.13, "grad_norm": 2.6871042251586914, "learning_rate": 0.00013740458015267177, "loss": 0.8034, "step": 410 }, { "epoch": 3.21, "grad_norm": 3.8769664764404297, "learning_rate": 0.00013587786259541986, "loss": 0.8141, "step": 420 }, { "epoch": 3.28, "grad_norm": 2.8785126209259033, "learning_rate": 0.00013435114503816795, "loss": 0.7343, "step": 430 }, { "epoch": 3.36, "grad_norm": 2.4167301654815674, "learning_rate": 0.00013282442748091604, "loss": 0.8251, "step": 440 }, { "epoch": 3.44, "grad_norm": 2.921082019805908, "learning_rate": 0.00013129770992366413, "loss": 0.7511, "step": 450 }, { "epoch": 3.51, "grad_norm": 2.847332000732422, "learning_rate": 0.00012977099236641222, "loss": 0.7392, "step": 460 }, { "epoch": 3.59, "grad_norm": 3.5761501789093018, "learning_rate": 0.0001282442748091603, "loss": 0.6463, "step": 470 }, { "epoch": 3.66, "grad_norm": 3.0666024684906006, "learning_rate": 0.0001267175572519084, "loss": 0.7027, "step": 480 }, { "epoch": 3.74, "grad_norm": 1.9951330423355103, "learning_rate": 0.0001251908396946565, "loss": 0.7666, "step": 490 }, { "epoch": 3.82, "grad_norm": 3.5129895210266113, "learning_rate": 0.0001236641221374046, "loss": 0.8477, "step": 500 }, { "epoch": 3.82, "eval_accuracy": 0.5796545105566219, "eval_loss": 1.1718865633010864, "eval_runtime": 11.143, "eval_samples_per_second": 93.511, "eval_steps_per_second": 11.756, "step": 500 }, { "epoch": 3.89, "grad_norm": 2.490349054336548, "learning_rate": 0.0001221374045801527, "loss": 0.6279, "step": 510 }, { "epoch": 3.97, "grad_norm": 4.751981258392334, "learning_rate": 0.00012061068702290077, "loss": 0.6861, "step": 520 }, { "epoch": 4.05, "grad_norm": 1.843367338180542, "learning_rate": 0.00011908396946564886, "loss": 0.6348, "step": 530 }, { "epoch": 4.12, "grad_norm": 3.823432445526123, "learning_rate": 0.00011755725190839695, "loss": 0.4487, "step": 540 }, { "epoch": 4.2, "grad_norm": 1.805336833000183, "learning_rate": 0.00011603053435114504, "loss": 0.4393, "step": 550 }, { "epoch": 4.27, "grad_norm": 2.604633331298828, "learning_rate": 0.00011450381679389313, "loss": 0.4098, "step": 560 }, { "epoch": 4.35, "grad_norm": 3.3479952812194824, "learning_rate": 0.00011297709923664124, "loss": 0.5057, "step": 570 }, { "epoch": 4.43, "grad_norm": 2.961186408996582, "learning_rate": 0.00011145038167938933, "loss": 0.5446, "step": 580 }, { "epoch": 4.5, "grad_norm": 4.528743267059326, "learning_rate": 0.00010992366412213742, "loss": 0.5032, "step": 590 }, { "epoch": 4.58, "grad_norm": 3.593770742416382, "learning_rate": 0.0001083969465648855, "loss": 0.5495, "step": 600 }, { "epoch": 4.58, "eval_accuracy": 0.5806142034548945, "eval_loss": 1.234790563583374, "eval_runtime": 11.1508, "eval_samples_per_second": 93.446, "eval_steps_per_second": 11.748, "step": 600 }, { "epoch": 4.66, "grad_norm": 2.7948455810546875, "learning_rate": 0.00010687022900763359, "loss": 0.519, "step": 610 }, { "epoch": 4.73, "grad_norm": 5.143344879150391, "learning_rate": 0.00010534351145038168, "loss": 0.5493, "step": 620 }, { "epoch": 4.81, "grad_norm": 4.292489528656006, "learning_rate": 0.00010381679389312977, "loss": 0.5947, "step": 630 }, { "epoch": 4.89, "grad_norm": 3.135993480682373, "learning_rate": 0.00010229007633587786, "loss": 0.4873, "step": 640 }, { "epoch": 4.96, "grad_norm": 2.5363755226135254, "learning_rate": 0.00010076335877862595, "loss": 0.4813, "step": 650 }, { "epoch": 5.04, "grad_norm": 2.4878172874450684, "learning_rate": 9.923664122137405e-05, "loss": 0.4249, "step": 660 }, { "epoch": 5.11, "grad_norm": 2.6882851123809814, "learning_rate": 9.770992366412214e-05, "loss": 0.2691, "step": 670 }, { "epoch": 5.19, "grad_norm": 2.534364700317383, "learning_rate": 9.618320610687024e-05, "loss": 0.3055, "step": 680 }, { "epoch": 5.27, "grad_norm": 4.340401649475098, "learning_rate": 9.465648854961833e-05, "loss": 0.2444, "step": 690 }, { "epoch": 5.34, "grad_norm": 5.553411483764648, "learning_rate": 9.312977099236642e-05, "loss": 0.2671, "step": 700 }, { "epoch": 5.34, "eval_accuracy": 0.5854126679462572, "eval_loss": 1.3457223176956177, "eval_runtime": 11.154, "eval_samples_per_second": 93.419, "eval_steps_per_second": 11.745, "step": 700 }, { "epoch": 5.42, "grad_norm": 2.335097551345825, "learning_rate": 9.160305343511451e-05, "loss": 0.3027, "step": 710 }, { "epoch": 5.5, "grad_norm": 6.312303066253662, "learning_rate": 9.007633587786259e-05, "loss": 0.3205, "step": 720 }, { "epoch": 5.57, "grad_norm": 7.102092266082764, "learning_rate": 8.854961832061069e-05, "loss": 0.2796, "step": 730 }, { "epoch": 5.65, "grad_norm": 4.718852996826172, "learning_rate": 8.702290076335878e-05, "loss": 0.3254, "step": 740 }, { "epoch": 5.73, "grad_norm": 5.368557453155518, "learning_rate": 8.549618320610687e-05, "loss": 0.329, "step": 750 }, { "epoch": 5.8, "grad_norm": 1.7768850326538086, "learning_rate": 8.396946564885496e-05, "loss": 0.2239, "step": 760 }, { "epoch": 5.88, "grad_norm": 1.6067969799041748, "learning_rate": 8.244274809160306e-05, "loss": 0.2289, "step": 770 }, { "epoch": 5.95, "grad_norm": 0.8688230514526367, "learning_rate": 8.091603053435115e-05, "loss": 0.2715, "step": 780 }, { "epoch": 6.03, "grad_norm": 2.5685417652130127, "learning_rate": 7.938931297709924e-05, "loss": 0.2098, "step": 790 }, { "epoch": 6.11, "grad_norm": 1.9992344379425049, "learning_rate": 7.786259541984733e-05, "loss": 0.1388, "step": 800 }, { "epoch": 6.11, "eval_accuracy": 0.5786948176583493, "eval_loss": 1.389073371887207, "eval_runtime": 11.1211, "eval_samples_per_second": 93.696, "eval_steps_per_second": 11.779, "step": 800 }, { "epoch": 6.18, "grad_norm": 2.335876703262329, "learning_rate": 7.633587786259542e-05, "loss": 0.1762, "step": 810 }, { "epoch": 6.26, "grad_norm": 4.345489025115967, "learning_rate": 7.480916030534351e-05, "loss": 0.1829, "step": 820 }, { "epoch": 6.34, "grad_norm": 1.5818490982055664, "learning_rate": 7.32824427480916e-05, "loss": 0.2543, "step": 830 }, { "epoch": 6.41, "grad_norm": 2.3892974853515625, "learning_rate": 7.175572519083969e-05, "loss": 0.2035, "step": 840 }, { "epoch": 6.49, "grad_norm": 0.9355500936508179, "learning_rate": 7.022900763358778e-05, "loss": 0.2287, "step": 850 }, { "epoch": 6.56, "grad_norm": 3.3484275341033936, "learning_rate": 6.870229007633588e-05, "loss": 0.1433, "step": 860 }, { "epoch": 6.64, "grad_norm": 1.1796901226043701, "learning_rate": 6.717557251908397e-05, "loss": 0.1943, "step": 870 }, { "epoch": 6.72, "grad_norm": 3.2419047355651855, "learning_rate": 6.564885496183206e-05, "loss": 0.2079, "step": 880 }, { "epoch": 6.79, "grad_norm": 5.841382026672363, "learning_rate": 6.412213740458015e-05, "loss": 0.2108, "step": 890 }, { "epoch": 6.87, "grad_norm": 1.1594499349594116, "learning_rate": 6.259541984732826e-05, "loss": 0.1548, "step": 900 }, { "epoch": 6.87, "eval_accuracy": 0.5978886756238004, "eval_loss": 1.4216477870941162, "eval_runtime": 11.0828, "eval_samples_per_second": 94.019, "eval_steps_per_second": 11.82, "step": 900 }, { "epoch": 6.95, "grad_norm": 1.131686806678772, "learning_rate": 6.106870229007635e-05, "loss": 0.1264, "step": 910 }, { "epoch": 7.02, "grad_norm": 0.8231382966041565, "learning_rate": 5.954198473282443e-05, "loss": 0.1726, "step": 920 }, { "epoch": 7.1, "grad_norm": 0.48267778754234314, "learning_rate": 5.801526717557252e-05, "loss": 0.1198, "step": 930 }, { "epoch": 7.18, "grad_norm": 2.5278027057647705, "learning_rate": 5.648854961832062e-05, "loss": 0.1702, "step": 940 }, { "epoch": 7.25, "grad_norm": 2.183692455291748, "learning_rate": 5.496183206106871e-05, "loss": 0.0854, "step": 950 }, { "epoch": 7.33, "grad_norm": 0.40628916025161743, "learning_rate": 5.3435114503816794e-05, "loss": 0.1078, "step": 960 }, { "epoch": 7.4, "grad_norm": 0.19366604089736938, "learning_rate": 5.1908396946564884e-05, "loss": 0.0979, "step": 970 }, { "epoch": 7.48, "grad_norm": 0.4136432707309723, "learning_rate": 5.038167938931297e-05, "loss": 0.0954, "step": 980 }, { "epoch": 7.56, "grad_norm": 0.17071671783924103, "learning_rate": 4.885496183206107e-05, "loss": 0.0432, "step": 990 }, { "epoch": 7.63, "grad_norm": 0.9517059326171875, "learning_rate": 4.7328244274809166e-05, "loss": 0.0906, "step": 1000 }, { "epoch": 7.63, "eval_accuracy": 0.564299424184261, "eval_loss": 1.640116810798645, "eval_runtime": 11.0958, "eval_samples_per_second": 93.91, "eval_steps_per_second": 11.806, "step": 1000 }, { "epoch": 7.71, "grad_norm": 0.1622392237186432, "learning_rate": 4.5801526717557256e-05, "loss": 0.1284, "step": 1010 }, { "epoch": 7.79, "grad_norm": 6.343606948852539, "learning_rate": 4.4274809160305345e-05, "loss": 0.0985, "step": 1020 }, { "epoch": 7.86, "grad_norm": 0.7874533534049988, "learning_rate": 4.2748091603053435e-05, "loss": 0.1168, "step": 1030 }, { "epoch": 7.94, "grad_norm": 0.31581443548202515, "learning_rate": 4.122137404580153e-05, "loss": 0.1668, "step": 1040 }, { "epoch": 8.02, "grad_norm": 0.3773091435432434, "learning_rate": 3.969465648854962e-05, "loss": 0.1027, "step": 1050 }, { "epoch": 8.09, "grad_norm": 0.09075737744569778, "learning_rate": 3.816793893129771e-05, "loss": 0.0301, "step": 1060 }, { "epoch": 8.17, "grad_norm": 0.1646617352962494, "learning_rate": 3.66412213740458e-05, "loss": 0.077, "step": 1070 }, { "epoch": 8.24, "grad_norm": 1.9873558282852173, "learning_rate": 3.511450381679389e-05, "loss": 0.0443, "step": 1080 }, { "epoch": 8.32, "grad_norm": 0.13392330706119537, "learning_rate": 3.358778625954199e-05, "loss": 0.0823, "step": 1090 }, { "epoch": 8.4, "grad_norm": 1.0865620374679565, "learning_rate": 3.2061068702290076e-05, "loss": 0.1047, "step": 1100 }, { "epoch": 8.4, "eval_accuracy": 0.5873320537428023, "eval_loss": 1.6780017614364624, "eval_runtime": 11.1335, "eval_samples_per_second": 93.591, "eval_steps_per_second": 11.766, "step": 1100 }, { "epoch": 8.47, "grad_norm": 0.10025681555271149, "learning_rate": 3.053435114503817e-05, "loss": 0.0592, "step": 1110 }, { "epoch": 8.55, "grad_norm": 0.14586412906646729, "learning_rate": 2.900763358778626e-05, "loss": 0.0393, "step": 1120 }, { "epoch": 8.63, "grad_norm": 0.5182372331619263, "learning_rate": 2.7480916030534355e-05, "loss": 0.0802, "step": 1130 }, { "epoch": 8.7, "grad_norm": 0.3386911153793335, "learning_rate": 2.5954198473282442e-05, "loss": 0.0748, "step": 1140 }, { "epoch": 8.78, "grad_norm": 0.10998167097568512, "learning_rate": 2.4427480916030535e-05, "loss": 0.0366, "step": 1150 }, { "epoch": 8.85, "grad_norm": 0.12017246335744858, "learning_rate": 2.2900763358778628e-05, "loss": 0.0461, "step": 1160 }, { "epoch": 8.93, "grad_norm": 0.09363219887018204, "learning_rate": 2.1374045801526718e-05, "loss": 0.056, "step": 1170 }, { "epoch": 9.01, "grad_norm": 0.08617076277732849, "learning_rate": 1.984732824427481e-05, "loss": 0.0407, "step": 1180 }, { "epoch": 9.08, "grad_norm": 0.07992846518754959, "learning_rate": 1.83206106870229e-05, "loss": 0.0225, "step": 1190 }, { "epoch": 9.16, "grad_norm": 0.5971510410308838, "learning_rate": 1.6793893129770993e-05, "loss": 0.0583, "step": 1200 }, { "epoch": 9.16, "eval_accuracy": 0.5767754318618042, "eval_loss": 1.6794880628585815, "eval_runtime": 11.1041, "eval_samples_per_second": 93.839, "eval_steps_per_second": 11.797, "step": 1200 }, { "epoch": 9.24, "grad_norm": 0.06741204857826233, "learning_rate": 1.5267175572519086e-05, "loss": 0.0221, "step": 1210 }, { "epoch": 9.31, "grad_norm": 0.08024097234010696, "learning_rate": 1.3740458015267178e-05, "loss": 0.0357, "step": 1220 }, { "epoch": 9.39, "grad_norm": 0.17311328649520874, "learning_rate": 1.2213740458015267e-05, "loss": 0.0336, "step": 1230 }, { "epoch": 9.47, "grad_norm": 0.09014473855495453, "learning_rate": 1.0687022900763359e-05, "loss": 0.0514, "step": 1240 }, { "epoch": 9.54, "grad_norm": 0.07394669950008392, "learning_rate": 9.16030534351145e-06, "loss": 0.0534, "step": 1250 }, { "epoch": 9.62, "grad_norm": 0.11295180022716522, "learning_rate": 7.633587786259543e-06, "loss": 0.0254, "step": 1260 }, { "epoch": 9.69, "grad_norm": 1.9013243913650513, "learning_rate": 6.106870229007634e-06, "loss": 0.0919, "step": 1270 }, { "epoch": 9.77, "grad_norm": 0.08759523928165436, "learning_rate": 4.580152671755725e-06, "loss": 0.0476, "step": 1280 }, { "epoch": 9.85, "grad_norm": 0.07936747372150421, "learning_rate": 3.053435114503817e-06, "loss": 0.0365, "step": 1290 }, { "epoch": 9.92, "grad_norm": 0.13975408673286438, "learning_rate": 1.5267175572519084e-06, "loss": 0.0228, "step": 1300 }, { "epoch": 9.92, "eval_accuracy": 0.5882917466410749, "eval_loss": 1.6925907135009766, "eval_runtime": 11.1309, "eval_samples_per_second": 93.613, "eval_steps_per_second": 11.769, "step": 1300 }, { "epoch": 10.0, "grad_norm": 0.3255839943885803, "learning_rate": 0.0, "loss": 0.0364, "step": 1310 }, { "epoch": 10.0, "step": 1310, "total_flos": 3.229206972532531e+18, "train_loss": 0.583979975267221, "train_runtime": 954.0647, "train_samples_per_second": 43.676, "train_steps_per_second": 1.373 } ], "logging_steps": 10, "max_steps": 1310, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "total_flos": 3.229206972532531e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }