{ "best_metric": 0.2901732623577118, "best_model_checkpoint": "./cconvnext-tiny-15ep-1e-4/checkpoint-8250", "epoch": 15.0, "eval_steps": 500, "global_step": 8250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.18, "grad_norm": 17.5977840423584, "learning_rate": 9.996375239002369e-05, "loss": 1.9015, "step": 100 }, { "epoch": 0.36, "grad_norm": 19.579683303833008, "learning_rate": 9.985506211566388e-05, "loss": 0.9353, "step": 200 }, { "epoch": 0.55, "grad_norm": 14.599044799804688, "learning_rate": 9.967408676742751e-05, "loss": 0.7327, "step": 300 }, { "epoch": 0.73, "grad_norm": 13.404082298278809, "learning_rate": 9.942108874226811e-05, "loss": 0.6524, "step": 400 }, { "epoch": 0.91, "grad_norm": 15.84705924987793, "learning_rate": 9.909643486313533e-05, "loss": 0.5838, "step": 500 }, { "epoch": 1.0, "eval_accuracy": 0.8811133200795228, "eval_loss": 0.40972331166267395, "eval_runtime": 70.3463, "eval_samples_per_second": 35.752, "eval_steps_per_second": 1.123, "step": 550 }, { "epoch": 1.09, "grad_norm": 23.304401397705078, "learning_rate": 9.870059584711668e-05, "loss": 0.5633, "step": 600 }, { "epoch": 1.27, "grad_norm": 8.285646438598633, "learning_rate": 9.82341456229428e-05, "loss": 0.4562, "step": 700 }, { "epoch": 1.45, "grad_norm": 17.91335678100586, "learning_rate": 9.769776049884563e-05, "loss": 0.444, "step": 800 }, { "epoch": 1.64, "grad_norm": 8.382722854614258, "learning_rate": 9.709221818197624e-05, "loss": 0.4898, "step": 900 }, { "epoch": 1.82, "grad_norm": 10.226556777954102, "learning_rate": 9.641839665080363e-05, "loss": 0.4433, "step": 1000 }, { "epoch": 2.0, "grad_norm": 19.516849517822266, "learning_rate": 9.567727288213005e-05, "loss": 0.4565, "step": 1100 }, { "epoch": 2.0, "eval_accuracy": 0.8763419483101391, "eval_loss": 0.42688119411468506, "eval_runtime": 71.6777, "eval_samples_per_second": 35.088, "eval_steps_per_second": 1.102, "step": 1100 }, { "epoch": 2.18, "grad_norm": 13.549880027770996, "learning_rate": 9.486992143456792e-05, "loss": 0.3639, "step": 1200 }, { "epoch": 2.36, "grad_norm": 8.744523048400879, "learning_rate": 9.399751289053267e-05, "loss": 0.3795, "step": 1300 }, { "epoch": 2.55, "grad_norm": 5.8690056800842285, "learning_rate": 9.306131215901003e-05, "loss": 0.3665, "step": 1400 }, { "epoch": 2.73, "grad_norm": 8.52180290222168, "learning_rate": 9.206267664155907e-05, "loss": 0.3741, "step": 1500 }, { "epoch": 2.91, "grad_norm": 3.9724719524383545, "learning_rate": 9.100305426420956e-05, "loss": 0.3628, "step": 1600 }, { "epoch": 3.0, "eval_accuracy": 0.9001988071570577, "eval_loss": 0.3464488685131073, "eval_runtime": 72.3767, "eval_samples_per_second": 34.749, "eval_steps_per_second": 1.092, "step": 1650 }, { "epoch": 3.09, "grad_norm": 10.699442863464355, "learning_rate": 8.988398137810777e-05, "loss": 0.3206, "step": 1700 }, { "epoch": 3.27, "grad_norm": 7.718528747558594, "learning_rate": 8.870708053195413e-05, "loss": 0.3157, "step": 1800 }, { "epoch": 3.45, "grad_norm": 12.929231643676758, "learning_rate": 8.74740581194627e-05, "loss": 0.2883, "step": 1900 }, { "epoch": 3.64, "grad_norm": 9.246111869812012, "learning_rate": 8.618670190525352e-05, "loss": 0.3361, "step": 2000 }, { "epoch": 3.82, "grad_norm": 8.94174861907959, "learning_rate": 8.484687843276469e-05, "loss": 0.3051, "step": 2100 }, { "epoch": 4.0, "grad_norm": 13.777535438537598, "learning_rate": 8.345653031794292e-05, "loss": 0.2915, "step": 2200 }, { "epoch": 4.0, "eval_accuracy": 0.9065606361829026, "eval_loss": 0.33660224080085754, "eval_runtime": 72.2343, "eval_samples_per_second": 34.817, "eval_steps_per_second": 1.094, "step": 2200 }, { "epoch": 4.18, "grad_norm": 10.229923248291016, "learning_rate": 8.201767343263612e-05, "loss": 0.2636, "step": 2300 }, { "epoch": 4.36, "grad_norm": 6.8911027908325195, "learning_rate": 8.053239398177191e-05, "loss": 0.2446, "step": 2400 }, { "epoch": 4.55, "grad_norm": 14.081520080566406, "learning_rate": 7.900284547855991e-05, "loss": 0.2407, "step": 2500 }, { "epoch": 4.73, "grad_norm": 15.959187507629395, "learning_rate": 7.74312456221035e-05, "loss": 0.2645, "step": 2600 }, { "epoch": 4.91, "grad_norm": 8.710769653320312, "learning_rate": 7.58198730819481e-05, "loss": 0.2655, "step": 2700 }, { "epoch": 5.0, "eval_accuracy": 0.9053677932405566, "eval_loss": 0.3387199640274048, "eval_runtime": 71.8817, "eval_samples_per_second": 34.988, "eval_steps_per_second": 1.099, "step": 2750 }, { "epoch": 5.09, "grad_norm": 8.720322608947754, "learning_rate": 7.417106419422819e-05, "loss": 0.2436, "step": 2800 }, { "epoch": 5.27, "grad_norm": 14.12549114227295, "learning_rate": 7.24872095742033e-05, "loss": 0.2238, "step": 2900 }, { "epoch": 5.45, "grad_norm": 8.974920272827148, "learning_rate": 7.077075065009433e-05, "loss": 0.2007, "step": 3000 }, { "epoch": 5.64, "grad_norm": 3.3389041423797607, "learning_rate": 6.902417612324615e-05, "loss": 0.2043, "step": 3100 }, { "epoch": 5.82, "grad_norm": 5.468075275421143, "learning_rate": 6.725001835974853e-05, "loss": 0.2399, "step": 3200 }, { "epoch": 6.0, "grad_norm": 19.46337890625, "learning_rate": 6.545084971874738e-05, "loss": 0.2395, "step": 3300 }, { "epoch": 6.0, "eval_accuracy": 0.9125248508946322, "eval_loss": 0.33130431175231934, "eval_runtime": 70.0346, "eval_samples_per_second": 35.911, "eval_steps_per_second": 1.128, "step": 3300 }, { "epoch": 6.18, "grad_norm": 5.854804039001465, "learning_rate": 6.36292788227699e-05, "loss": 0.1906, "step": 3400 }, { "epoch": 6.36, "grad_norm": 11.46949577331543, "learning_rate": 6.178794677547137e-05, "loss": 0.1857, "step": 3500 }, { "epoch": 6.55, "grad_norm": 7.80468225479126, "learning_rate": 5.992952333228728e-05, "loss": 0.1863, "step": 3600 }, { "epoch": 6.73, "grad_norm": 6.652174472808838, "learning_rate": 5.805670302954321e-05, "loss": 0.1743, "step": 3700 }, { "epoch": 6.91, "grad_norm": 9.390819549560547, "learning_rate": 5.617220127763474e-05, "loss": 0.2065, "step": 3800 }, { "epoch": 7.0, "eval_accuracy": 0.9180914512922466, "eval_loss": 0.3119599223136902, "eval_runtime": 74.2948, "eval_samples_per_second": 33.852, "eval_steps_per_second": 1.063, "step": 3850 }, { "epoch": 7.09, "grad_norm": 7.0801544189453125, "learning_rate": 5.427875042394199e-05, "loss": 0.1487, "step": 3900 }, { "epoch": 7.27, "grad_norm": 6.336465358734131, "learning_rate": 5.2379095791187124e-05, "loss": 0.1712, "step": 4000 }, { "epoch": 7.45, "grad_norm": 6.033371448516846, "learning_rate": 5.047599169697884e-05, "loss": 0.1643, "step": 4100 }, { "epoch": 7.64, "grad_norm": 13.526623725891113, "learning_rate": 4.85721974603152e-05, "loss": 0.1716, "step": 4200 }, { "epoch": 7.82, "grad_norm": 13.018455505371094, "learning_rate": 4.667047340083481e-05, "loss": 0.159, "step": 4300 }, { "epoch": 8.0, "grad_norm": 19.489803314208984, "learning_rate": 4.477357683661734e-05, "loss": 0.1503, "step": 4400 }, { "epoch": 8.0, "eval_accuracy": 0.9220675944333996, "eval_loss": 0.30650895833969116, "eval_runtime": 70.0541, "eval_samples_per_second": 35.901, "eval_steps_per_second": 1.128, "step": 4400 }, { "epoch": 8.18, "grad_norm": 12.573568344116211, "learning_rate": 4.288425808633575e-05, "loss": 0.1518, "step": 4500 }, { "epoch": 8.36, "grad_norm": 13.930220603942871, "learning_rate": 4.100525648155731e-05, "loss": 0.1289, "step": 4600 }, { "epoch": 8.55, "grad_norm": 5.025359153747559, "learning_rate": 3.913929639497462e-05, "loss": 0.1243, "step": 4700 }, { "epoch": 8.73, "grad_norm": 10.068098068237305, "learning_rate": 3.728908329032567e-05, "loss": 0.1484, "step": 4800 }, { "epoch": 8.91, "grad_norm": 7.712088584899902, "learning_rate": 3.545729979973005e-05, "loss": 0.1503, "step": 4900 }, { "epoch": 9.0, "eval_accuracy": 0.927634194831014, "eval_loss": 0.29477769136428833, "eval_runtime": 72.6614, "eval_samples_per_second": 34.613, "eval_steps_per_second": 1.087, "step": 4950 }, { "epoch": 9.09, "grad_norm": 9.499425888061523, "learning_rate": 3.364660183412892e-05, "loss": 0.117, "step": 5000 }, { "epoch": 9.27, "grad_norm": 3.1358752250671387, "learning_rate": 3.1859614732467954e-05, "loss": 0.118, "step": 5100 }, { "epoch": 9.45, "grad_norm": 8.783591270446777, "learning_rate": 3.0098929455206904e-05, "loss": 0.1064, "step": 5200 }, { "epoch": 9.64, "grad_norm": 4.272777557373047, "learning_rate": 2.8367098827674578e-05, "loss": 0.1294, "step": 5300 }, { "epoch": 9.82, "grad_norm": 5.206827163696289, "learning_rate": 2.6666633838716314e-05, "loss": 0.1452, "step": 5400 }, { "epoch": 10.0, "grad_norm": 27.840124130249023, "learning_rate": 2.500000000000001e-05, "loss": 0.1125, "step": 5500 }, { "epoch": 10.0, "eval_accuracy": 0.9304174950298211, "eval_loss": 0.2917528450489044, "eval_runtime": 71.1465, "eval_samples_per_second": 35.35, "eval_steps_per_second": 1.11, "step": 5500 }, { "epoch": 10.18, "grad_norm": 10.059708595275879, "learning_rate": 2.336961377126001e-05, "loss": 0.1115, "step": 5600 }, { "epoch": 10.36, "grad_norm": 3.956040143966675, "learning_rate": 2.1777839056661554e-05, "loss": 0.099, "step": 5700 }, { "epoch": 10.55, "grad_norm": 4.310999870300293, "learning_rate": 2.0226983777365604e-05, "loss": 0.1056, "step": 5800 }, { "epoch": 10.73, "grad_norm": 10.636781692504883, "learning_rate": 1.8719296525263922e-05, "loss": 0.0998, "step": 5900 }, { "epoch": 10.91, "grad_norm": 8.48064136505127, "learning_rate": 1.725696330273575e-05, "loss": 0.1057, "step": 6000 }, { "epoch": 11.0, "eval_accuracy": 0.932803180914513, "eval_loss": 0.2953931987285614, "eval_runtime": 71.8417, "eval_samples_per_second": 35.008, "eval_steps_per_second": 1.1, "step": 6050 }, { "epoch": 11.09, "grad_norm": 1.377661943435669, "learning_rate": 1.5842104353153287e-05, "loss": 0.0873, "step": 6100 }, { "epoch": 11.27, "grad_norm": 2.6207616329193115, "learning_rate": 1.4476771086731567e-05, "loss": 0.0913, "step": 6200 }, { "epoch": 11.45, "grad_norm": 8.88305950164795, "learning_rate": 1.3162943106179749e-05, "loss": 0.0874, "step": 6300 }, { "epoch": 11.64, "grad_norm": 7.829620361328125, "learning_rate": 1.1902525336466464e-05, "loss": 0.0901, "step": 6400 }, { "epoch": 11.82, "grad_norm": 1.778537392616272, "learning_rate": 1.0697345262860636e-05, "loss": 0.0922, "step": 6500 }, { "epoch": 12.0, "grad_norm": 3.688204526901245, "learning_rate": 9.549150281252633e-06, "loss": 0.0937, "step": 6600 }, { "epoch": 12.0, "eval_accuracy": 0.9335984095427435, "eval_loss": 0.2958705723285675, "eval_runtime": 71.9587, "eval_samples_per_second": 34.951, "eval_steps_per_second": 1.098, "step": 6600 }, { "epoch": 12.18, "grad_norm": 5.87827730178833, "learning_rate": 8.459605164597267e-06, "loss": 0.0973, "step": 6700 }, { "epoch": 12.36, "grad_norm": 2.3431482315063477, "learning_rate": 7.430289649152156e-06, "loss": 0.0857, "step": 6800 }, { "epoch": 12.55, "grad_norm": 1.375058650970459, "learning_rate": 6.462696144011149e-06, "loss": 0.0912, "step": 6900 }, { "epoch": 12.73, "grad_norm": 5.9749979972839355, "learning_rate": 5.558227567253832e-06, "loss": 0.073, "step": 7000 }, { "epoch": 12.91, "grad_norm": 8.656426429748535, "learning_rate": 4.7181953118484556e-06, "loss": 0.0966, "step": 7100 }, { "epoch": 13.0, "eval_accuracy": 0.9351888667992048, "eval_loss": 0.29395705461502075, "eval_runtime": 71.5553, "eval_samples_per_second": 35.148, "eval_steps_per_second": 1.104, "step": 7150 }, { "epoch": 13.09, "grad_norm": 6.880538463592529, "learning_rate": 3.9438173442575e-06, "loss": 0.0836, "step": 7200 }, { "epoch": 13.27, "grad_norm": 3.5548784732818604, "learning_rate": 3.2362164385026706e-06, "loss": 0.1008, "step": 7300 }, { "epoch": 13.45, "grad_norm": 6.498635292053223, "learning_rate": 2.596418548250029e-06, "loss": 0.0736, "step": 7400 }, { "epoch": 13.64, "grad_norm": 2.343573808670044, "learning_rate": 2.0253513192751373e-06, "loss": 0.0719, "step": 7500 }, { "epoch": 13.82, "grad_norm": 8.872209548950195, "learning_rate": 1.523842744465437e-06, "loss": 0.0879, "step": 7600 }, { "epoch": 14.0, "grad_norm": 3.882363796234131, "learning_rate": 1.0926199633097157e-06, "loss": 0.0735, "step": 7700 }, { "epoch": 14.0, "eval_accuracy": 0.9339960238568589, "eval_loss": 0.2915794849395752, "eval_runtime": 69.9779, "eval_samples_per_second": 35.94, "eval_steps_per_second": 1.129, "step": 7700 }, { "epoch": 14.18, "grad_norm": 1.7305363416671753, "learning_rate": 7.323082076153509e-07, "loss": 0.0824, "step": 7800 }, { "epoch": 14.36, "grad_norm": 10.12863826751709, "learning_rate": 4.434298949819449e-07, "loss": 0.0825, "step": 7900 }, { "epoch": 14.55, "grad_norm": 0.9927307367324829, "learning_rate": 2.2640387134577058e-07, "loss": 0.0861, "step": 8000 }, { "epoch": 14.73, "grad_norm": 7.997195720672607, "learning_rate": 8.15448036932176e-08, "loss": 0.0824, "step": 8100 }, { "epoch": 14.91, "grad_norm": 7.195863246917725, "learning_rate": 9.06272382371065e-09, "loss": 0.0881, "step": 8200 }, { "epoch": 15.0, "eval_accuracy": 0.9355864811133201, "eval_loss": 0.2901732623577118, "eval_runtime": 71.1315, "eval_samples_per_second": 35.357, "eval_steps_per_second": 1.111, "step": 8250 }, { "epoch": 15.0, "step": 8250, "total_flos": 1.952142618502398e+19, "train_loss": 0.23986065309697932, "train_runtime": 12740.4964, "train_samples_per_second": 20.699, "train_steps_per_second": 0.648 } ], "logging_steps": 100, "max_steps": 8250, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 500, "total_flos": 1.952142618502398e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }