|
{ |
|
"best_metric": 0.5248246192932129, |
|
"best_model_checkpoint": "./vit-beta2-0.99/checkpoint-5778", |
|
"epoch": 28.0, |
|
"eval_steps": 500, |
|
"global_step": 8988, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 27.36143684387207, |
|
"learning_rate": 1.8291979226774382e-05, |
|
"loss": 1.7217, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.7035367545076283, |
|
"eval_f1": 0.6527006206875058, |
|
"eval_loss": 1.0190271139144897, |
|
"eval_precision": 0.6635371340134015, |
|
"eval_recall": 0.7035367545076283, |
|
"eval_runtime": 22.7678, |
|
"eval_samples_per_second": 126.67, |
|
"eval_steps_per_second": 15.856, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 32.75101089477539, |
|
"learning_rate": 3.681477207155222e-05, |
|
"loss": 1.1622, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.705617198335645, |
|
"eval_f1": 0.7191659456414141, |
|
"eval_loss": 0.7385604977607727, |
|
"eval_precision": 0.7575809849286596, |
|
"eval_recall": 0.705617198335645, |
|
"eval_runtime": 23.0256, |
|
"eval_samples_per_second": 125.252, |
|
"eval_steps_per_second": 15.678, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 7.780852317810059, |
|
"learning_rate": 5.5337564916330066e-05, |
|
"loss": 1.0368, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.7517337031900139, |
|
"eval_f1": 0.7214294760102591, |
|
"eval_loss": 0.655035138130188, |
|
"eval_precision": 0.7563510144231254, |
|
"eval_recall": 0.7517337031900139, |
|
"eval_runtime": 22.8211, |
|
"eval_samples_per_second": 126.374, |
|
"eval_steps_per_second": 15.819, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 30.053781509399414, |
|
"learning_rate": 7.386035776110792e-05, |
|
"loss": 0.9653, |
|
"step": 1284 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.7843273231622746, |
|
"eval_f1": 0.7863050211732793, |
|
"eval_loss": 0.5640625953674316, |
|
"eval_precision": 0.7948170600442105, |
|
"eval_recall": 0.7843273231622746, |
|
"eval_runtime": 22.4916, |
|
"eval_samples_per_second": 128.226, |
|
"eval_steps_per_second": 16.05, |
|
"step": 1284 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 10.72180461883545, |
|
"learning_rate": 9.238315060588575e-05, |
|
"loss": 0.9272, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.6768377253814147, |
|
"eval_f1": 0.703514416314774, |
|
"eval_loss": 0.7956904172897339, |
|
"eval_precision": 0.7959127541379581, |
|
"eval_recall": 0.6768377253814147, |
|
"eval_runtime": 22.4157, |
|
"eval_samples_per_second": 128.66, |
|
"eval_steps_per_second": 16.105, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 9.665838241577148, |
|
"learning_rate": 9.984715255878176e-05, |
|
"loss": 0.9878, |
|
"step": 1926 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.7871012482662968, |
|
"eval_f1": 0.7903728309886872, |
|
"eval_loss": 0.580937922000885, |
|
"eval_precision": 0.8061861183645468, |
|
"eval_recall": 0.7871012482662968, |
|
"eval_runtime": 22.8489, |
|
"eval_samples_per_second": 126.221, |
|
"eval_steps_per_second": 15.799, |
|
"step": 1926 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 7.413944244384766, |
|
"learning_rate": 9.889061131437471e-05, |
|
"loss": 0.872, |
|
"step": 2247 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.7215672676837726, |
|
"eval_f1": 0.7441866963098906, |
|
"eval_loss": 0.681545078754425, |
|
"eval_precision": 0.8081398492651465, |
|
"eval_recall": 0.7215672676837726, |
|
"eval_runtime": 22.5223, |
|
"eval_samples_per_second": 128.051, |
|
"eval_steps_per_second": 16.029, |
|
"step": 2247 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 3.1963610649108887, |
|
"learning_rate": 9.707265436104638e-05, |
|
"loss": 0.7998, |
|
"step": 2568 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.7558945908460472, |
|
"eval_f1": 0.7722745826002211, |
|
"eval_loss": 0.6104105114936829, |
|
"eval_precision": 0.8143413614427701, |
|
"eval_recall": 0.7558945908460472, |
|
"eval_runtime": 22.1997, |
|
"eval_samples_per_second": 129.912, |
|
"eval_steps_per_second": 16.261, |
|
"step": 2568 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 5.708278656005859, |
|
"learning_rate": 9.44253127296151e-05, |
|
"loss": 0.733, |
|
"step": 2889 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.8148404993065187, |
|
"eval_f1": 0.817195928987888, |
|
"eval_loss": 0.5296399593353271, |
|
"eval_precision": 0.8254366092776788, |
|
"eval_recall": 0.8148404993065187, |
|
"eval_runtime": 22.2658, |
|
"eval_samples_per_second": 129.526, |
|
"eval_steps_per_second": 16.213, |
|
"step": 2889 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 8.459081649780273, |
|
"learning_rate": 9.099523058358976e-05, |
|
"loss": 0.6957, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.7995839112343966, |
|
"eval_f1": 0.8052016823215509, |
|
"eval_loss": 0.579708456993103, |
|
"eval_precision": 0.8322303167903897, |
|
"eval_recall": 0.7995839112343966, |
|
"eval_runtime": 22.874, |
|
"eval_samples_per_second": 126.082, |
|
"eval_steps_per_second": 15.782, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"grad_norm": 23.16131591796875, |
|
"learning_rate": 8.684284338417735e-05, |
|
"loss": 0.6271, |
|
"step": 3531 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.7933425797503467, |
|
"eval_f1": 0.8057886771155975, |
|
"eval_loss": 0.5925618410110474, |
|
"eval_precision": 0.8342721256554735, |
|
"eval_recall": 0.7933425797503467, |
|
"eval_runtime": 22.4086, |
|
"eval_samples_per_second": 128.7, |
|
"eval_steps_per_second": 16.11, |
|
"step": 3531 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 13.86120319366455, |
|
"learning_rate": 8.204131306302357e-05, |
|
"loss": 0.5614, |
|
"step": 3852 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.7919556171983356, |
|
"eval_f1": 0.8060480658501534, |
|
"eval_loss": 0.58785080909729, |
|
"eval_precision": 0.8384030380760077, |
|
"eval_recall": 0.7919556171983356, |
|
"eval_runtime": 22.7711, |
|
"eval_samples_per_second": 126.652, |
|
"eval_steps_per_second": 15.853, |
|
"step": 3852 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"grad_norm": 14.25642204284668, |
|
"learning_rate": 7.667523896413962e-05, |
|
"loss": 0.4576, |
|
"step": 4173 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.8138002773925104, |
|
"eval_f1": 0.802756356097153, |
|
"eval_loss": 0.6664562225341797, |
|
"eval_precision": 0.8312195418319966, |
|
"eval_recall": 0.8138002773925104, |
|
"eval_runtime": 22.7626, |
|
"eval_samples_per_second": 126.699, |
|
"eval_steps_per_second": 15.859, |
|
"step": 4173 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"grad_norm": 2.4907939434051514, |
|
"learning_rate": 7.083916726724684e-05, |
|
"loss": 0.4645, |
|
"step": 4494 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.8294036061026352, |
|
"eval_f1": 0.8329373166180268, |
|
"eval_loss": 0.5514557957649231, |
|
"eval_precision": 0.8470000675264909, |
|
"eval_recall": 0.8294036061026352, |
|
"eval_runtime": 22.6903, |
|
"eval_samples_per_second": 127.103, |
|
"eval_steps_per_second": 15.91, |
|
"step": 4494 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 6.062983512878418, |
|
"learning_rate": 6.463592515537568e-05, |
|
"loss": 0.3913, |
|
"step": 4815 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.8224687933425797, |
|
"eval_f1": 0.8288240804348163, |
|
"eval_loss": 0.5473943948745728, |
|
"eval_precision": 0.8465516026472886, |
|
"eval_recall": 0.8224687933425797, |
|
"eval_runtime": 22.9153, |
|
"eval_samples_per_second": 125.855, |
|
"eval_steps_per_second": 15.754, |
|
"step": 4815 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 8.832720756530762, |
|
"learning_rate": 5.8174809077430184e-05, |
|
"loss": 0.3693, |
|
"step": 5136 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.823509015256588, |
|
"eval_f1": 0.8308490392951902, |
|
"eval_loss": 0.5769267082214355, |
|
"eval_precision": 0.8463977879369676, |
|
"eval_recall": 0.823509015256588, |
|
"eval_runtime": 22.4364, |
|
"eval_samples_per_second": 128.541, |
|
"eval_steps_per_second": 16.09, |
|
"step": 5136 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"grad_norm": 3.554053783416748, |
|
"learning_rate": 5.156965902716534e-05, |
|
"loss": 0.2794, |
|
"step": 5457 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.8509015256588072, |
|
"eval_f1": 0.851625286103631, |
|
"eval_loss": 0.5327965617179871, |
|
"eval_precision": 0.8571269483137886, |
|
"eval_recall": 0.8509015256588072, |
|
"eval_runtime": 22.5058, |
|
"eval_samples_per_second": 128.145, |
|
"eval_steps_per_second": 16.04, |
|
"step": 5457 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"grad_norm": 12.536526679992676, |
|
"learning_rate": 4.493685276832998e-05, |
|
"loss": 0.2677, |
|
"step": 5778 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.8623439667128987, |
|
"eval_f1": 0.8596257980493167, |
|
"eval_loss": 0.5248246192932129, |
|
"eval_precision": 0.8583558881059454, |
|
"eval_recall": 0.8623439667128987, |
|
"eval_runtime": 23.0138, |
|
"eval_samples_per_second": 125.316, |
|
"eval_steps_per_second": 15.686, |
|
"step": 5778 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"grad_norm": 0.8805328011512756, |
|
"learning_rate": 3.839325534621579e-05, |
|
"loss": 0.2104, |
|
"step": 6099 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.8432732316227461, |
|
"eval_f1": 0.8473236873027821, |
|
"eval_loss": 0.6283748745918274, |
|
"eval_precision": 0.8572165883311328, |
|
"eval_recall": 0.8432732316227461, |
|
"eval_runtime": 22.0808, |
|
"eval_samples_per_second": 130.611, |
|
"eval_steps_per_second": 16.349, |
|
"step": 6099 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 0.04168611392378807, |
|
"learning_rate": 3.2073473743477955e-05, |
|
"loss": 0.2459, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.8543689320388349, |
|
"eval_f1": 0.8554941368452752, |
|
"eval_loss": 0.6137195229530334, |
|
"eval_precision": 0.8595513589850039, |
|
"eval_recall": 0.8543689320388349, |
|
"eval_runtime": 22.5224, |
|
"eval_samples_per_second": 128.05, |
|
"eval_steps_per_second": 16.028, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"grad_norm": 3.362337350845337, |
|
"learning_rate": 2.604941738980618e-05, |
|
"loss": 0.1769, |
|
"step": 6741 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_accuracy": 0.8637309292649098, |
|
"eval_f1": 0.8566147037629545, |
|
"eval_loss": 0.5959635376930237, |
|
"eval_precision": 0.8572769111277326, |
|
"eval_recall": 0.8637309292649098, |
|
"eval_runtime": 22.6796, |
|
"eval_samples_per_second": 127.162, |
|
"eval_steps_per_second": 15.917, |
|
"step": 6741 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"grad_norm": 32.62522506713867, |
|
"learning_rate": 2.0447352243517255e-05, |
|
"loss": 0.1294, |
|
"step": 7062 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_accuracy": 0.8699722607489597, |
|
"eval_f1": 0.868659902198456, |
|
"eval_loss": 0.5843542814254761, |
|
"eval_precision": 0.8687454531579298, |
|
"eval_recall": 0.8699722607489597, |
|
"eval_runtime": 22.7408, |
|
"eval_samples_per_second": 126.82, |
|
"eval_steps_per_second": 15.875, |
|
"step": 7062 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"grad_norm": 4.362905502319336, |
|
"learning_rate": 1.536598246865575e-05, |
|
"loss": 0.1597, |
|
"step": 7383 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_accuracy": 0.866504854368932, |
|
"eval_f1": 0.8589306843328246, |
|
"eval_loss": 0.6580309867858887, |
|
"eval_precision": 0.8603887819342586, |
|
"eval_recall": 0.866504854368932, |
|
"eval_runtime": 22.8065, |
|
"eval_samples_per_second": 126.455, |
|
"eval_steps_per_second": 15.829, |
|
"step": 7383 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"grad_norm": 1.3405486345291138, |
|
"learning_rate": 1.0894837969414489e-05, |
|
"loss": 0.1227, |
|
"step": 7704 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.8730929264909847, |
|
"eval_f1": 0.8712364081669894, |
|
"eval_loss": 0.6225888729095459, |
|
"eval_precision": 0.8720005451749013, |
|
"eval_recall": 0.8730929264909847, |
|
"eval_runtime": 22.4336, |
|
"eval_samples_per_second": 128.557, |
|
"eval_steps_per_second": 16.092, |
|
"step": 7704 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"grad_norm": 0.08982106298208237, |
|
"learning_rate": 7.112696940726155e-06, |
|
"loss": 0.1054, |
|
"step": 8025 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_accuracy": 0.8751733703190014, |
|
"eval_f1": 0.8720900906443607, |
|
"eval_loss": 0.6197877526283264, |
|
"eval_precision": 0.8727615417504975, |
|
"eval_recall": 0.8751733703190014, |
|
"eval_runtime": 22.3517, |
|
"eval_samples_per_second": 129.028, |
|
"eval_steps_per_second": 16.151, |
|
"step": 8025 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"grad_norm": 16.338762283325195, |
|
"learning_rate": 4.094394131694684e-06, |
|
"loss": 0.0945, |
|
"step": 8346 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_accuracy": 0.8793342579750347, |
|
"eval_f1": 0.8764130000059128, |
|
"eval_loss": 0.6049804091453552, |
|
"eval_precision": 0.8757375120219936, |
|
"eval_recall": 0.8793342579750347, |
|
"eval_runtime": 22.9615, |
|
"eval_samples_per_second": 125.601, |
|
"eval_steps_per_second": 15.722, |
|
"step": 8346 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"grad_norm": 0.07402696460485458, |
|
"learning_rate": 1.8742732027931087e-06, |
|
"loss": 0.1242, |
|
"step": 8667 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_accuracy": 0.8828016643550625, |
|
"eval_f1": 0.8797538930980352, |
|
"eval_loss": 0.6077802181243896, |
|
"eval_precision": 0.8788497875866979, |
|
"eval_recall": 0.8828016643550625, |
|
"eval_runtime": 22.9003, |
|
"eval_samples_per_second": 125.937, |
|
"eval_steps_per_second": 15.764, |
|
"step": 8667 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"grad_norm": 0.06073630228638649, |
|
"learning_rate": 5.020912943263345e-07, |
|
"loss": 0.0819, |
|
"step": 8988 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_accuracy": 0.8796809986130375, |
|
"eval_f1": 0.8755693874564652, |
|
"eval_loss": 0.6189650893211365, |
|
"eval_precision": 0.8748178599277232, |
|
"eval_recall": 0.8796809986130375, |
|
"eval_runtime": 23.1664, |
|
"eval_samples_per_second": 124.49, |
|
"eval_steps_per_second": 15.583, |
|
"step": 8988 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"step": 8988, |
|
"total_flos": 1.1127108458244538e+19, |
|
"train_loss": 0.5275238134301817, |
|
"train_runtime": 3329.2548, |
|
"train_samples_per_second": 154.028, |
|
"train_steps_per_second": 9.642 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 32100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 100, |
|
"save_steps": 500, |
|
"total_flos": 1.1127108458244538e+19, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|