Qwen2.5-7B-CoNLL-2003 / trainer_state.json
stefan-it's picture
feat: add fine-tuning artefacts
cdaf217 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 3748,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.026689797824781477,
"grad_norm": 2.147552967071533,
"learning_rate": 2.6133333333333334e-06,
"loss": 0.7325,
"step": 50
},
{
"epoch": 0.053379595649562954,
"grad_norm": 1.0603501796722412,
"learning_rate": 5.28e-06,
"loss": 0.3367,
"step": 100
},
{
"epoch": 0.08006939347434443,
"grad_norm": 0.6418544054031372,
"learning_rate": 7.946666666666666e-06,
"loss": 0.3204,
"step": 150
},
{
"epoch": 0.10675919129912591,
"grad_norm": 0.5255997776985168,
"learning_rate": 1.0613333333333334e-05,
"loss": 0.3304,
"step": 200
},
{
"epoch": 0.13344898912390737,
"grad_norm": 0.3151794970035553,
"learning_rate": 1.3280000000000002e-05,
"loss": 0.2894,
"step": 250
},
{
"epoch": 0.16013878694868885,
"grad_norm": 0.8053786754608154,
"learning_rate": 1.5946666666666668e-05,
"loss": 0.2991,
"step": 300
},
{
"epoch": 0.18682858477347034,
"grad_norm": 0.02903924137353897,
"learning_rate": 1.8613333333333334e-05,
"loss": 0.3086,
"step": 350
},
{
"epoch": 0.21351838259825182,
"grad_norm": 0.7904114723205566,
"learning_rate": 1.9997501717954778e-05,
"loss": 0.3291,
"step": 400
},
{
"epoch": 0.2402081804230333,
"grad_norm": 0.41448771953582764,
"learning_rate": 1.9976257383527736e-05,
"loss": 0.3051,
"step": 450
},
{
"epoch": 0.26689797824781475,
"grad_norm": 0.3972539007663727,
"learning_rate": 1.9933381071415265e-05,
"loss": 0.3149,
"step": 500
},
{
"epoch": 0.26689797824781475,
"eval_loss": 0.4319009780883789,
"eval_runtime": 118.7587,
"eval_samples_per_second": 29.185,
"eval_steps_per_second": 29.185,
"step": 500
},
{
"epoch": 0.29358777607259623,
"grad_norm": 0.9960196018218994,
"learning_rate": 1.9868965752296757e-05,
"loss": 0.2993,
"step": 550
},
{
"epoch": 0.3202775738973777,
"grad_norm": 0.4068482518196106,
"learning_rate": 1.978315110087108e-05,
"loss": 0.289,
"step": 600
},
{
"epoch": 0.3469673717221592,
"grad_norm": 0.16865970194339752,
"learning_rate": 1.967612319299347e-05,
"loss": 0.3031,
"step": 650
},
{
"epoch": 0.37365716954694067,
"grad_norm": 0.5249218344688416,
"learning_rate": 1.954811410219871e-05,
"loss": 0.296,
"step": 700
},
{
"epoch": 0.40034696737172215,
"grad_norm": 0.2836828827857971,
"learning_rate": 1.9399401396485418e-05,
"loss": 0.2986,
"step": 750
},
{
"epoch": 0.42703676519650363,
"grad_norm": 0.024336909875273705,
"learning_rate": 1.923030753645258e-05,
"loss": 0.2963,
"step": 800
},
{
"epoch": 0.4537265630212851,
"grad_norm": 0.15610457956790924,
"learning_rate": 1.9041199176093403e-05,
"loss": 0.2939,
"step": 850
},
{
"epoch": 0.4804163608460666,
"grad_norm": 0.3119260370731354,
"learning_rate": 1.8832486367762608e-05,
"loss": 0.3027,
"step": 900
},
{
"epoch": 0.5071061586708481,
"grad_norm": 1.0931050777435303,
"learning_rate": 1.8604621673041056e-05,
"loss": 0.3102,
"step": 950
},
{
"epoch": 0.5337959564956295,
"grad_norm": 0.16979913413524628,
"learning_rate": 1.8358099181425628e-05,
"loss": 0.2789,
"step": 1000
},
{
"epoch": 0.5337959564956295,
"eval_loss": 0.42964276671409607,
"eval_runtime": 118.8013,
"eval_samples_per_second": 29.175,
"eval_steps_per_second": 29.175,
"step": 1000
},
{
"epoch": 0.560485754320411,
"grad_norm": 0.5423979163169861,
"learning_rate": 1.809345343897229e-05,
"loss": 0.2991,
"step": 1050
},
{
"epoch": 0.5871755521451925,
"grad_norm": 0.9803158640861511,
"learning_rate": 1.7811258289215265e-05,
"loss": 0.2987,
"step": 1100
},
{
"epoch": 0.613865349969974,
"grad_norm": 0.25261151790618896,
"learning_rate": 1.7512125628875722e-05,
"loss": 0.3019,
"step": 1150
},
{
"epoch": 0.6405551477947554,
"grad_norm": 0.34850814938545227,
"learning_rate": 1.7196704081057955e-05,
"loss": 0.2888,
"step": 1200
},
{
"epoch": 0.667244945619537,
"grad_norm": 0.32236334681510925,
"learning_rate": 1.6865677588810112e-05,
"loss": 0.2895,
"step": 1250
},
{
"epoch": 0.6939347434443184,
"grad_norm": 0.32494327425956726,
"learning_rate": 1.6519763932099e-05,
"loss": 0.2983,
"step": 1300
},
{
"epoch": 0.7206245412690999,
"grad_norm": 0.022694284096360207,
"learning_rate": 1.615971317141477e-05,
"loss": 0.2883,
"step": 1350
},
{
"epoch": 0.7473143390938813,
"grad_norm": 0.02763156034052372,
"learning_rate": 1.578630602138029e-05,
"loss": 0.3111,
"step": 1400
},
{
"epoch": 0.7740041369186629,
"grad_norm": 0.4600750207901001,
"learning_rate": 1.54003521578917e-05,
"loss": 0.3019,
"step": 1450
},
{
"epoch": 0.8006939347434443,
"grad_norm": 0.024699728935956955,
"learning_rate": 1.5002688462460931e-05,
"loss": 0.3074,
"step": 1500
},
{
"epoch": 0.8006939347434443,
"eval_loss": 0.42814743518829346,
"eval_runtime": 118.8349,
"eval_samples_per_second": 29.167,
"eval_steps_per_second": 29.167,
"step": 1500
},
{
"epoch": 0.8273837325682258,
"grad_norm": 0.43051406741142273,
"learning_rate": 1.459417720756705e-05,
"loss": 0.2929,
"step": 1550
},
{
"epoch": 0.8540735303930073,
"grad_norm": 0.03149699047207832,
"learning_rate": 1.4175704186951178e-05,
"loss": 0.2858,
"step": 1600
},
{
"epoch": 0.8807633282177888,
"grad_norm": 0.30078360438346863,
"learning_rate": 1.3748176794909173e-05,
"loss": 0.296,
"step": 1650
},
{
"epoch": 0.9074531260425702,
"grad_norm": 0.2556113302707672,
"learning_rate": 1.3312522058746883e-05,
"loss": 0.2844,
"step": 1700
},
{
"epoch": 0.9341429238673518,
"grad_norm": 0.09224811941385269,
"learning_rate": 1.2869684628664158e-05,
"loss": 0.2997,
"step": 1750
},
{
"epoch": 0.9608327216921332,
"grad_norm": 0.4570249319076538,
"learning_rate": 1.2420624729426419e-05,
"loss": 0.286,
"step": 1800
},
{
"epoch": 0.9875225195169146,
"grad_norm": 0.013208149001002312,
"learning_rate": 1.1966316078265114e-05,
"loss": 0.2889,
"step": 1850
},
{
"epoch": 1.0138786948688863,
"grad_norm": 0.049777038395404816,
"learning_rate": 1.150774377352188e-05,
"loss": 0.2915,
"step": 1900
},
{
"epoch": 1.0405684926936678,
"grad_norm": 0.019075842574238777,
"learning_rate": 1.1045902158614493e-05,
"loss": 0.2997,
"step": 1950
},
{
"epoch": 1.0672582905184493,
"grad_norm": 0.3245074152946472,
"learning_rate": 1.0581792665956311e-05,
"loss": 0.2819,
"step": 2000
},
{
"epoch": 1.0672582905184493,
"eval_loss": 0.4269392490386963,
"eval_runtime": 118.8174,
"eval_samples_per_second": 29.171,
"eval_steps_per_second": 29.171,
"step": 2000
},
{
"epoch": 1.0939480883432309,
"grad_norm": 0.1012788936495781,
"learning_rate": 1.0116421645504322e-05,
"loss": 0.3026,
"step": 2050
},
{
"epoch": 1.1206378861680122,
"grad_norm": 0.005864244420081377,
"learning_rate": 9.650798182644238e-06,
"loss": 0.2996,
"step": 2100
},
{
"epoch": 1.1473276839927937,
"grad_norm": 0.20553366839885712,
"learning_rate": 9.185931910144259e-06,
"loss": 0.2947,
"step": 2150
},
{
"epoch": 1.1740174818175753,
"grad_norm": 0.1598707139492035,
"learning_rate": 8.722830818921908e-06,
"loss": 0.3082,
"step": 2200
},
{
"epoch": 1.2007072796423568,
"grad_norm": 0.013660128228366375,
"learning_rate": 8.262499072370962e-06,
"loss": 0.2808,
"step": 2250
},
{
"epoch": 1.227397077467138,
"grad_norm": 0.2775779068470001,
"learning_rate": 7.805934828987778e-06,
"loss": 0.2956,
"step": 2300
},
{
"epoch": 1.2540868752919196,
"grad_norm": 0.11179913580417633,
"learning_rate": 7.354128078018343e-06,
"loss": 0.2766,
"step": 2350
},
{
"epoch": 1.2807766731167012,
"grad_norm": 0.013183626346290112,
"learning_rate": 6.908058492819043e-06,
"loss": 0.3051,
"step": 2400
},
{
"epoch": 1.3074664709414825,
"grad_norm": 0.030972259119153023,
"learning_rate": 6.468693306585873e-06,
"loss": 0.2966,
"step": 2450
},
{
"epoch": 1.334156268766264,
"grad_norm": 0.004325371701270342,
"learning_rate": 6.036985215058232e-06,
"loss": 0.2993,
"step": 2500
},
{
"epoch": 1.334156268766264,
"eval_loss": 0.4272440969944,
"eval_runtime": 118.9153,
"eval_samples_per_second": 29.147,
"eval_steps_per_second": 29.147,
"step": 2500
},
{
"epoch": 1.3608460665910456,
"grad_norm": 0.005641893949359655,
"learning_rate": 5.613870310744911e-06,
"loss": 0.2827,
"step": 2550
},
{
"epoch": 1.387535864415827,
"grad_norm": 0.06420216709375381,
"learning_rate": 5.20026605315167e-06,
"loss": 0.2675,
"step": 2600
},
{
"epoch": 1.4142256622406086,
"grad_norm": 0.7545241117477417,
"learning_rate": 4.797069279411617e-06,
"loss": 0.3023,
"step": 2650
},
{
"epoch": 1.44091546006539,
"grad_norm": 0.01520050223916769,
"learning_rate": 4.405154259631967e-06,
"loss": 0.2743,
"step": 2700
},
{
"epoch": 1.4676052578901715,
"grad_norm": 0.11380592733621597,
"learning_rate": 4.0253708011739915e-06,
"loss": 0.2878,
"step": 2750
},
{
"epoch": 1.494295055714953,
"grad_norm": 0.013577022589743137,
"learning_rate": 3.6585424059766296e-06,
"loss": 0.2902,
"step": 2800
},
{
"epoch": 1.5209848535397343,
"grad_norm": 0.020401863381266594,
"learning_rate": 3.3054644849193495e-06,
"loss": 0.2884,
"step": 2850
},
{
"epoch": 1.5476746513645159,
"grad_norm": 0.006767068989574909,
"learning_rate": 2.966902633096178e-06,
"loss": 0.2952,
"step": 2900
},
{
"epoch": 1.5743644491892974,
"grad_norm": 0.01109298225492239,
"learning_rate": 2.643590969740637e-06,
"loss": 0.2839,
"step": 2950
},
{
"epoch": 1.601054247014079,
"grad_norm": 0.014958917163312435,
"learning_rate": 2.33623054640124e-06,
"loss": 0.2909,
"step": 3000
},
{
"epoch": 1.601054247014079,
"eval_loss": 0.42682579159736633,
"eval_runtime": 118.7962,
"eval_samples_per_second": 29.176,
"eval_steps_per_second": 29.176,
"step": 3000
},
{
"epoch": 1.6277440448388605,
"grad_norm": 0.26821330189704895,
"learning_rate": 2.0454878268191925e-06,
"loss": 0.2818,
"step": 3050
},
{
"epoch": 1.6544338426636418,
"grad_norm": 0.21184305846691132,
"learning_rate": 1.7719932418044105e-06,
"loss": 0.2839,
"step": 3100
},
{
"epoch": 1.6811236404884233,
"grad_norm": 0.1029989942908287,
"learning_rate": 1.516339822243398e-06,
"loss": 0.2828,
"step": 3150
},
{
"epoch": 1.7078134383132046,
"grad_norm": 0.03526095300912857,
"learning_rate": 1.2790819132030974e-06,
"loss": 0.2851,
"step": 3200
},
{
"epoch": 1.7345032361379862,
"grad_norm": 0.004737787880003452,
"learning_rate": 1.0607339719190002e-06,
"loss": 0.2797,
"step": 3250
},
{
"epoch": 1.7611930339627677,
"grad_norm": 0.010323897004127502,
"learning_rate": 8.617694522738518e-07,
"loss": 0.274,
"step": 3300
},
{
"epoch": 1.7878828317875493,
"grad_norm": 0.6822851896286011,
"learning_rate": 6.826197781858324e-07,
"loss": 0.2727,
"step": 3350
},
{
"epoch": 1.8145726296123308,
"grad_norm": 0.007676210254430771,
"learning_rate": 5.236734081322281e-07,
"loss": 0.2932,
"step": 3400
},
{
"epoch": 1.8412624274371123,
"grad_norm": 0.45130208134651184,
"learning_rate": 3.852749928370536e-07,
"loss": 0.2885,
"step": 3450
},
{
"epoch": 1.8679522252618936,
"grad_norm": 0.007974912412464619,
"learning_rate": 2.677246279490309e-07,
"loss": 0.2996,
"step": 3500
},
{
"epoch": 1.8679522252618936,
"eval_loss": 0.42648929357528687,
"eval_runtime": 118.9628,
"eval_samples_per_second": 29.135,
"eval_steps_per_second": 29.135,
"step": 3500
},
{
"epoch": 1.894642023086675,
"grad_norm": 0.20535898208618164,
"learning_rate": 1.7127720333040442e-07,
"loss": 0.2885,
"step": 3550
},
{
"epoch": 1.9213318209114565,
"grad_norm": 0.005906387697905302,
"learning_rate": 9.614185036752155e-08,
"loss": 0.2867,
"step": 3600
},
{
"epoch": 1.948021618736238,
"grad_norm": 0.1817421317100525,
"learning_rate": 4.248148850162892e-08,
"loss": 0.273,
"step": 3650
},
{
"epoch": 1.9747114165610196,
"grad_norm": 0.4754043221473694,
"learning_rate": 1.041247196316264e-08,
"loss": 0.3001,
"step": 3700
},
{
"epoch": 2.0,
"step": 3748,
"total_flos": 3.7463946988098355e+17,
"train_loss": 0.3005621947244748,
"train_runtime": 4284.8367,
"train_samples_per_second": 6.995,
"train_steps_per_second": 0.875
}
],
"logging_steps": 50,
"max_steps": 3748,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.7463946988098355e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}