{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 9871,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010130685847431872,
"grad_norm": 7.154277324676514,
"learning_rate": 2.532928064842959e-07,
"loss": 6.7612,
"step": 100
},
{
"epoch": 0.020261371694863743,
"grad_norm": 7.9137983322143555,
"learning_rate": 5.065856129685918e-07,
"loss": 6.7189,
"step": 200
},
{
"epoch": 0.030392057542295615,
"grad_norm": 7.134788990020752,
"learning_rate": 7.598784194528875e-07,
"loss": 6.6275,
"step": 300
},
{
"epoch": 0.040522743389727486,
"grad_norm": 5.295099258422852,
"learning_rate": 1.0131712259371835e-06,
"loss": 6.2129,
"step": 400
},
{
"epoch": 0.050653429237159354,
"grad_norm": 4.518360614776611,
"learning_rate": 1.2664640324214794e-06,
"loss": 5.6904,
"step": 500
},
{
"epoch": 0.06078411508459123,
"grad_norm": 2.872621774673462,
"learning_rate": 1.519756838905775e-06,
"loss": 5.1702,
"step": 600
},
{
"epoch": 0.0709148009320231,
"grad_norm": 4.534188270568848,
"learning_rate": 1.7730496453900712e-06,
"loss": 4.8866,
"step": 700
},
{
"epoch": 0.08104548677945497,
"grad_norm": 1.8225231170654297,
"learning_rate": 2.026342451874367e-06,
"loss": 4.7456,
"step": 800
},
{
"epoch": 0.09117617262688683,
"grad_norm": 1.6859124898910522,
"learning_rate": 2.279635258358663e-06,
"loss": 4.5977,
"step": 900
},
{
"epoch": 0.10130685847431871,
"grad_norm": 1.448459506034851,
"learning_rate": 2.532928064842959e-06,
"loss": 4.5081,
"step": 1000
},
{
"epoch": 0.11143754432175058,
"grad_norm": 7.997034549713135,
"learning_rate": 2.7862208713272543e-06,
"loss": 4.4333,
"step": 1100
},
{
"epoch": 0.12156823016918246,
"grad_norm": 10.502978324890137,
"learning_rate": 3.03951367781155e-06,
"loss": 4.3472,
"step": 1200
},
{
"epoch": 0.13169891601661432,
"grad_norm": 1.39786958694458,
"learning_rate": 3.2928064842958464e-06,
"loss": 4.2736,
"step": 1300
},
{
"epoch": 0.1418296018640462,
"grad_norm": 1.8371859788894653,
"learning_rate": 3.5460992907801423e-06,
"loss": 4.1658,
"step": 1400
},
{
"epoch": 0.15196028771147807,
"grad_norm": 1.7692155838012695,
"learning_rate": 3.799392097264438e-06,
"loss": 4.121,
"step": 1500
},
{
"epoch": 0.16209097355890995,
"grad_norm": 2.3330609798431396,
"learning_rate": 4.052684903748734e-06,
"loss": 3.9781,
"step": 1600
},
{
"epoch": 0.17222165940634182,
"grad_norm": 9.345723152160645,
"learning_rate": 4.3059777102330295e-06,
"loss": 3.8582,
"step": 1700
},
{
"epoch": 0.18235234525377367,
"grad_norm": 4.135860919952393,
"learning_rate": 4.559270516717326e-06,
"loss": 3.6802,
"step": 1800
},
{
"epoch": 0.19248303110120554,
"grad_norm": 3.1482667922973633,
"learning_rate": 4.812563323201621e-06,
"loss": 3.4976,
"step": 1900
},
{
"epoch": 0.20261371694863742,
"grad_norm": 3.647608995437622,
"learning_rate": 5.065856129685918e-06,
"loss": 3.3249,
"step": 2000
},
{
"epoch": 0.2127444027960693,
"grad_norm": 4.777210235595703,
"learning_rate": 5.319148936170213e-06,
"loss": 3.1602,
"step": 2100
},
{
"epoch": 0.22287508864350117,
"grad_norm": 3.177557945251465,
"learning_rate": 5.5724417426545085e-06,
"loss": 2.9832,
"step": 2200
},
{
"epoch": 0.23300577449093304,
"grad_norm": 2.4862775802612305,
"learning_rate": 5.825734549138805e-06,
"loss": 2.86,
"step": 2300
},
{
"epoch": 0.24313646033836492,
"grad_norm": 2.4592087268829346,
"learning_rate": 6.0790273556231e-06,
"loss": 2.76,
"step": 2400
},
{
"epoch": 0.25326714618579677,
"grad_norm": 4.776676654815674,
"learning_rate": 6.3323201621073974e-06,
"loss": 2.7536,
"step": 2500
},
{
"epoch": 0.26339783203322864,
"grad_norm": 3.8928306102752686,
"learning_rate": 6.585612968591693e-06,
"loss": 2.6681,
"step": 2600
},
{
"epoch": 0.2735285178806605,
"grad_norm": 3.821058511734009,
"learning_rate": 6.838905775075988e-06,
"loss": 2.6202,
"step": 2700
},
{
"epoch": 0.2836592037280924,
"grad_norm": 2.162001609802246,
"learning_rate": 7.092198581560285e-06,
"loss": 2.5764,
"step": 2800
},
{
"epoch": 0.29378988957552427,
"grad_norm": 1.753183364868164,
"learning_rate": 7.34549138804458e-06,
"loss": 2.5782,
"step": 2900
},
{
"epoch": 0.30392057542295614,
"grad_norm": 1.6950722932815552,
"learning_rate": 7.598784194528876e-06,
"loss": 2.5569,
"step": 3000
},
{
"epoch": 0.314051261270388,
"grad_norm": 1.6093318462371826,
"learning_rate": 7.852077001013173e-06,
"loss": 2.498,
"step": 3100
},
{
"epoch": 0.3241819471178199,
"grad_norm": 1.3843328952789307,
"learning_rate": 8.105369807497468e-06,
"loss": 2.4841,
"step": 3200
},
{
"epoch": 0.33431263296525177,
"grad_norm": 2.4331817626953125,
"learning_rate": 8.358662613981764e-06,
"loss": 2.4389,
"step": 3300
},
{
"epoch": 0.34444331881268364,
"grad_norm": 1.6857964992523193,
"learning_rate": 8.611955420466059e-06,
"loss": 2.4589,
"step": 3400
},
{
"epoch": 0.3545740046601155,
"grad_norm": 1.323832631111145,
"learning_rate": 8.865248226950355e-06,
"loss": 2.3911,
"step": 3500
},
{
"epoch": 0.36470469050754734,
"grad_norm": 1.2381794452667236,
"learning_rate": 9.118541033434652e-06,
"loss": 2.4117,
"step": 3600
},
{
"epoch": 0.3748353763549792,
"grad_norm": 1.2798967361450195,
"learning_rate": 9.371833839918947e-06,
"loss": 2.3574,
"step": 3700
},
{
"epoch": 0.3849660622024111,
"grad_norm": 1.5534555912017822,
"learning_rate": 9.625126646403243e-06,
"loss": 2.3676,
"step": 3800
},
{
"epoch": 0.39509674804984296,
"grad_norm": 2.2549705505371094,
"learning_rate": 9.878419452887538e-06,
"loss": 2.3417,
"step": 3900
},
{
"epoch": 0.40522743389727484,
"grad_norm": 1.787307620048523,
"learning_rate": 1.0131712259371835e-05,
"loss": 2.3361,
"step": 4000
},
{
"epoch": 0.4153581197447067,
"grad_norm": 1.382817029953003,
"learning_rate": 1.038500506585613e-05,
"loss": 2.321,
"step": 4100
},
{
"epoch": 0.4254888055921386,
"grad_norm": 2.0577666759490967,
"learning_rate": 1.0638297872340426e-05,
"loss": 2.316,
"step": 4200
},
{
"epoch": 0.43561949143957046,
"grad_norm": 1.5490138530731201,
"learning_rate": 1.0891590678824722e-05,
"loss": 2.2916,
"step": 4300
},
{
"epoch": 0.44575017728700234,
"grad_norm": 1.516607642173767,
"learning_rate": 1.1144883485309017e-05,
"loss": 2.2755,
"step": 4400
},
{
"epoch": 0.4558808631344342,
"grad_norm": 1.2727792263031006,
"learning_rate": 1.1398176291793314e-05,
"loss": 2.2622,
"step": 4500
},
{
"epoch": 0.4660115489818661,
"grad_norm": 2.546395778656006,
"learning_rate": 1.165146909827761e-05,
"loss": 2.2575,
"step": 4600
},
{
"epoch": 0.47614223482929796,
"grad_norm": 20.72391128540039,
"learning_rate": 1.1904761904761905e-05,
"loss": 2.2558,
"step": 4700
},
{
"epoch": 0.48627292067672984,
"grad_norm": 2.8161871433258057,
"learning_rate": 1.21580547112462e-05,
"loss": 2.2573,
"step": 4800
},
{
"epoch": 0.4964036065241617,
"grad_norm": 2.429856300354004,
"learning_rate": 1.2411347517730496e-05,
"loss": 2.2347,
"step": 4900
},
{
"epoch": 0.5065342923715935,
"grad_norm": 1.5232312679290771,
"learning_rate": 1.2664640324214795e-05,
"loss": 2.2388,
"step": 5000
},
{
"epoch": 0.5166649782190255,
"grad_norm": 1.2436437606811523,
"learning_rate": 1.291793313069909e-05,
"loss": 2.2174,
"step": 5100
},
{
"epoch": 0.5267956640664573,
"grad_norm": 1.0977882146835327,
"learning_rate": 1.3171225937183386e-05,
"loss": 2.2086,
"step": 5200
},
{
"epoch": 0.5369263499138892,
"grad_norm": 1.1018918752670288,
"learning_rate": 1.3424518743667681e-05,
"loss": 2.2273,
"step": 5300
},
{
"epoch": 0.547057035761321,
"grad_norm": 1.9143567085266113,
"learning_rate": 1.3677811550151977e-05,
"loss": 2.217,
"step": 5400
},
{
"epoch": 0.557187721608753,
"grad_norm": 1.243256688117981,
"learning_rate": 1.3931104356636274e-05,
"loss": 2.1944,
"step": 5500
},
{
"epoch": 0.5673184074561848,
"grad_norm": 1.2262539863586426,
"learning_rate": 1.418439716312057e-05,
"loss": 2.199,
"step": 5600
},
{
"epoch": 0.5774490933036166,
"grad_norm": 1.6061089038848877,
"learning_rate": 1.4437689969604865e-05,
"loss": 2.1886,
"step": 5700
},
{
"epoch": 0.5875797791510485,
"grad_norm": 1.6153846979141235,
"learning_rate": 1.469098277608916e-05,
"loss": 2.2086,
"step": 5800
},
{
"epoch": 0.5977104649984804,
"grad_norm": 1.023937702178955,
"learning_rate": 1.4944275582573456e-05,
"loss": 2.1795,
"step": 5900
},
{
"epoch": 0.6078411508459123,
"grad_norm": 1.7739474773406982,
"learning_rate": 1.5197568389057753e-05,
"loss": 2.198,
"step": 6000
},
{
"epoch": 0.6179718366933441,
"grad_norm": 1.8380703926086426,
"learning_rate": 1.545086119554205e-05,
"loss": 2.1816,
"step": 6100
},
{
"epoch": 0.628102522540776,
"grad_norm": 1.173989176750183,
"learning_rate": 1.5704154002026345e-05,
"loss": 2.1804,
"step": 6200
},
{
"epoch": 0.6382332083882079,
"grad_norm": 1.0197932720184326,
"learning_rate": 1.595744680851064e-05,
"loss": 2.1791,
"step": 6300
},
{
"epoch": 0.6483638942356398,
"grad_norm": 1.3487520217895508,
"learning_rate": 1.6210739614994936e-05,
"loss": 2.1597,
"step": 6400
},
{
"epoch": 0.6584945800830716,
"grad_norm": 1.2801405191421509,
"learning_rate": 1.6464032421479232e-05,
"loss": 2.1587,
"step": 6500
},
{
"epoch": 0.6686252659305035,
"grad_norm": 1.4226802587509155,
"learning_rate": 1.6717325227963527e-05,
"loss": 2.1462,
"step": 6600
},
{
"epoch": 0.6787559517779354,
"grad_norm": 6.232682228088379,
"learning_rate": 1.6970618034447823e-05,
"loss": 2.1429,
"step": 6700
},
{
"epoch": 0.6888866376253673,
"grad_norm": 1.2026145458221436,
"learning_rate": 1.7223910840932118e-05,
"loss": 2.1549,
"step": 6800
},
{
"epoch": 0.6990173234727991,
"grad_norm": 1.0961929559707642,
"learning_rate": 1.7477203647416414e-05,
"loss": 2.138,
"step": 6900
},
{
"epoch": 0.709148009320231,
"grad_norm": 1.0841082334518433,
"learning_rate": 1.773049645390071e-05,
"loss": 2.1235,
"step": 7000
},
{
"epoch": 0.7192786951676629,
"grad_norm": 6.760639667510986,
"learning_rate": 1.7983789260385008e-05,
"loss": 2.1259,
"step": 7100
},
{
"epoch": 0.7294093810150947,
"grad_norm": 1.358530044555664,
"learning_rate": 1.8237082066869303e-05,
"loss": 2.1259,
"step": 7200
},
{
"epoch": 0.7395400668625266,
"grad_norm": 0.9958035349845886,
"learning_rate": 1.84903748733536e-05,
"loss": 2.1364,
"step": 7300
},
{
"epoch": 0.7496707527099584,
"grad_norm": 1.4952771663665771,
"learning_rate": 1.8743667679837894e-05,
"loss": 2.1037,
"step": 7400
},
{
"epoch": 0.7598014385573904,
"grad_norm": 1.4222711324691772,
"learning_rate": 1.899696048632219e-05,
"loss": 2.1181,
"step": 7500
},
{
"epoch": 0.7699321244048222,
"grad_norm": 0.9418506622314453,
"learning_rate": 1.9250253292806485e-05,
"loss": 2.1146,
"step": 7600
},
{
"epoch": 0.7800628102522541,
"grad_norm": 1.6261628866195679,
"learning_rate": 1.950354609929078e-05,
"loss": 2.1089,
"step": 7700
},
{
"epoch": 0.7901934960996859,
"grad_norm": 2.184122085571289,
"learning_rate": 1.9756838905775076e-05,
"loss": 2.1036,
"step": 7800
},
{
"epoch": 0.8003241819471179,
"grad_norm": 3.901762008666992,
"learning_rate": 1.999979757969855e-05,
"loss": 2.1042,
"step": 7900
},
{
"epoch": 0.8104548677945497,
"grad_norm": 4.365869522094727,
"learning_rate": 1.9863475200884386e-05,
"loss": 2.1132,
"step": 8000
},
{
"epoch": 0.8205855536419816,
"grad_norm": 2.3704495429992676,
"learning_rate": 1.9478106818608973e-05,
"loss": 2.0985,
"step": 8100
},
{
"epoch": 0.8307162394894134,
"grad_norm": 4.145130634307861,
"learning_rate": 1.8853422720981873e-05,
"loss": 2.1131,
"step": 8200
},
{
"epoch": 0.8408469253368454,
"grad_norm": 1.318753719329834,
"learning_rate": 1.800519575426379e-05,
"loss": 2.1137,
"step": 8300
},
{
"epoch": 0.8509776111842772,
"grad_norm": 1.3615083694458008,
"learning_rate": 1.6954843069285113e-05,
"loss": 2.1177,
"step": 8400
},
{
"epoch": 0.861108297031709,
"grad_norm": 1.8678381443023682,
"learning_rate": 1.5728885353034063e-05,
"loss": 2.0999,
"step": 8500
},
{
"epoch": 0.8712389828791409,
"grad_norm": 1.5799565315246582,
"learning_rate": 1.4358277199447007e-05,
"loss": 2.0918,
"step": 8600
},
{
"epoch": 0.8813696687265727,
"grad_norm": 0.9612336754798889,
"learning_rate": 1.2877625527080568e-05,
"loss": 2.1174,
"step": 8700
},
{
"epoch": 0.8915003545740047,
"grad_norm": 1.934597134590149,
"learning_rate": 1.1324315778084788e-05,
"loss": 2.1028,
"step": 8800
},
{
"epoch": 0.9016310404214365,
"grad_norm": 2.4133620262145996,
"learning_rate": 9.737567961355728e-06,
"loss": 2.0958,
"step": 8900
},
{
"epoch": 0.9117617262688684,
"grad_norm": 2.853616237640381,
"learning_rate": 8.157446374132335e-06,
"loss": 2.0875,
"step": 9000
},
{
"epoch": 0.9218924121163002,
"grad_norm": 1.1536306142807007,
"learning_rate": 6.623848005890046e-06,
"loss": 2.1093,
"step": 9100
},
{
"epoch": 0.9320230979637322,
"grad_norm": 3.1503701210021973,
"learning_rate": 5.175495166640958e-06,
"loss": 2.0838,
"step": 9200
},
{
"epoch": 0.942153783811164,
"grad_norm": 1.7781630754470825,
"learning_rate": 3.848957775087207e-06,
"loss": 2.0971,
"step": 9300
},
{
"epoch": 0.9522844696585959,
"grad_norm": 3.86759614944458,
"learning_rate": 2.6777299931828482e-06,
"loss": 2.0888,
"step": 9400
},
{
"epoch": 0.9624151555060277,
"grad_norm": 2.273083209991455,
"learning_rate": 1.6913845214503065e-06,
"loss": 2.0841,
"step": 9500
},
{
"epoch": 0.9725458413534597,
"grad_norm": 5.05561637878418,
"learning_rate": 9.148259085161204e-07,
"loss": 2.0874,
"step": 9600
},
{
"epoch": 0.9826765272008915,
"grad_norm": 5.530516147613525,
"learning_rate": 3.676617282896666e-07,
"loss": 2.0993,
"step": 9700
},
{
"epoch": 0.9928072130483234,
"grad_norm": 1.7245455980300903,
"learning_rate": 6.37075021310396e-08,
"loss": 2.0655,
"step": 9800
},
{
"epoch": 1.0,
"step": 9871,
"total_flos": 1.1780246832876093e+18,
"train_loss": 2.774943617143574,
"train_runtime": 2530.0612,
"train_samples_per_second": 31.21,
"train_steps_per_second": 3.901
}
],
"logging_steps": 100,
"max_steps": 9871,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.1780246832876093e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}