Model save

Browse files

Files changed (4) hide show

README.md +69 -0
all_results.json +9 -0
train_results.json +9 -0
trainer_state.json +1387 -0

README.md ADDED Viewed

	@@ -0,0 +1,69 @@

+---
+license: llama3
+library_name: peft
+tags:
+- trl
+- sft
+- generated_from_trainer
+base_model: meta-llama/Meta-Llama-3-8B
+datasets:
+- generator
+model-index:
+- name: downstream_0.01p_seed42_level2_rare
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# downstream_0.01p_seed42_level2_rare
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the generator dataset.
+It achieves the following results on the evaluation set:
+- Loss: 0.9754
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 4
+- eval_batch_size: 1
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 4
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 64
+- total_eval_batch_size: 4
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 1
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 1.0649        | 0.9997 | 954  | 0.9754          |
+### Framework versions
+- PEFT 0.11.1
+- Transformers 4.43.4
+- Pytorch 2.3.1+cu121
+- Datasets 2.19.1
+- Tokenizers 0.19.1

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9997380141472361,
+    "total_flos": 1.1997126510772224e+16,
+    "train_loss": 1.1073102571179532,
+    "train_runtime": 19660.0543,
+    "train_samples": 104452,
+    "train_samples_per_second": 3.106,
+    "train_steps_per_second": 0.049
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9997380141472361,
+    "total_flos": 1.1997126510772224e+16,
+    "train_loss": 1.1073102571179532,
+    "train_runtime": 19660.0543,
+    "train_samples": 104452,
+    "train_samples_per_second": 3.106,
+    "train_steps_per_second": 0.049
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1387 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9997380141472361,
+  "eval_steps": 500,
+  "global_step": 954,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.001047943411055803,
+      "grad_norm": 0.4001561321232938,
+      "learning_rate": 2.0833333333333334e-06,
+      "loss": 1.3753,
+      "step": 1
+    },
+    {
+      "epoch": 0.0052397170552790145,
+      "grad_norm": 0.4325259158841641,
+      "learning_rate": 1.0416666666666668e-05,
+      "loss": 1.3728,
+      "step": 5
+    },
+    {
+      "epoch": 0.010479434110558029,
+      "grad_norm": 0.48537394471497125,
+      "learning_rate": 2.0833333333333336e-05,
+      "loss": 1.3694,
+      "step": 10
+    },
+    {
+      "epoch": 0.015719151165837046,
+      "grad_norm": 0.21451255069741118,
+      "learning_rate": 3.125e-05,
+      "loss": 1.3356,
+      "step": 15
+    },
+    {
+      "epoch": 0.020958868221116058,
+      "grad_norm": 0.1625576362850241,
+      "learning_rate": 4.166666666666667e-05,
+      "loss": 1.2901,
+      "step": 20
+    },
+    {
+      "epoch": 0.026198585276395073,
+      "grad_norm": 0.1442683256295141,
+      "learning_rate": 5.208333333333334e-05,
+      "loss": 1.3057,
+      "step": 25
+    },
+    {
+      "epoch": 0.03143830233167409,
+      "grad_norm": 0.13774645654676312,
+      "learning_rate": 6.25e-05,
+      "loss": 1.3118,
+      "step": 30
+    },
+    {
+      "epoch": 0.03667801938695311,
+      "grad_norm": 0.1351879639516721,
+      "learning_rate": 7.291666666666667e-05,
+      "loss": 1.2465,
+      "step": 35
+    },
+    {
+      "epoch": 0.041917736442232116,
+      "grad_norm": 0.11321368969702118,
+      "learning_rate": 8.333333333333334e-05,
+      "loss": 1.2302,
+      "step": 40
+    },
+    {
+      "epoch": 0.04715745349751113,
+      "grad_norm": 0.09641835898020555,
+      "learning_rate": 9.375e-05,
+      "loss": 1.1752,
+      "step": 45
+    },
+    {
+      "epoch": 0.05239717055279015,
+      "grad_norm": 0.08525335607581626,
+      "learning_rate": 0.00010416666666666667,
+      "loss": 1.1696,
+      "step": 50
+    },
+    {
+      "epoch": 0.05763688760806916,
+      "grad_norm": 0.08103124164785507,
+      "learning_rate": 0.00011458333333333333,
+      "loss": 1.1899,
+      "step": 55
+    },
+    {
+      "epoch": 0.06287660466334818,
+      "grad_norm": 0.06901625153652526,
+      "learning_rate": 0.000125,
+      "loss": 1.1664,
+      "step": 60
+    },
+    {
+      "epoch": 0.06811632171862719,
+      "grad_norm": 0.0674972608334169,
+      "learning_rate": 0.0001354166666666667,
+      "loss": 1.1857,
+      "step": 65
+    },
+    {
+      "epoch": 0.07335603877390622,
+      "grad_norm": 0.07061975941315685,
+      "learning_rate": 0.00014583333333333335,
+      "loss": 1.1566,
+      "step": 70
+    },
+    {
+      "epoch": 0.07859575582918522,
+      "grad_norm": 0.06830732833348191,
+      "learning_rate": 0.00015625,
+      "loss": 1.1464,
+      "step": 75
+    },
+    {
+      "epoch": 0.08383547288446423,
+      "grad_norm": 0.06929791315843523,
+      "learning_rate": 0.0001666666666666667,
+      "loss": 1.1572,
+      "step": 80
+    },
+    {
+      "epoch": 0.08907518993974325,
+      "grad_norm": 0.07358426687980345,
+      "learning_rate": 0.00017708333333333335,
+      "loss": 1.1425,
+      "step": 85
+    },
+    {
+      "epoch": 0.09431490699502226,
+      "grad_norm": 0.07082497637162344,
+      "learning_rate": 0.0001875,
+      "loss": 1.1416,
+      "step": 90
+    },
+    {
+      "epoch": 0.09955462405030129,
+      "grad_norm": 0.09545498739533521,
+      "learning_rate": 0.0001979166666666667,
+      "loss": 1.1624,
+      "step": 95
+    },
+    {
+      "epoch": 0.1047943411055803,
+      "grad_norm": 0.08500702062384047,
+      "learning_rate": 0.00019998927475076107,
+      "loss": 1.1323,
+      "step": 100
+    },
+    {
+      "epoch": 0.11003405816085932,
+      "grad_norm": 0.09301268675355433,
+      "learning_rate": 0.00019994570736865406,
+      "loss": 1.0883,
+      "step": 105
+    },
+    {
+      "epoch": 0.11527377521613832,
+      "grad_norm": 0.08575977499880286,
+      "learning_rate": 0.00019986864211644069,
+      "loss": 1.1046,
+      "step": 110
+    },
+    {
+      "epoch": 0.12051349227141735,
+      "grad_norm": 0.10121427622454875,
+      "learning_rate": 0.00019975810482336233,
+      "loss": 1.1154,
+      "step": 115
+    },
+    {
+      "epoch": 0.12575320932669637,
+      "grad_norm": 0.09078809335101837,
+      "learning_rate": 0.00019961413253717213,
+      "loss": 1.1002,
+      "step": 120
+    },
+    {
+      "epoch": 0.13099292638197538,
+      "grad_norm": 0.0781587950026229,
+      "learning_rate": 0.00019943677351171775,
+      "loss": 1.1016,
+      "step": 125
+    },
+    {
+      "epoch": 0.13623264343725439,
+      "grad_norm": 0.08268644073866073,
+      "learning_rate": 0.00019922608719076873,
+      "loss": 1.1026,
+      "step": 130
+    },
+    {
+      "epoch": 0.1414723604925334,
+      "grad_norm": 0.0985349651519935,
+      "learning_rate": 0.0001989821441880933,
+      "loss": 1.14,
+      "step": 135
+    },
+    {
+      "epoch": 0.14671207754781243,
+      "grad_norm": 0.08307522790061574,
+      "learning_rate": 0.00019870502626379127,
+      "loss": 1.1298,
+      "step": 140
+    },
+    {
+      "epoch": 0.15195179460309144,
+      "grad_norm": 0.08579822231869356,
+      "learning_rate": 0.00019839482629689154,
+      "loss": 1.0967,
+      "step": 145
+    },
+    {
+      "epoch": 0.15719151165837045,
+      "grad_norm": 0.07617635505916012,
+      "learning_rate": 0.0001980516482542224,
+      "loss": 1.1107,
+      "step": 150
+    },
+    {
+      "epoch": 0.16243122871364946,
+      "grad_norm": 0.07813217848338273,
+      "learning_rate": 0.00019767560715556597,
+      "loss": 1.1271,
+      "step": 155
+    },
+    {
+      "epoch": 0.16767094576892846,
+      "grad_norm": 0.08062953312484893,
+      "learning_rate": 0.0001972668290351084,
+      "loss": 1.0962,
+      "step": 160
+    },
+    {
+      "epoch": 0.1729106628242075,
+      "grad_norm": 0.079570669107266,
+      "learning_rate": 0.00019682545089919784,
+      "loss": 1.1172,
+      "step": 165
+    },
+    {
+      "epoch": 0.1781503798794865,
+      "grad_norm": 0.07824690927340332,
+      "learning_rate": 0.00019635162068042545,
+      "loss": 1.0977,
+      "step": 170
+    },
+    {
+      "epoch": 0.18339009693476552,
+      "grad_norm": 0.07565054237827153,
+      "learning_rate": 0.0001958454971880441,
+      "loss": 1.0989,
+      "step": 175
+    },
+    {
+      "epoch": 0.18862981399004453,
+      "grad_norm": 0.08232625474832411,
+      "learning_rate": 0.00019530725005474195,
+      "loss": 1.1031,
+      "step": 180
+    },
+    {
+      "epoch": 0.19386953104532356,
+      "grad_norm": 0.08313986432990468,
+      "learning_rate": 0.00019473705967978808,
+      "loss": 1.1188,
+      "step": 185
+    },
+    {
+      "epoch": 0.19910924810060257,
+      "grad_norm": 0.08608307927164675,
+      "learning_rate": 0.00019413511716856972,
+      "loss": 1.1182,
+      "step": 190
+    },
+    {
+      "epoch": 0.20434896515588158,
+      "grad_norm": 0.07591604791733918,
+      "learning_rate": 0.0001935016242685415,
+      "loss": 1.1136,
+      "step": 195
+    },
+    {
+      "epoch": 0.2095886822111606,
+      "grad_norm": 0.08255303175008252,
+      "learning_rate": 0.00019283679330160726,
+      "loss": 1.0869,
+      "step": 200
+    },
+    {
+      "epoch": 0.21482839926643962,
+      "grad_norm": 0.08084692432416707,
+      "learning_rate": 0.00019214084709295848,
+      "loss": 1.1077,
+      "step": 205
+    },
+    {
+      "epoch": 0.22006811632171863,
+      "grad_norm": 0.07931205352695565,
+      "learning_rate": 0.0001914140188963917,
+      "loss": 1.0975,
+      "step": 210
+    },
+    {
+      "epoch": 0.22530783337699764,
+      "grad_norm": 0.0771370875424395,
+      "learning_rate": 0.0001906565523161312,
+      "loss": 1.1186,
+      "step": 215
+    },
+    {
+      "epoch": 0.23054755043227665,
+      "grad_norm": 0.0881145788751443,
+      "learning_rate": 0.00018986870122518262,
+      "loss": 1.1124,
+      "step": 220
+    },
+    {
+      "epoch": 0.23578726748755569,
+      "grad_norm": 0.1055770930021298,
+      "learning_rate": 0.00018905072968024425,
+      "loss": 1.0968,
+      "step": 225
+    },
+    {
+      "epoch": 0.2410269845428347,
+      "grad_norm": 0.09095639364627423,
+      "learning_rate": 0.00018820291183320603,
+      "loss": 1.126,
+      "step": 230
+    },
+    {
+      "epoch": 0.2462667015981137,
+      "grad_norm": 0.07225566008587724,
+      "learning_rate": 0.00018732553183926443,
+      "loss": 1.0918,
+      "step": 235
+    },
+    {
+      "epoch": 0.25150641865339274,
+      "grad_norm": 0.07767735154463175,
+      "learning_rate": 0.00018641888376168484,
+      "loss": 1.1009,
+      "step": 240
+    },
+    {
+      "epoch": 0.2567461357086717,
+      "grad_norm": 0.08437806562537126,
+      "learning_rate": 0.00018548327147324315,
+      "loss": 1.0768,
+      "step": 245
+    },
+    {
+      "epoch": 0.26198585276395076,
+      "grad_norm": 0.07576721126196213,
+      "learning_rate": 0.0001845190085543795,
+      "loss": 1.0878,
+      "step": 250
+    },
+    {
+      "epoch": 0.26722556981922974,
+      "grad_norm": 0.08205260181446893,
+      "learning_rate": 0.00018352641818809848,
+      "loss": 1.1281,
+      "step": 255
+    },
+    {
+      "epoch": 0.27246528687450877,
+      "grad_norm": 0.07975349596755926,
+      "learning_rate": 0.00018250583305165098,
+      "loss": 1.1293,
+      "step": 260
+    },
+    {
+      "epoch": 0.2777050039297878,
+      "grad_norm": 0.07530550662057361,
+      "learning_rate": 0.00018145759520503358,
+      "loss": 1.1147,
+      "step": 265
+    },
+    {
+      "epoch": 0.2829447209850668,
+      "grad_norm": 0.0807310288155147,
+      "learning_rate": 0.00018038205597634393,
+      "loss": 1.1092,
+      "step": 270
+    },
+    {
+      "epoch": 0.2881844380403458,
+      "grad_norm": 0.0783642611125798,
+      "learning_rate": 0.00017927957584402897,
+      "loss": 1.0969,
+      "step": 275
+    },
+    {
+      "epoch": 0.29342415509562486,
+      "grad_norm": 0.0834215103482037,
+      "learning_rate": 0.000178150524316067,
+      "loss": 1.0951,
+      "step": 280
+    },
+    {
+      "epoch": 0.29866387215090384,
+      "grad_norm": 0.0777991252254151,
+      "learning_rate": 0.00017699527980612304,
+      "loss": 1.0974,
+      "step": 285
+    },
+    {
+      "epoch": 0.3039035892061829,
+      "grad_norm": 0.07401384328604721,
+      "learning_rate": 0.00017581422950671942,
+      "loss": 1.1258,
+      "step": 290
+    },
+    {
+      "epoch": 0.30914330626146186,
+      "grad_norm": 0.0768067427086218,
+      "learning_rate": 0.00017460776925946417,
+      "loss": 1.0946,
+      "step": 295
+    },
+    {
+      "epoch": 0.3143830233167409,
+      "grad_norm": 0.07212540160444633,
+      "learning_rate": 0.00017337630342238042,
+      "loss": 1.1074,
+      "step": 300
+    },
+    {
+      "epoch": 0.31962274037201993,
+      "grad_norm": 0.0748307586015756,
+      "learning_rate": 0.00017212024473438147,
+      "loss": 1.0954,
+      "step": 305
+    },
+    {
+      "epoch": 0.3248624574272989,
+      "grad_norm": 0.07601887269890263,
+      "learning_rate": 0.00017084001417693703,
+      "loss": 1.0936,
+      "step": 310
+    },
+    {
+      "epoch": 0.33010217448257795,
+      "grad_norm": 0.07281811010206982,
+      "learning_rate": 0.00016953604083297665,
+      "loss": 1.1219,
+      "step": 315
+    },
+    {
+      "epoch": 0.33534189153785693,
+      "grad_norm": 0.07523858579513697,
+      "learning_rate": 0.00016820876174307821,
+      "loss": 1.1157,
+      "step": 320
+    },
+    {
+      "epoch": 0.34058160859313596,
+      "grad_norm": 0.08575069343747709,
+      "learning_rate": 0.00016685862175898892,
+      "loss": 1.1179,
+      "step": 325
+    },
+    {
+      "epoch": 0.345821325648415,
+      "grad_norm": 0.07857161375376395,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 1.0844,
+      "step": 330
+    },
+    {
+      "epoch": 0.351061042703694,
+      "grad_norm": 0.07945712197579065,
+      "learning_rate": 0.00016409157667392457,
+      "loss": 1.0964,
+      "step": 335
+    },
+    {
+      "epoch": 0.356300759758973,
+      "grad_norm": 0.08254440545274826,
+      "learning_rate": 0.00016267559897763028,
+      "loss": 1.094,
+      "step": 340
+    },
+    {
+      "epoch": 0.36154047681425205,
+      "grad_norm": 0.07939321839093792,
+      "learning_rate": 0.0001612386148856771,
+      "loss": 1.0966,
+      "step": 345
+    },
+    {
+      "epoch": 0.36678019386953103,
+      "grad_norm": 0.08193880396697184,
+      "learning_rate": 0.0001597811060186141,
+      "loss": 1.0997,
+      "step": 350
+    },
+    {
+      "epoch": 0.37201991092481007,
+      "grad_norm": 0.07719408936904257,
+      "learning_rate": 0.00015830356087608764,
+      "loss": 1.0789,
+      "step": 355
+    },
+    {
+      "epoch": 0.37725962798008905,
+      "grad_norm": 0.07549679547307497,
+      "learning_rate": 0.00015680647467311557,
+      "loss": 1.1008,
+      "step": 360
+    },
+    {
+      "epoch": 0.3824993450353681,
+      "grad_norm": 0.08172686400286096,
+      "learning_rate": 0.00015529034917411073,
+      "loss": 1.1251,
+      "step": 365
+    },
+    {
+      "epoch": 0.3877390620906471,
+      "grad_norm": 0.07573300240197864,
+      "learning_rate": 0.00015375569252470896,
+      "loss": 1.1121,
+      "step": 370
+    },
+    {
+      "epoch": 0.3929787791459261,
+      "grad_norm": 0.07510493645601654,
+      "learning_rate": 0.00015220301908145905,
+      "loss": 1.0617,
+      "step": 375
+    },
+    {
+      "epoch": 0.39821849620120514,
+      "grad_norm": 0.07047562795180644,
+      "learning_rate": 0.00015063284923943031,
+      "loss": 1.0964,
+      "step": 380
+    },
+    {
+      "epoch": 0.4034582132564842,
+      "grad_norm": 0.07710971072693787,
+      "learning_rate": 0.00014904570925779683,
+      "loss": 1.0822,
+      "step": 385
+    },
+    {
+      "epoch": 0.40869793031176316,
+      "grad_norm": 0.08283963772289088,
+      "learning_rate": 0.00014744213108345604,
+      "loss": 1.0744,
+      "step": 390
+    },
+    {
+      "epoch": 0.4139376473670422,
+      "grad_norm": 0.07086438189754951,
+      "learning_rate": 0.00014582265217274104,
+      "loss": 1.109,
+      "step": 395
+    },
+    {
+      "epoch": 0.4191773644223212,
+      "grad_norm": 0.06876096472263239,
+      "learning_rate": 0.00014418781531128636,
+      "loss": 1.1008,
+      "step": 400
+    },
+    {
+      "epoch": 0.4244170814776002,
+      "grad_norm": 0.07686166297785638,
+      "learning_rate": 0.0001425381684321075,
+      "loss": 1.0829,
+      "step": 405
+    },
+    {
+      "epoch": 0.42965679853287925,
+      "grad_norm": 0.08368209207409245,
+      "learning_rate": 0.00014087426443195548,
+      "loss": 1.1336,
+      "step": 410
+    },
+    {
+      "epoch": 0.43489651558815823,
+      "grad_norm": 0.0725885844341327,
+      "learning_rate": 0.00013919666098600753,
+      "loss": 1.0874,
+      "step": 415
+    },
+    {
+      "epoch": 0.44013623264343726,
+      "grad_norm": 0.07310731092919365,
+      "learning_rate": 0.0001375059203609562,
+      "loss": 1.084,
+      "step": 420
+    },
+    {
+      "epoch": 0.44537594969871624,
+      "grad_norm": 0.07788032512060031,
+      "learning_rate": 0.00013580260922655985,
+      "loss": 1.102,
+      "step": 425
+    },
+    {
+      "epoch": 0.4506156667539953,
+      "grad_norm": 0.07182054106731274,
+      "learning_rate": 0.00013408729846571714,
+      "loss": 1.075,
+      "step": 430
+    },
+    {
+      "epoch": 0.4558553838092743,
+      "grad_norm": 0.0745557942612687,
+      "learning_rate": 0.00013236056298312958,
+      "loss": 1.0936,
+      "step": 435
+    },
+    {
+      "epoch": 0.4610951008645533,
+      "grad_norm": 0.07109140712607685,
+      "learning_rate": 0.00013062298151261592,
+      "loss": 1.0762,
+      "step": 440
+    },
+    {
+      "epoch": 0.46633481791983233,
+      "grad_norm": 0.07067264389645633,
+      "learning_rate": 0.00012887513642314373,
+      "loss": 1.1132,
+      "step": 445
+    },
+    {
+      "epoch": 0.47157453497511137,
+      "grad_norm": 0.08171597552387606,
+      "learning_rate": 0.00012711761352364172,
+      "loss": 1.0829,
+      "step": 450
+    },
+    {
+      "epoch": 0.47681425203039035,
+      "grad_norm": 0.07525113073399527,
+      "learning_rate": 0.00012535100186666,
+      "loss": 1.0983,
+      "step": 455
+    },
+    {
+      "epoch": 0.4820539690856694,
+      "grad_norm": 0.07155593708972481,
+      "learning_rate": 0.00012357589355094275,
+      "loss": 1.0934,
+      "step": 460
+    },
+    {
+      "epoch": 0.48729368614094837,
+      "grad_norm": 0.07568360243136522,
+      "learning_rate": 0.00012179288352297984,
+      "loss": 1.1088,
+      "step": 465
+    },
+    {
+      "epoch": 0.4925334031962274,
+      "grad_norm": 0.07870992806459551,
+      "learning_rate": 0.00012000256937760445,
+      "loss": 1.0886,
+      "step": 470
+    },
+    {
+      "epoch": 0.49777312025150644,
+      "grad_norm": 0.07740807887909781,
+      "learning_rate": 0.00011820555115770255,
+      "loss": 1.0769,
+      "step": 475
+    },
+    {
+      "epoch": 0.5030128373067855,
+      "grad_norm": 0.08195630153403244,
+      "learning_rate": 0.00011640243115310218,
+      "loss": 1.0676,
+      "step": 480
+    },
+    {
+      "epoch": 0.5082525543620644,
+      "grad_norm": 0.07035497008178994,
+      "learning_rate": 0.00011459381369870974,
+      "loss": 1.0847,
+      "step": 485
+    },
+    {
+      "epoch": 0.5134922714173434,
+      "grad_norm": 0.07213741826205265,
+      "learning_rate": 0.00011278030497196049,
+      "loss": 1.0882,
+      "step": 490
+    },
+    {
+      "epoch": 0.5187319884726225,
+      "grad_norm": 0.07436809187729645,
+      "learning_rate": 0.00011096251278965172,
+      "loss": 1.0897,
+      "step": 495
+    },
+    {
+      "epoch": 0.5239717055279015,
+      "grad_norm": 0.08114263617937822,
+      "learning_rate": 0.00010914104640422679,
+      "loss": 1.091,
+      "step": 500
+    },
+    {
+      "epoch": 0.5292114225831805,
+      "grad_norm": 0.07778710874522761,
+      "learning_rate": 0.00010731651629957722,
+      "loss": 1.0714,
+      "step": 505
+    },
+    {
+      "epoch": 0.5344511396384595,
+      "grad_norm": 0.08462996195681859,
+      "learning_rate": 0.00010548953398643275,
+      "loss": 1.0774,
+      "step": 510
+    },
+    {
+      "epoch": 0.5396908566937385,
+      "grad_norm": 0.07951504140245977,
+      "learning_rate": 0.00010366071179740706,
+      "loss": 1.0849,
+      "step": 515
+    },
+    {
+      "epoch": 0.5449305737490175,
+      "grad_norm": 0.07543823682325243,
+      "learning_rate": 0.00010183066268176776,
+      "loss": 1.0965,
+      "step": 520
+    },
+    {
+      "epoch": 0.5501702908042966,
+      "grad_norm": 0.07866000568639331,
+      "learning_rate": 0.0001,
+      "loss": 1.0606,
+      "step": 525
+    },
+    {
+      "epoch": 0.5554100078595756,
+      "grad_norm": 0.0734883472447597,
+      "learning_rate": 9.816933731823231e-05,
+      "loss": 1.1037,
+      "step": 530
+    },
+    {
+      "epoch": 0.5606497249148547,
+      "grad_norm": 0.06987918160697645,
+      "learning_rate": 9.633928820259295e-05,
+      "loss": 1.0805,
+      "step": 535
+    },
+    {
+      "epoch": 0.5658894419701336,
+      "grad_norm": 0.07153298757189964,
+      "learning_rate": 9.451046601356725e-05,
+      "loss": 1.1353,
+      "step": 540
+    },
+    {
+      "epoch": 0.5711291590254126,
+      "grad_norm": 0.0744809511267128,
+      "learning_rate": 9.268348370042281e-05,
+      "loss": 1.0661,
+      "step": 545
+    },
+    {
+      "epoch": 0.5763688760806917,
+      "grad_norm": 0.07467776042097098,
+      "learning_rate": 9.085895359577324e-05,
+      "loss": 1.0986,
+      "step": 550
+    },
+    {
+      "epoch": 0.5816085931359707,
+      "grad_norm": 0.07805443340486796,
+      "learning_rate": 8.903748721034827e-05,
+      "loss": 1.1047,
+      "step": 555
+    },
+    {
+      "epoch": 0.5868483101912497,
+      "grad_norm": 0.07920975635426553,
+      "learning_rate": 8.721969502803954e-05,
+      "loss": 1.0786,
+      "step": 560
+    },
+    {
+      "epoch": 0.5920880272465286,
+      "grad_norm": 0.07525491817078783,
+      "learning_rate": 8.540618630129029e-05,
+      "loss": 1.074,
+      "step": 565
+    },
+    {
+      "epoch": 0.5973277443018077,
+      "grad_norm": 0.07250304608041182,
+      "learning_rate": 8.359756884689784e-05,
+      "loss": 1.0878,
+      "step": 570
+    },
+    {
+      "epoch": 0.6025674613570867,
+      "grad_norm": 0.0733085547464114,
+      "learning_rate": 8.179444884229746e-05,
+      "loss": 1.0903,
+      "step": 575
+    },
+    {
+      "epoch": 0.6078071784123658,
+      "grad_norm": 0.07942414834860045,
+      "learning_rate": 7.999743062239557e-05,
+      "loss": 1.0709,
+      "step": 580
+    },
+    {
+      "epoch": 0.6130468954676448,
+      "grad_norm": 0.07667678452382841,
+      "learning_rate": 7.820711647702017e-05,
+      "loss": 1.1036,
+      "step": 585
+    },
+    {
+      "epoch": 0.6182866125229237,
+      "grad_norm": 0.0736959002458591,
+      "learning_rate": 7.642410644905726e-05,
+      "loss": 1.1069,
+      "step": 590
+    },
+    {
+      "epoch": 0.6235263295782028,
+      "grad_norm": 0.07339877720581538,
+      "learning_rate": 7.464899813334001e-05,
+      "loss": 1.0663,
+      "step": 595
+    },
+    {
+      "epoch": 0.6287660466334818,
+      "grad_norm": 0.0755698325420648,
+      "learning_rate": 7.28823864763583e-05,
+      "loss": 1.0794,
+      "step": 600
+    },
+    {
+      "epoch": 0.6340057636887608,
+      "grad_norm": 0.07192880469266458,
+      "learning_rate": 7.112486357685631e-05,
+      "loss": 1.0828,
+      "step": 605
+    },
+    {
+      "epoch": 0.6392454807440399,
+      "grad_norm": 0.07254861043098242,
+      "learning_rate": 6.937701848738406e-05,
+      "loss": 1.0756,
+      "step": 610
+    },
+    {
+      "epoch": 0.6444851977993188,
+      "grad_norm": 0.07557384049733529,
+      "learning_rate": 6.763943701687045e-05,
+      "loss": 1.0707,
+      "step": 615
+    },
+    {
+      "epoch": 0.6497249148545978,
+      "grad_norm": 0.07452016389965133,
+      "learning_rate": 6.591270153428288e-05,
+      "loss": 1.1175,
+      "step": 620
+    },
+    {
+      "epoch": 0.6549646319098769,
+      "grad_norm": 0.07996587854125373,
+      "learning_rate": 6.419739077344016e-05,
+      "loss": 1.0689,
+      "step": 625
+    },
+    {
+      "epoch": 0.6602043489651559,
+      "grad_norm": 0.07413365338024828,
+      "learning_rate": 6.249407963904382e-05,
+      "loss": 1.0751,
+      "step": 630
+    },
+    {
+      "epoch": 0.6654440660204349,
+      "grad_norm": 0.07459048337829312,
+      "learning_rate": 6.080333901399251e-05,
+      "loss": 1.093,
+      "step": 635
+    },
+    {
+      "epoch": 0.6706837830757139,
+      "grad_norm": 0.07384856775055335,
+      "learning_rate": 5.9125735568044524e-05,
+      "loss": 1.08,
+      "step": 640
+    },
+    {
+      "epoch": 0.6759235001309929,
+      "grad_norm": 0.07929304895930443,
+      "learning_rate": 5.746183156789252e-05,
+      "loss": 1.0676,
+      "step": 645
+    },
+    {
+      "epoch": 0.6811632171862719,
+      "grad_norm": 0.07350637434626273,
+      "learning_rate": 5.581218468871366e-05,
+      "loss": 1.1031,
+      "step": 650
+    },
+    {
+      "epoch": 0.686402934241551,
+      "grad_norm": 0.07540040766752996,
+      "learning_rate": 5.417734782725896e-05,
+      "loss": 1.0757,
+      "step": 655
+    },
+    {
+      "epoch": 0.69164265129683,
+      "grad_norm": 0.07568774804377028,
+      "learning_rate": 5.2557868916543994e-05,
+      "loss": 1.0876,
+      "step": 660
+    },
+    {
+      "epoch": 0.696882368352109,
+      "grad_norm": 0.07333524606614103,
+      "learning_rate": 5.0954290742203195e-05,
+      "loss": 1.0983,
+      "step": 665
+    },
+    {
+      "epoch": 0.702122085407388,
+      "grad_norm": 0.07303211978160114,
+      "learning_rate": 4.936715076056975e-05,
+      "loss": 1.0786,
+      "step": 670
+    },
+    {
+      "epoch": 0.707361802462667,
+      "grad_norm": 0.07783295637332018,
+      "learning_rate": 4.779698091854098e-05,
+      "loss": 1.0998,
+      "step": 675
+    },
+    {
+      "epoch": 0.712601519517946,
+      "grad_norm": 0.07902938086530782,
+      "learning_rate": 4.624430747529102e-05,
+      "loss": 1.0847,
+      "step": 680
+    },
+    {
+      "epoch": 0.7178412365732251,
+      "grad_norm": 0.07352763796685166,
+      "learning_rate": 4.4709650825889283e-05,
+      "loss": 1.0749,
+      "step": 685
+    },
+    {
+      "epoch": 0.7230809536285041,
+      "grad_norm": 0.07547148249723341,
+      "learning_rate": 4.3193525326884435e-05,
+      "loss": 1.0768,
+      "step": 690
+    },
+    {
+      "epoch": 0.728320670683783,
+      "grad_norm": 0.0746832803560011,
+      "learning_rate": 4.169643912391241e-05,
+      "loss": 1.1248,
+      "step": 695
+    },
+    {
+      "epoch": 0.7335603877390621,
+      "grad_norm": 0.0716047892433064,
+      "learning_rate": 4.021889398138593e-05,
+      "loss": 1.114,
+      "step": 700
+    },
+    {
+      "epoch": 0.7388001047943411,
+      "grad_norm": 0.07737137661226196,
+      "learning_rate": 3.87613851143229e-05,
+      "loss": 1.0713,
+      "step": 705
+    },
+    {
+      "epoch": 0.7440398218496201,
+      "grad_norm": 0.07385277062243696,
+      "learning_rate": 3.732440102236975e-05,
+      "loss": 1.0925,
+      "step": 710
+    },
+    {
+      "epoch": 0.7492795389048992,
+      "grad_norm": 0.0721480276098664,
+      "learning_rate": 3.5908423326075456e-05,
+      "loss": 1.0754,
+      "step": 715
+    },
+    {
+      "epoch": 0.7545192559601781,
+      "grad_norm": 0.0755228681817193,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 1.1039,
+      "step": 720
+    },
+    {
+      "epoch": 0.7597589730154571,
+      "grad_norm": 0.07201430505747895,
+      "learning_rate": 3.314137824101111e-05,
+      "loss": 1.0818,
+      "step": 725
+    },
+    {
+      "epoch": 0.7649986900707362,
+      "grad_norm": 0.07923887248519075,
+      "learning_rate": 3.179123825692178e-05,
+      "loss": 1.0938,
+      "step": 730
+    },
+    {
+      "epoch": 0.7702384071260152,
+      "grad_norm": 0.08502340793939184,
+      "learning_rate": 3.0463959167023336e-05,
+      "loss": 1.1037,
+      "step": 735
+    },
+    {
+      "epoch": 0.7754781241812942,
+      "grad_norm": 0.07305649086014973,
+      "learning_rate": 2.9159985823062997e-05,
+      "loss": 1.0617,
+      "step": 740
+    },
+    {
+      "epoch": 0.7807178412365732,
+      "grad_norm": 0.07363395579085769,
+      "learning_rate": 2.7879755265618555e-05,
+      "loss": 1.0816,
+      "step": 745
+    },
+    {
+      "epoch": 0.7859575582918522,
+      "grad_norm": 0.07380591423429197,
+      "learning_rate": 2.6623696577619627e-05,
+      "loss": 1.1121,
+      "step": 750
+    },
+    {
+      "epoch": 0.7911972753471312,
+      "grad_norm": 0.07553408254615797,
+      "learning_rate": 2.539223074053585e-05,
+      "loss": 1.1121,
+      "step": 755
+    },
+    {
+      "epoch": 0.7964369924024103,
+      "grad_norm": 0.07091968630997039,
+      "learning_rate": 2.418577049328058e-05,
+      "loss": 1.0779,
+      "step": 760
+    },
+    {
+      "epoch": 0.8016767094576893,
+      "grad_norm": 0.07111148174207031,
+      "learning_rate": 2.3004720193876973e-05,
+      "loss": 1.0769,
+      "step": 765
+    },
+    {
+      "epoch": 0.8069164265129684,
+      "grad_norm": 0.07286026119404068,
+      "learning_rate": 2.1849475683932996e-05,
+      "loss": 1.0852,
+      "step": 770
+    },
+    {
+      "epoch": 0.8121561435682473,
+      "grad_norm": 0.07285993869373032,
+      "learning_rate": 2.0720424155971042e-05,
+      "loss": 1.0702,
+      "step": 775
+    },
+    {
+      "epoch": 0.8173958606235263,
+      "grad_norm": 0.0768385830643589,
+      "learning_rate": 1.961794402365611e-05,
+      "loss": 1.0652,
+      "step": 780
+    },
+    {
+      "epoch": 0.8226355776788054,
+      "grad_norm": 0.07700519821749924,
+      "learning_rate": 1.854240479496643e-05,
+      "loss": 1.105,
+      "step": 785
+    },
+    {
+      "epoch": 0.8278752947340844,
+      "grad_norm": 0.07789933149114912,
+      "learning_rate": 1.7494166948349055e-05,
+      "loss": 1.0992,
+      "step": 790
+    },
+    {
+      "epoch": 0.8331150117893634,
+      "grad_norm": 0.0713531816025941,
+      "learning_rate": 1.647358181190153e-05,
+      "loss": 1.0672,
+      "step": 795
+    },
+    {
+      "epoch": 0.8383547288446423,
+      "grad_norm": 0.07496958584065704,
+      "learning_rate": 1.5480991445620542e-05,
+      "loss": 1.1105,
+      "step": 800
+    },
+    {
+      "epoch": 0.8435944458999214,
+      "grad_norm": 0.07272092802927198,
+      "learning_rate": 1.4516728526756874e-05,
+      "loss": 1.1139,
+      "step": 805
+    },
+    {
+      "epoch": 0.8488341629552004,
+      "grad_norm": 0.07161568950716177,
+      "learning_rate": 1.3581116238315195e-05,
+      "loss": 1.085,
+      "step": 810
+    },
+    {
+      "epoch": 0.8540738800104795,
+      "grad_norm": 0.07695666983129193,
+      "learning_rate": 1.2674468160735587e-05,
+      "loss": 1.0616,
+      "step": 815
+    },
+    {
+      "epoch": 0.8593135970657585,
+      "grad_norm": 0.07401216452154172,
+      "learning_rate": 1.1797088166794e-05,
+      "loss": 1.0734,
+      "step": 820
+    },
+    {
+      "epoch": 0.8645533141210374,
+      "grad_norm": 0.07259433622324657,
+      "learning_rate": 1.0949270319755766e-05,
+      "loss": 1.0684,
+      "step": 825
+    },
+    {
+      "epoch": 0.8697930311763165,
+      "grad_norm": 0.07530462440367772,
+      "learning_rate": 1.013129877481741e-05,
+      "loss": 1.0983,
+      "step": 830
+    },
+    {
+      "epoch": 0.8750327482315955,
+      "grad_norm": 0.07005327786295418,
+      "learning_rate": 9.3434476838688e-06,
+      "loss": 1.0959,
+      "step": 835
+    },
+    {
+      "epoch": 0.8802724652868745,
+      "grad_norm": 0.07386728875097175,
+      "learning_rate": 8.585981103608342e-06,
+      "loss": 1.0778,
+      "step": 840
+    },
+    {
+      "epoch": 0.8855121823421536,
+      "grad_norm": 0.07241052446613962,
+      "learning_rate": 7.859152907041545e-06,
+      "loss": 1.0972,
+      "step": 845
+    },
+    {
+      "epoch": 0.8907518993974325,
+      "grad_norm": 0.0752050904544279,
+      "learning_rate": 7.163206698392744e-06,
+      "loss": 1.0816,
+      "step": 850
+    },
+    {
+      "epoch": 0.8959916164527115,
+      "grad_norm": 0.07365477033333792,
+      "learning_rate": 6.498375731458528e-06,
+      "loss": 1.0838,
+      "step": 855
+    },
+    {
+      "epoch": 0.9012313335079906,
+      "grad_norm": 0.07349597369238159,
+      "learning_rate": 5.864882831430274e-06,
+      "loss": 1.0655,
+      "step": 860
+    },
+    {
+      "epoch": 0.9064710505632696,
+      "grad_norm": 0.0744936274605184,
+      "learning_rate": 5.262940320211951e-06,
+      "loss": 1.1006,
+      "step": 865
+    },
+    {
+      "epoch": 0.9117107676185486,
+      "grad_norm": 0.07094867075036164,
+      "learning_rate": 4.692749945258057e-06,
+      "loss": 1.0821,
+      "step": 870
+    },
+    {
+      "epoch": 0.9169504846738276,
+      "grad_norm": 0.07625036960430226,
+      "learning_rate": 4.154502811955907e-06,
+      "loss": 1.0944,
+      "step": 875
+    },
+    {
+      "epoch": 0.9221902017291066,
+      "grad_norm": 0.0723629686619488,
+      "learning_rate": 3.6483793195745684e-06,
+      "loss": 1.083,
+      "step": 880
+    },
+    {
+      "epoch": 0.9274299187843856,
+      "grad_norm": 0.0763650337980307,
+      "learning_rate": 3.1745491008021598e-06,
+      "loss": 1.081,
+      "step": 885
+    },
+    {
+      "epoch": 0.9326696358396647,
+      "grad_norm": 0.0728944319249245,
+      "learning_rate": 2.7331709648916073e-06,
+      "loss": 1.1094,
+      "step": 890
+    },
+    {
+      "epoch": 0.9379093528949437,
+      "grad_norm": 0.07193879908153704,
+      "learning_rate": 2.3243928444340426e-06,
+      "loss": 1.0809,
+      "step": 895
+    },
+    {
+      "epoch": 0.9431490699502227,
+      "grad_norm": 0.07421423217890344,
+      "learning_rate": 1.9483517457776436e-06,
+      "loss": 1.09,
+      "step": 900
+    },
+    {
+      "epoch": 0.9483887870055017,
+      "grad_norm": 0.07663776082455946,
+      "learning_rate": 1.6051737031084536e-06,
+      "loss": 1.0814,
+      "step": 905
+    },
+    {
+      "epoch": 0.9536285040607807,
+      "grad_norm": 0.07329159457647504,
+      "learning_rate": 1.2949737362087156e-06,
+      "loss": 1.0976,
+      "step": 910
+    },
+    {
+      "epoch": 0.9588682211160597,
+      "grad_norm": 0.07520915566066837,
+      "learning_rate": 1.0178558119067315e-06,
+      "loss": 1.1012,
+      "step": 915
+    },
+    {
+      "epoch": 0.9641079381713388,
+      "grad_norm": 0.07528058322802142,
+      "learning_rate": 7.73912809231292e-07,
+      "loss": 1.095,
+      "step": 920
+    },
+    {
+      "epoch": 0.9693476552266178,
+      "grad_norm": 0.10802859507213024,
+      "learning_rate": 5.632264882822758e-07,
+      "loss": 1.1008,
+      "step": 925
+    },
+    {
+      "epoch": 0.9745873722818967,
+      "grad_norm": 0.07235805606525178,
+      "learning_rate": 3.8586746282788244e-07,
+      "loss": 1.105,
+      "step": 930
+    },
+    {
+      "epoch": 0.9798270893371758,
+      "grad_norm": 0.07225406845866389,
+      "learning_rate": 2.4189517663767424e-07,
+      "loss": 1.0796,
+      "step": 935
+    },
+    {
+      "epoch": 0.9850668063924548,
+      "grad_norm": 0.07529138400314424,
+      "learning_rate": 1.3135788355934652e-07,
+      "loss": 1.1061,
+      "step": 940
+    },
+    {
+      "epoch": 0.9903065234477338,
+      "grad_norm": 0.0904445943158699,
+      "learning_rate": 5.4292631345942424e-08,
+      "loss": 1.0877,
+      "step": 945
+    },
+    {
+      "epoch": 0.9955462405030129,
+      "grad_norm": 0.07392400364849133,
+      "learning_rate": 1.0725249238940915e-08,
+      "loss": 1.0649,
+      "step": 950
+    },
+    {
+      "epoch": 0.9997380141472361,
+      "eval_loss": 0.9753671884536743,
+      "eval_runtime": 2.0953,
+      "eval_samples_per_second": 3.341,
+      "eval_steps_per_second": 0.955,
+      "step": 954
+    },
+    {
+      "epoch": 0.9997380141472361,
+      "step": 954,
+      "total_flos": 1.1997126510772224e+16,
+      "train_loss": 1.1073102571179532,
+      "train_runtime": 19660.0543,
+      "train_samples_per_second": 3.106,
+      "train_steps_per_second": 0.049
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 954,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.1997126510772224e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}