nvan15 commited on Jan 15

Commit

5d64278

verified ·

1 Parent(s): 70ff38e

Batch upload part 12

Browse files

Files changed (50) hide show

nl_tasks/exprep/run_ex20_2ep/ft/adapter_config.json +19 -0
nl_tasks/exprep/run_ex20_2ep/ft/special_tokens_map.json +24 -0
nl_tasks/exprep/run_ex20_2ep/ft/tokenizer.json +0 -0
nl_tasks/exprep/run_ex20_2ep/ft/tokenizer.model +3 -0
nl_tasks/exprep/run_ex20_2ep/ft/tokenizer_config.json +43 -0
nl_tasks/exprep/run_ex20_2ep/ft2/adapter_config.json +19 -0
nl_tasks/exprep/run_ex20_2ep/ft2/adapter_model.bin +3 -0
nl_tasks/exprep/run_ex20_2ep/output.txt +4 -0
nl_tasks/exprep/run_ex20_2ep/trainer_state.json +743 -0
nl_tasks/exprep/run_ex21_2ep/ft/adapter_config.json +19 -0
nl_tasks/exprep/run_ex21_2ep/ft/special_tokens_map.json +24 -0
nl_tasks/exprep/run_ex21_2ep/ft/tokenizer.json +0 -0
nl_tasks/exprep/run_ex21_2ep/ft/tokenizer.model +3 -0
nl_tasks/exprep/run_ex21_2ep/ft/tokenizer_config.json +43 -0
nl_tasks/exprep/run_ex21_2ep/ft2/adapter_config.json +19 -0
nl_tasks/exprep/run_ex21_2ep/ft2/adapter_model.bin +3 -0
nl_tasks/exprep/run_ex21_2ep/output.txt +4 -0
nl_tasks/exprep/run_ex21_2ep/trainer_state.json +743 -0
nl_tasks/exprep/run_ex22_2ep/ft/adapter_config.json +19 -0
nl_tasks/exprep/run_ex22_2ep/ft/special_tokens_map.json +24 -0
nl_tasks/exprep/run_ex22_2ep/ft/tokenizer.json +0 -0
nl_tasks/exprep/run_ex22_2ep/ft/tokenizer.model +3 -0
nl_tasks/exprep/run_ex22_2ep/ft/tokenizer_config.json +43 -0
nl_tasks/exprep/run_ex22_2ep/ft2/adapter_config.json +19 -0
nl_tasks/exprep/run_ex22_2ep/ft2/adapter_model.bin +3 -0
nl_tasks/exprep/run_ex22_2ep/output.txt +4 -0
nl_tasks/exprep/run_ex22_2ep/trainer_state.json +743 -0
nl_tasks/exprep/run_ex23_3ep/ft/adapter_config.json +19 -0
nl_tasks/exprep/run_ex23_3ep/ft/special_tokens_map.json +24 -0
nl_tasks/exprep/run_ex23_3ep/ft/tokenizer.json +0 -0
nl_tasks/exprep/run_ex23_3ep/ft/tokenizer.model +3 -0
nl_tasks/exprep/run_ex23_3ep/ft/tokenizer_config.json +43 -0
nl_tasks/exprep/run_ex23_3ep/ft2/adapter_config.json +19 -0
nl_tasks/exprep/run_ex23_3ep/ft2/adapter_model.bin +3 -0
nl_tasks/exprep/run_ex23_3ep/output.txt +4 -0
nl_tasks/exprep/run_ex23_3ep/trainer_state.json +1093 -0
nl_tasks/exprep/run_ex24_3ep/ft/special_tokens_map.json +24 -0
nl_tasks/exprep/run_ex24_3ep/ft/tokenizer_config.json +43 -0
nl_tasks/exprep/run_ex24_3ep/output.txt +4 -0
nl_tasks/exprep/run_ex24_3ep/trainer_state.json +1093 -0
nl_tasks/run_exps/ft/adapter_config.json +18 -0
nl_tasks/run_exps/ft/merges.txt +0 -0
nl_tasks/run_exps/ft/special_tokens_map.json +30 -0
nl_tasks/run_exps/ft/tokenizer.json +0 -0
nl_tasks/run_exps/ft/tokenizer_config.json +51 -0
nl_tasks/run_exps/ft/training_args.bin +3 -0
nl_tasks/run_exps/ft/vocab.json +0 -0
nl_tasks/run_exps/ft2/adapter_config.json +18 -0
nl_tasks/run_exps/ft2/adapter_model.bin +3 -0
nl_tasks/run_exps/trainer_state.json +73 -0

nl_tasks/exprep/run_ex20_2ep/ft/adapter_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "T": 1.0,
+  "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+  "bias": "none",
+  "drop_out": 0.0,
+  "inference_mode": false,
+  "layers_to_transform": null,
+  "modules_to_save": null,
+  "num_rotations": 1,
+  "peft_type": "ROTATION",
+  "r": 16,
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "target_modules_to_skip": null,
+  "task_type": "CAUSAL_LM"
+}

nl_tasks/exprep/run_ex20_2ep/ft/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<unk>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

nl_tasks/exprep/run_ex20_2ep/ft/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

nl_tasks/exprep/run_ex20_2ep/ft/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

nl_tasks/exprep/run_ex20_2ep/ft/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "legacy": false,
+  "model_max_length": 512,
+  "pad_token": "<unk>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

nl_tasks/exprep/run_ex20_2ep/ft2/adapter_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "T": 1.0,
+  "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+  "bias": "none",
+  "drop_out": 0.0,
+  "inference_mode": true,
+  "layers_to_transform": null,
+  "modules_to_save": null,
+  "num_rotations": 1,
+  "peft_type": "ROTATION",
+  "r": 16,
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "target_modules_to_skip": null,
+  "task_type": "CAUSAL_LM"
+}

nl_tasks/exprep/run_ex20_2ep/ft2/adapter_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9f79c0c1e022748a2c2946c6babb09613500a5e5696a7e2fb68e2b97d0b7020e
+size 33602915

nl_tasks/exprep/run_ex20_2ep/output.txt ADDED Viewed

	@@ -0,0 +1,4 @@


1	+
2	+ MATH math MAX TOKEN = 1408, length==== 5000, math acc %====, 8.04
3	+
4	+ gsm8k MAX TOKEN = 1024, length==== 1319, gsm8k acc %====, 49.50720242608037

nl_tasks/exprep/run_ex20_2ep/trainer_state.json ADDED Viewed

	@@ -0,0 +1,743 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 100,
+  "global_step": 2500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.2626207172870636,
+      "learning_rate": 0.0009997726215503421,
+      "loss": 0.8046,
+      "step": 25
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.18716809153556824,
+      "learning_rate": 0.0009990524226456182,
+      "loss": 0.3432,
+      "step": 50
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.21953710913658142,
+      "learning_rate": 0.000997839719251072,
+      "loss": 0.3343,
+      "step": 75
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.19770056009292603,
+      "learning_rate": 0.0009961357081585429,
+      "loss": 0.3165,
+      "step": 100
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.22986027598381042,
+      "learning_rate": 0.0009939420710212512,
+      "loss": 0.307,
+      "step": 125
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.21198776364326477,
+      "learning_rate": 0.0009912609726942104,
+      "loss": 0.3003,
+      "step": 150
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.3816479444503784,
+      "learning_rate": 0.0009880950590977764,
+      "loss": 0.2983,
+      "step": 175
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.2344156950712204,
+      "learning_rate": 0.0009844474546064435,
+      "loss": 0.3,
+      "step": 200
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.20885460078716278,
+      "learning_rate": 0.000980321758965464,
+      "loss": 0.2948,
+      "step": 225
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.2171780914068222,
+      "learning_rate": 0.0009757220437383345,
+      "loss": 0.2914,
+      "step": 250
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.20872080326080322,
+      "learning_rate": 0.0009706528482886534,
+      "loss": 0.2913,
+      "step": 275
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.22229866683483124,
+      "learning_rate": 0.0009651191753003186,
+      "loss": 0.3002,
+      "step": 300
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.19446779787540436,
+      "learning_rate": 0.0009591264858404809,
+      "loss": 0.2913,
+      "step": 325
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.20700472593307495,
+      "learning_rate": 0.000952680693970131,
+      "loss": 0.2935,
+      "step": 350
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.22668114304542542,
+      "learning_rate": 0.0009457881609076351,
+      "loss": 0.2832,
+      "step": 375
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.37962883710861206,
+      "learning_rate": 0.0009384556887509802,
+      "loss": 0.2839,
+      "step": 400
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.20619851350784302,
+      "learning_rate": 0.000930690513764925,
+      "loss": 0.2749,
+      "step": 425
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.21654820442199707,
+      "learning_rate": 0.0009225002992396796,
+      "loss": 0.2781,
+      "step": 450
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.216287299990654,
+      "learning_rate": 0.000913893127928164,
+      "loss": 0.2756,
+      "step": 475
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.22118358314037323,
+      "learning_rate": 0.0009048774940693062,
+      "loss": 0.2719,
+      "step": 500
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.21317917108535767,
+      "learning_rate": 0.0008954622950052542,
+      "loss": 0.2723,
+      "step": 525
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.277706116437912,
+      "learning_rate": 0.0008856568224007735,
+      "loss": 0.2589,
+      "step": 550
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.20005273818969727,
+      "learning_rate": 0.0008754707530734958,
+      "loss": 0.2773,
+      "step": 575
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.1920776069164276,
+      "learning_rate": 0.0008649141394440677,
+      "loss": 0.2621,
+      "step": 600
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.2016347497701645,
+      "learning_rate": 0.0008539973996156264,
+      "loss": 0.2767,
+      "step": 625
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.17412111163139343,
+      "learning_rate": 0.0008427313070923884,
+      "loss": 0.2656,
+      "step": 650
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.16364696621894836,
+      "learning_rate": 0.0008311269801475025,
+      "loss": 0.2563,
+      "step": 675
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.20392268896102905,
+      "learning_rate": 0.0008191958708506557,
+      "loss": 0.2561,
+      "step": 700
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.23820936679840088,
+      "learning_rate": 0.0008069497537662638,
+      "loss": 0.2628,
+      "step": 725
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.1817774921655655,
+      "learning_rate": 0.0007944007143333976,
+      "loss": 0.2585,
+      "step": 750
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.18769747018814087,
+      "learning_rate": 0.0007815611369389133,
+      "loss": 0.2472,
+      "step": 775
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.19526880979537964,
+      "learning_rate": 0.0007684436926955582,
+      "loss": 0.2512,
+      "step": 800
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.19683842360973358,
+      "learning_rate": 0.0007550613269371124,
+      "loss": 0.245,
+      "step": 825
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.18514588475227356,
+      "learning_rate": 0.0007414272464429068,
+      "loss": 0.2469,
+      "step": 850
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.15977084636688232,
+      "learning_rate": 0.0007275549064043269,
+      "loss": 0.2529,
+      "step": 875
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.17137399315834045,
+      "learning_rate": 0.0007134579971461626,
+      "loss": 0.2489,
+      "step": 900
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.19649483263492584,
+      "learning_rate": 0.0006991504306159115,
+      "loss": 0.2452,
+      "step": 925
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.1726703643798828,
+      "learning_rate": 0.0006846463266543652,
+      "loss": 0.2429,
+      "step": 950
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.18005579710006714,
+      "learning_rate": 0.0006699599990610323,
+      "loss": 0.2393,
+      "step": 975
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.2112029492855072,
+      "learning_rate": 0.0006551059414681455,
+      "loss": 0.2468,
+      "step": 1000
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.18817350268363953,
+      "learning_rate": 0.0006400988130371969,
+      "loss": 0.2457,
+      "step": 1025
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.1755235642194748,
+      "learning_rate": 0.0006249534239921153,
+      "loss": 0.2354,
+      "step": 1050
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.1836978793144226,
+      "learning_rate": 0.000609684721003363,
+      "loss": 0.2317,
+      "step": 1075
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.17404377460479736,
+      "learning_rate": 0.0005943077724373775,
+      "loss": 0.2324,
+      "step": 1100
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.183836430311203,
+      "learning_rate": 0.0005788377534859114,
+      "loss": 0.2405,
+      "step": 1125
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.15977102518081665,
+      "learning_rate": 0.0005632899311899521,
+      "loss": 0.2339,
+      "step": 1150
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.17550553381443024,
+      "learning_rate": 0.0005476796493729943,
+      "loss": 0.2365,
+      "step": 1175
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.15650643408298492,
+      "learning_rate": 0.0005320223134985392,
+      "loss": 0.2364,
+      "step": 1200
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.18774349987506866,
+      "learning_rate": 0.000516333375466762,
+      "loss": 0.2366,
+      "step": 1225
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.1758316457271576,
+      "learning_rate": 0.0005006283183653513,
+      "loss": 0.2279,
+      "step": 1250
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.16334125399589539,
+      "learning_rate": 0.0004849226411895716,
+      "loss": 0.1952,
+      "step": 1275
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.14510266482830048,
+      "learning_rate": 0.0004692318435466265,
+      "loss": 0.2013,
+      "step": 1300
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.16317233443260193,
+      "learning_rate": 0.0004535714103594162,
+      "loss": 0.198,
+      "step": 1325
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.17068150639533997,
+      "learning_rate": 0.0004379567965847896,
+      "loss": 0.2031,
+      "step": 1350
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.14690789580345154,
+      "learning_rate": 0.000422403411961367,
+      "loss": 0.2061,
+      "step": 1375
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.16131256520748138,
+      "learning_rate": 0.00040692660580198903,
+      "loss": 0.2037,
+      "step": 1400
+    },
+    {
+      "epoch": 1.1400000000000001,
+      "grad_norm": 0.16582365334033966,
+      "learning_rate": 0.00039154165184579736,
+      "loss": 0.1955,
+      "step": 1425
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.16103099286556244,
+      "learning_rate": 0.00037626373318489886,
+      "loss": 0.2029,
+      "step": 1450
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.1566431075334549,
+      "learning_rate": 0.00036110792728048633,
+      "loss": 0.1999,
+      "step": 1475
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.18320217728614807,
+      "learning_rate": 0.00034608919108320487,
+      "loss": 0.2002,
+      "step": 1500
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.15252584218978882,
+      "learning_rate": 0.0003312223462724472,
+      "loss": 0.2026,
+      "step": 1525
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.14527903497219086,
+      "learning_rate": 0.0003165220646291454,
+      "loss": 0.195,
+      "step": 1550
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.16251923143863678,
+      "learning_rate": 0.000302002853556495,
+      "loss": 0.1928,
+      "step": 1575
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.15034204721450806,
+      "learning_rate": 0.0002876790417628994,
+      "loss": 0.1896,
+      "step": 1600
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.16318462789058685,
+      "learning_rate": 0.00027356476512126383,
+      "loss": 0.1901,
+      "step": 1625
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.16164065897464752,
+      "learning_rate": 0.0002596739527185961,
+      "loss": 0.185,
+      "step": 1650
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.13852697610855103,
+      "learning_rate": 0.0002460203131096801,
+      "loss": 0.1887,
+      "step": 1675
+    },
+    {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.15581487119197845,
+      "learning_rate": 0.00023261732078838537,
+      "loss": 0.188,
+      "step": 1700
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.16509482264518738,
+      "learning_rate": 0.00021947820288997067,
+      "loss": 0.1895,
+      "step": 1725
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.1754249930381775,
+      "learning_rate": 0.00020661592613749636,
+      "loss": 0.1885,
+      "step": 1750
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.14935770630836487,
+      "learning_rate": 0.00019404318404523603,
+      "loss": 0.1898,
+      "step": 1775
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.16208966076374054,
+      "learning_rate": 0.00018177238439170883,
+      "loss": 0.1778,
+      "step": 1800
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 0.13969115912914276,
+      "learning_rate": 0.00016981563697470158,
+      "loss": 0.1843,
+      "step": 1825
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.18950283527374268,
+      "learning_rate": 0.00015818474166035906,
+      "loss": 0.1874,
+      "step": 1850
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.14980795979499817,
+      "learning_rate": 0.00014689117673814133,
+      "loss": 0.1884,
+      "step": 1875
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.16838234663009644,
+      "learning_rate": 0.00013594608759313833,
+      "loss": 0.186,
+      "step": 1900
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 0.17522183060646057,
+      "learning_rate": 0.00012536027570691938,
+      "loss": 0.1856,
+      "step": 1925
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.1533781737089157,
+      "learning_rate": 0.00011514418799777554,
+      "loss": 0.1753,
+      "step": 1950
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 0.17392344772815704,
+      "learning_rate": 0.0001053079065108728,
+      "loss": 0.1882,
+      "step": 1975
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.16130375862121582,
+      "learning_rate": 9.586113846848982e-05,
+      "loss": 0.1859,
+      "step": 2000
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 0.15282462537288666,
+      "learning_rate": 8.68132066901623e-05,
+      "loss": 0.1747,
+      "step": 2025
+    },
+    {
+      "epoch": 1.6400000000000001,
+      "grad_norm": 0.14969567954540253,
+      "learning_rate": 7.81730403921856e-05,
+      "loss": 0.1829,
+      "step": 2050
+    },
+    {
+      "epoch": 1.6600000000000001,
+      "grad_norm": 0.19932319223880768,
+      "learning_rate": 6.994916637555571e-05,
+      "loss": 0.1868,
+      "step": 2075
+    },
+    {
+      "epoch": 1.6800000000000002,
+      "grad_norm": 0.16214902698993683,
+      "learning_rate": 6.214970061104686e-05,
+      "loss": 0.1837,
+      "step": 2100
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.15913426876068115,
+      "learning_rate": 5.4782340229727555e-05,
+      "loss": 0.181,
+      "step": 2125
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.15780578553676605,
+      "learning_rate": 4.785435592682219e-05,
+      "loss": 0.1747,
+      "step": 2150
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 0.15023760497570038,
+      "learning_rate": 4.137258478641176e-05,
+      "loss": 0.191,
+      "step": 2175
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.15563510358333588,
+      "learning_rate": 3.534342353405834e-05,
+      "loss": 0.1827,
+      "step": 2200
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 0.1683374047279358,
+      "learning_rate": 2.9772822224008513e-05,
+      "loss": 0.1761,
+      "step": 2225
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.14578016102313995,
+      "learning_rate": 2.4666278367208418e-05,
+      "loss": 0.1844,
+      "step": 2250
+    },
+    {
+      "epoch": 1.8199999999999998,
+      "grad_norm": 0.1600412130355835,
+      "learning_rate": 2.0028831505924162e-05,
+      "loss": 0.1769,
+      "step": 2275
+    },
+    {
+      "epoch": 1.8399999999999999,
+      "grad_norm": 0.16546602547168732,
+      "learning_rate": 1.586505824032214e-05,
+      "loss": 0.1806,
+      "step": 2300
+    },
+    {
+      "epoch": 1.8599999999999999,
+      "grad_norm": 0.16472336649894714,
+      "learning_rate": 1.2179067711917014e-05,
+      "loss": 0.1841,
+      "step": 2325
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.15828381478786469,
+      "learning_rate": 8.974497548345395e-06,
+      "loss": 0.175,
+      "step": 2350
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.17364706099033356,
+      "learning_rate": 6.254510273466185e-06,
+      "loss": 0.1975,
+      "step": 2375
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.1747918426990509,
+      "learning_rate": 4.021790186331753e-06,
+      "loss": 0.1839,
+      "step": 2400
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 0.18869030475616455,
+      "learning_rate": 2.2785407121084233e-06,
+      "loss": 0.177,
+      "step": 2425
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.15464165806770325,
+      "learning_rate": 1.026482227562242e-06,
+      "loss": 0.1792,
+      "step": 2450
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 0.1583382487297058,
+      "learning_rate": 2.668503632545782e-07,
+      "loss": 0.185,
+      "step": 2475
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.1495935469865799,
+      "learning_rate": 3.947841241136452e-10,
+      "loss": 0.1809,
+      "step": 2500
+    },
+    {
+      "epoch": 2.0,
+      "step": 2500,
+      "total_flos": 1.62588235137024e+18,
+      "train_loss": 0.23306660480499267,
+      "train_runtime": 2262.3527,
+      "train_samples_per_second": 35.361,
+      "train_steps_per_second": 1.105
+    }
+  ],
+  "logging_steps": 25,
+  "max_steps": 2500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 0,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.62588235137024e+18,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}

nl_tasks/exprep/run_ex21_2ep/ft/adapter_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "T": 1.0,
+  "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+  "bias": "none",
+  "drop_out": 0.0,
+  "inference_mode": false,
+  "layers_to_transform": null,
+  "modules_to_save": null,
+  "num_rotations": 1,
+  "peft_type": "ROTATION",
+  "r": 16,
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "target_modules_to_skip": null,
+  "task_type": "CAUSAL_LM"
+}

nl_tasks/exprep/run_ex21_2ep/ft/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<unk>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

nl_tasks/exprep/run_ex21_2ep/ft/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

nl_tasks/exprep/run_ex21_2ep/ft/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

nl_tasks/exprep/run_ex21_2ep/ft/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "legacy": false,
+  "model_max_length": 512,
+  "pad_token": "<unk>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

nl_tasks/exprep/run_ex21_2ep/ft2/adapter_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "T": 1.0,
+  "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+  "bias": "none",
+  "drop_out": 0.0,
+  "inference_mode": true,
+  "layers_to_transform": null,
+  "modules_to_save": null,
+  "num_rotations": 1,
+  "peft_type": "ROTATION",
+  "r": 16,
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "target_modules_to_skip": null,
+  "task_type": "CAUSAL_LM"
+}

nl_tasks/exprep/run_ex21_2ep/ft2/adapter_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:838406364c781d904f6e794a55ddbd15163323b48ad1436b297f5155013a0054
+size 33602915

nl_tasks/exprep/run_ex21_2ep/output.txt ADDED Viewed

	@@ -0,0 +1,4 @@


1	+
2	+ MATH math MAX TOKEN = 1408, length==== 5000, math acc %====, 7.84
3	+
4	+ gsm8k MAX TOKEN = 1024, length==== 1319, gsm8k acc %====, 49.88627748294162

nl_tasks/exprep/run_ex21_2ep/trainer_state.json ADDED Viewed

	@@ -0,0 +1,743 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 100,
+  "global_step": 2500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.4475359320640564,
+      "learning_rate": 0.0009997726215503421,
+      "loss": 0.4344,
+      "step": 25
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.186162531375885,
+      "learning_rate": 0.0009990524226456182,
+      "loss": 0.3436,
+      "step": 50
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.21256987750530243,
+      "learning_rate": 0.000997839719251072,
+      "loss": 0.3324,
+      "step": 75
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.22403238713741302,
+      "learning_rate": 0.0009961357081585429,
+      "loss": 0.3146,
+      "step": 100
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.26591813564300537,
+      "learning_rate": 0.0009939420710212512,
+      "loss": 0.3064,
+      "step": 125
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.20775578916072845,
+      "learning_rate": 0.0009912609726942104,
+      "loss": 0.3012,
+      "step": 150
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.1952112317085266,
+      "learning_rate": 0.0009880950590977764,
+      "loss": 0.2985,
+      "step": 175
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.23744182288646698,
+      "learning_rate": 0.0009844474546064435,
+      "loss": 0.3004,
+      "step": 200
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.25152233242988586,
+      "learning_rate": 0.000980321758965464,
+      "loss": 0.2942,
+      "step": 225
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.2282615751028061,
+      "learning_rate": 0.0009757220437383345,
+      "loss": 0.2915,
+      "step": 250
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.24063201248645782,
+      "learning_rate": 0.0009706528482886534,
+      "loss": 0.2902,
+      "step": 275
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.2492981255054474,
+      "learning_rate": 0.0009651191753003186,
+      "loss": 0.302,
+      "step": 300
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.20875616371631622,
+      "learning_rate": 0.0009591264858404809,
+      "loss": 0.2944,
+      "step": 325
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.24029485881328583,
+      "learning_rate": 0.000952680693970131,
+      "loss": 0.297,
+      "step": 350
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.21902728080749512,
+      "learning_rate": 0.0009457881609076351,
+      "loss": 0.2863,
+      "step": 375
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.24253377318382263,
+      "learning_rate": 0.0009384556887509802,
+      "loss": 0.2847,
+      "step": 400
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.19963406026363373,
+      "learning_rate": 0.000930690513764925,
+      "loss": 0.2752,
+      "step": 425
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.2175171673297882,
+      "learning_rate": 0.0009225002992396796,
+      "loss": 0.2765,
+      "step": 450
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.20239321887493134,
+      "learning_rate": 0.000913893127928164,
+      "loss": 0.2739,
+      "step": 475
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.23220029473304749,
+      "learning_rate": 0.0009048774940693062,
+      "loss": 0.2706,
+      "step": 500
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.20775073766708374,
+      "learning_rate": 0.0008954622950052542,
+      "loss": 0.272,
+      "step": 525
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.2095000296831131,
+      "learning_rate": 0.0008856568224007735,
+      "loss": 0.2606,
+      "step": 550
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.22149509191513062,
+      "learning_rate": 0.0008754707530734958,
+      "loss": 0.2773,
+      "step": 575
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.20700635015964508,
+      "learning_rate": 0.0008649141394440677,
+      "loss": 0.2619,
+      "step": 600
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.18627957999706268,
+      "learning_rate": 0.0008539973996156264,
+      "loss": 0.2754,
+      "step": 625
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.18619582056999207,
+      "learning_rate": 0.0008427313070923884,
+      "loss": 0.2653,
+      "step": 650
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.16383656859397888,
+      "learning_rate": 0.0008311269801475025,
+      "loss": 0.2561,
+      "step": 675
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.2009345144033432,
+      "learning_rate": 0.0008191958708506557,
+      "loss": 0.2552,
+      "step": 700
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.21614684164524078,
+      "learning_rate": 0.0008069497537662638,
+      "loss": 0.2624,
+      "step": 725
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.1810019165277481,
+      "learning_rate": 0.0007944007143333976,
+      "loss": 0.2593,
+      "step": 750
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.18752247095108032,
+      "learning_rate": 0.0007815611369389133,
+      "loss": 0.2471,
+      "step": 775
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.22774486243724823,
+      "learning_rate": 0.0007684436926955582,
+      "loss": 0.253,
+      "step": 800
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.6058231592178345,
+      "learning_rate": 0.0007550613269371124,
+      "loss": 0.2452,
+      "step": 825
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.21049253642559052,
+      "learning_rate": 0.0007414272464429068,
+      "loss": 0.2473,
+      "step": 850
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.1695825457572937,
+      "learning_rate": 0.0007275549064043269,
+      "loss": 0.2535,
+      "step": 875
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.17602591216564178,
+      "learning_rate": 0.0007134579971461626,
+      "loss": 0.2489,
+      "step": 900
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.18434438109397888,
+      "learning_rate": 0.0006991504306159115,
+      "loss": 0.2463,
+      "step": 925
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.17386262118816376,
+      "learning_rate": 0.0006846463266543652,
+      "loss": 0.2432,
+      "step": 950
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.19186212122440338,
+      "learning_rate": 0.0006699599990610323,
+      "loss": 0.2404,
+      "step": 975
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.20263203978538513,
+      "learning_rate": 0.0006551059414681455,
+      "loss": 0.2467,
+      "step": 1000
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.22712233662605286,
+      "learning_rate": 0.0006400988130371969,
+      "loss": 0.2454,
+      "step": 1025
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.16690626740455627,
+      "learning_rate": 0.0006249534239921153,
+      "loss": 0.2347,
+      "step": 1050
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.173813134431839,
+      "learning_rate": 0.000609684721003363,
+      "loss": 0.2316,
+      "step": 1075
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.1734803467988968,
+      "learning_rate": 0.0005943077724373775,
+      "loss": 0.2321,
+      "step": 1100
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.18156838417053223,
+      "learning_rate": 0.0005788377534859114,
+      "loss": 0.2412,
+      "step": 1125
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.15780498087406158,
+      "learning_rate": 0.0005632899311899521,
+      "loss": 0.2343,
+      "step": 1150
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.16868476569652557,
+      "learning_rate": 0.0005476796493729943,
+      "loss": 0.2374,
+      "step": 1175
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.15904894471168518,
+      "learning_rate": 0.0005320223134985392,
+      "loss": 0.236,
+      "step": 1200
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.17931579053401947,
+      "learning_rate": 0.000516333375466762,
+      "loss": 0.2378,
+      "step": 1225
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.18797090649604797,
+      "learning_rate": 0.0005006283183653513,
+      "loss": 0.2267,
+      "step": 1250
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.17563988268375397,
+      "learning_rate": 0.0004849226411895716,
+      "loss": 0.1948,
+      "step": 1275
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.14821073412895203,
+      "learning_rate": 0.0004692318435466265,
+      "loss": 0.2021,
+      "step": 1300
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.16894803941249847,
+      "learning_rate": 0.0004535714103594162,
+      "loss": 0.1982,
+      "step": 1325
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.16558986902236938,
+      "learning_rate": 0.0004379567965847896,
+      "loss": 0.2029,
+      "step": 1350
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.14962488412857056,
+      "learning_rate": 0.000422403411961367,
+      "loss": 0.2071,
+      "step": 1375
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.1625063121318817,
+      "learning_rate": 0.00040692660580198903,
+      "loss": 0.2038,
+      "step": 1400
+    },
+    {
+      "epoch": 1.1400000000000001,
+      "grad_norm": 0.170761376619339,
+      "learning_rate": 0.00039154165184579736,
+      "loss": 0.1958,
+      "step": 1425
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.15951049327850342,
+      "learning_rate": 0.00037626373318489886,
+      "loss": 0.2021,
+      "step": 1450
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.14665117859840393,
+      "learning_rate": 0.00036110792728048633,
+      "loss": 0.1996,
+      "step": 1475
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.17691372334957123,
+      "learning_rate": 0.00034608919108320487,
+      "loss": 0.2006,
+      "step": 1500
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.16786377131938934,
+      "learning_rate": 0.0003312223462724472,
+      "loss": 0.2035,
+      "step": 1525
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.13940677046775818,
+      "learning_rate": 0.0003165220646291454,
+      "loss": 0.1957,
+      "step": 1550
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.1505332589149475,
+      "learning_rate": 0.000302002853556495,
+      "loss": 0.1949,
+      "step": 1575
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.17429719865322113,
+      "learning_rate": 0.0002876790417628994,
+      "loss": 0.1898,
+      "step": 1600
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.16113583743572235,
+      "learning_rate": 0.00027356476512126383,
+      "loss": 0.1911,
+      "step": 1625
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.17427127063274384,
+      "learning_rate": 0.0002596739527185961,
+      "loss": 0.1858,
+      "step": 1650
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.13851557672023773,
+      "learning_rate": 0.0002460203131096801,
+      "loss": 0.1897,
+      "step": 1675
+    },
+    {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.15388120710849762,
+      "learning_rate": 0.00023261732078838537,
+      "loss": 0.1884,
+      "step": 1700
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.16655150055885315,
+      "learning_rate": 0.00021947820288997067,
+      "loss": 0.1895,
+      "step": 1725
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.17470037937164307,
+      "learning_rate": 0.00020661592613749636,
+      "loss": 0.1882,
+      "step": 1750
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.15715032815933228,
+      "learning_rate": 0.00019404318404523603,
+      "loss": 0.1906,
+      "step": 1775
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.19270935654640198,
+      "learning_rate": 0.00018177238439170883,
+      "loss": 0.1783,
+      "step": 1800
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 0.14684396982192993,
+      "learning_rate": 0.00016981563697470158,
+      "loss": 0.1846,
+      "step": 1825
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.18586526811122894,
+      "learning_rate": 0.00015818474166035906,
+      "loss": 0.1883,
+      "step": 1850
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.169161856174469,
+      "learning_rate": 0.00014689117673814133,
+      "loss": 0.1869,
+      "step": 1875
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.17364977300167084,
+      "learning_rate": 0.00013594608759313833,
+      "loss": 0.1862,
+      "step": 1900
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 0.17725440859794617,
+      "learning_rate": 0.00012536027570691938,
+      "loss": 0.1858,
+      "step": 1925
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.1515907198190689,
+      "learning_rate": 0.00011514418799777554,
+      "loss": 0.1758,
+      "step": 1950
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 0.1727982610464096,
+      "learning_rate": 0.0001053079065108728,
+      "loss": 0.1884,
+      "step": 1975
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.15956495702266693,
+      "learning_rate": 9.586113846848982e-05,
+      "loss": 0.1863,
+      "step": 2000
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 0.14366982877254486,
+      "learning_rate": 8.68132066901623e-05,
+      "loss": 0.1744,
+      "step": 2025
+    },
+    {
+      "epoch": 1.6400000000000001,
+      "grad_norm": 0.15725454688072205,
+      "learning_rate": 7.81730403921856e-05,
+      "loss": 0.1827,
+      "step": 2050
+    },
+    {
+      "epoch": 1.6600000000000001,
+      "grad_norm": 0.13973355293273926,
+      "learning_rate": 6.994916637555571e-05,
+      "loss": 0.1865,
+      "step": 2075
+    },
+    {
+      "epoch": 1.6800000000000002,
+      "grad_norm": 0.16779528558254242,
+      "learning_rate": 6.214970061104686e-05,
+      "loss": 0.1843,
+      "step": 2100
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.15546084940433502,
+      "learning_rate": 5.4782340229727555e-05,
+      "loss": 0.1828,
+      "step": 2125
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.16239280998706818,
+      "learning_rate": 4.785435592682219e-05,
+      "loss": 0.1751,
+      "step": 2150
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 0.1440260261297226,
+      "learning_rate": 4.137258478641176e-05,
+      "loss": 0.1909,
+      "step": 2175
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.15595702826976776,
+      "learning_rate": 3.534342353405834e-05,
+      "loss": 0.1822,
+      "step": 2200
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 0.15584272146224976,
+      "learning_rate": 2.9772822224008513e-05,
+      "loss": 0.1769,
+      "step": 2225
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.15885038673877716,
+      "learning_rate": 2.4666278367208418e-05,
+      "loss": 0.1849,
+      "step": 2250
+    },
+    {
+      "epoch": 1.8199999999999998,
+      "grad_norm": 0.1490071564912796,
+      "learning_rate": 2.0028831505924162e-05,
+      "loss": 0.177,
+      "step": 2275
+    },
+    {
+      "epoch": 1.8399999999999999,
+      "grad_norm": 0.16527613997459412,
+      "learning_rate": 1.586505824032214e-05,
+      "loss": 0.1799,
+      "step": 2300
+    },
+    {
+      "epoch": 1.8599999999999999,
+      "grad_norm": 0.1699392944574356,
+      "learning_rate": 1.2179067711917014e-05,
+      "loss": 0.1848,
+      "step": 2325
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.1474064588546753,
+      "learning_rate": 8.974497548345395e-06,
+      "loss": 0.1751,
+      "step": 2350
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.17423242330551147,
+      "learning_rate": 6.254510273466185e-06,
+      "loss": 0.1978,
+      "step": 2375
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.16892707347869873,
+      "learning_rate": 4.021790186331753e-06,
+      "loss": 0.1844,
+      "step": 2400
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 0.19659648835659027,
+      "learning_rate": 2.2785407121084233e-06,
+      "loss": 0.1774,
+      "step": 2425
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.14788737893104553,
+      "learning_rate": 1.026482227562242e-06,
+      "loss": 0.1794,
+      "step": 2450
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 0.15226797759532928,
+      "learning_rate": 2.668503632545782e-07,
+      "loss": 0.1848,
+      "step": 2475
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.13729043304920197,
+      "learning_rate": 3.947841241136452e-10,
+      "loss": 0.1815,
+      "step": 2500
+    },
+    {
+      "epoch": 2.0,
+      "step": 2500,
+      "total_flos": 1.62588235137024e+18,
+      "train_loss": 0.22959588203430176,
+      "train_runtime": 2193.5556,
+      "train_samples_per_second": 36.47,
+      "train_steps_per_second": 1.14
+    }
+  ],
+  "logging_steps": 25,
+  "max_steps": 2500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 0,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.62588235137024e+18,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}

nl_tasks/exprep/run_ex22_2ep/ft/adapter_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "T": 1.0,
+  "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+  "bias": "none",
+  "drop_out": 0.0,
+  "inference_mode": false,
+  "layers_to_transform": null,
+  "modules_to_save": null,
+  "num_rotations": 1,
+  "peft_type": "ROTATION",
+  "r": 16,
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "target_modules_to_skip": null,
+  "task_type": "CAUSAL_LM"
+}

nl_tasks/exprep/run_ex22_2ep/ft/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<unk>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

nl_tasks/exprep/run_ex22_2ep/ft/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

nl_tasks/exprep/run_ex22_2ep/ft/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

nl_tasks/exprep/run_ex22_2ep/ft/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "legacy": false,
+  "model_max_length": 512,
+  "pad_token": "<unk>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

nl_tasks/exprep/run_ex22_2ep/ft2/adapter_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "T": 1.0,
+  "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+  "bias": "none",
+  "drop_out": 0.0,
+  "inference_mode": true,
+  "layers_to_transform": null,
+  "modules_to_save": null,
+  "num_rotations": 1,
+  "peft_type": "ROTATION",
+  "r": 16,
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "target_modules_to_skip": null,
+  "task_type": "CAUSAL_LM"
+}

nl_tasks/exprep/run_ex22_2ep/ft2/adapter_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1eb0157103be76af903b8368fbc6c8cd4cd0b7723f5114dab11ecd5f66ca403a
+size 33602915

nl_tasks/exprep/run_ex22_2ep/output.txt ADDED Viewed

	@@ -0,0 +1,4 @@


1	+
2	+ MATH math MAX TOKEN = 1408, length==== 5000, math acc %====, 7.82
3	+
4	+ gsm8k MAX TOKEN = 1024, length==== 1319, gsm8k acc %====, 49.73464746019712

nl_tasks/exprep/run_ex22_2ep/trainer_state.json ADDED Viewed

	@@ -0,0 +1,743 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 100,
+  "global_step": 2500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.18882547318935394,
+      "learning_rate": 0.0009997726215503421,
+      "loss": 0.4826,
+      "step": 25
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.1954464167356491,
+      "learning_rate": 0.0009990524226456182,
+      "loss": 0.3389,
+      "step": 50
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.2120068222284317,
+      "learning_rate": 0.000997839719251072,
+      "loss": 0.3338,
+      "step": 75
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.22273743152618408,
+      "learning_rate": 0.0009961357081585429,
+      "loss": 0.315,
+      "step": 100
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.34039777517318726,
+      "learning_rate": 0.0009939420710212512,
+      "loss": 0.3076,
+      "step": 125
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.23580029606819153,
+      "learning_rate": 0.0009912609726942104,
+      "loss": 0.3012,
+      "step": 150
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.19439291954040527,
+      "learning_rate": 0.0009880950590977764,
+      "loss": 0.2978,
+      "step": 175
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.2402939349412918,
+      "learning_rate": 0.0009844474546064435,
+      "loss": 0.3006,
+      "step": 200
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.2308524250984192,
+      "learning_rate": 0.000980321758965464,
+      "loss": 0.2958,
+      "step": 225
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.22124530375003815,
+      "learning_rate": 0.0009757220437383345,
+      "loss": 0.2908,
+      "step": 250
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.23367883265018463,
+      "learning_rate": 0.0009706528482886534,
+      "loss": 0.2902,
+      "step": 275
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.2590882182121277,
+      "learning_rate": 0.0009651191753003186,
+      "loss": 0.3007,
+      "step": 300
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.21626244485378265,
+      "learning_rate": 0.0009591264858404809,
+      "loss": 0.2903,
+      "step": 325
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.24277330935001373,
+      "learning_rate": 0.000952680693970131,
+      "loss": 0.2927,
+      "step": 350
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.21639184653759003,
+      "learning_rate": 0.0009457881609076351,
+      "loss": 0.2848,
+      "step": 375
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.28448352217674255,
+      "learning_rate": 0.0009384556887509802,
+      "loss": 0.2853,
+      "step": 400
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.19180361926555634,
+      "learning_rate": 0.000930690513764925,
+      "loss": 0.2745,
+      "step": 425
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.2169312685728073,
+      "learning_rate": 0.0009225002992396796,
+      "loss": 0.2767,
+      "step": 450
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.19704866409301758,
+      "learning_rate": 0.000913893127928164,
+      "loss": 0.2726,
+      "step": 475
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.21796244382858276,
+      "learning_rate": 0.0009048774940693062,
+      "loss": 0.2712,
+      "step": 500
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.1845843344926834,
+      "learning_rate": 0.0008954622950052542,
+      "loss": 0.2704,
+      "step": 525
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.26246872544288635,
+      "learning_rate": 0.0008856568224007735,
+      "loss": 0.2589,
+      "step": 550
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.24823100864887238,
+      "learning_rate": 0.0008754707530734958,
+      "loss": 0.278,
+      "step": 575
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.27403074502944946,
+      "learning_rate": 0.0008649141394440677,
+      "loss": 0.2637,
+      "step": 600
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.19122664630413055,
+      "learning_rate": 0.0008539973996156264,
+      "loss": 0.2774,
+      "step": 625
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.19332978129386902,
+      "learning_rate": 0.0008427313070923884,
+      "loss": 0.2659,
+      "step": 650
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.1755480021238327,
+      "learning_rate": 0.0008311269801475025,
+      "loss": 0.2566,
+      "step": 675
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.19162705540657043,
+      "learning_rate": 0.0008191958708506557,
+      "loss": 0.2563,
+      "step": 700
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.2055402398109436,
+      "learning_rate": 0.0008069497537662638,
+      "loss": 0.2628,
+      "step": 725
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.2031162679195404,
+      "learning_rate": 0.0007944007143333976,
+      "loss": 0.2602,
+      "step": 750
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.19989728927612305,
+      "learning_rate": 0.0007815611369389133,
+      "loss": 0.2486,
+      "step": 775
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.20189844071865082,
+      "learning_rate": 0.0007684436926955582,
+      "loss": 0.2521,
+      "step": 800
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.22558659315109253,
+      "learning_rate": 0.0007550613269371124,
+      "loss": 0.2455,
+      "step": 825
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.18982113897800446,
+      "learning_rate": 0.0007414272464429068,
+      "loss": 0.2474,
+      "step": 850
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.16264374554157257,
+      "learning_rate": 0.0007275549064043269,
+      "loss": 0.2537,
+      "step": 875
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.17321237921714783,
+      "learning_rate": 0.0007134579971461626,
+      "loss": 0.2488,
+      "step": 900
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.17718718945980072,
+      "learning_rate": 0.0006991504306159115,
+      "loss": 0.2463,
+      "step": 925
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.18864493072032928,
+      "learning_rate": 0.0006846463266543652,
+      "loss": 0.2434,
+      "step": 950
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.1813870519399643,
+      "learning_rate": 0.0006699599990610323,
+      "loss": 0.2386,
+      "step": 975
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.21876706182956696,
+      "learning_rate": 0.0006551059414681455,
+      "loss": 0.2458,
+      "step": 1000
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.18162848055362701,
+      "learning_rate": 0.0006400988130371969,
+      "loss": 0.2447,
+      "step": 1025
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.1748015582561493,
+      "learning_rate": 0.0006249534239921153,
+      "loss": 0.2349,
+      "step": 1050
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.1879139095544815,
+      "learning_rate": 0.000609684721003363,
+      "loss": 0.2326,
+      "step": 1075
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.17181971669197083,
+      "learning_rate": 0.0005943077724373775,
+      "loss": 0.2313,
+      "step": 1100
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.17912223935127258,
+      "learning_rate": 0.0005788377534859114,
+      "loss": 0.2407,
+      "step": 1125
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.15652291476726532,
+      "learning_rate": 0.0005632899311899521,
+      "loss": 0.2337,
+      "step": 1150
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.17756380140781403,
+      "learning_rate": 0.0005476796493729943,
+      "loss": 0.2376,
+      "step": 1175
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.162031352519989,
+      "learning_rate": 0.0005320223134985392,
+      "loss": 0.2361,
+      "step": 1200
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.17621825635433197,
+      "learning_rate": 0.000516333375466762,
+      "loss": 0.2381,
+      "step": 1225
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.1739271581172943,
+      "learning_rate": 0.0005006283183653513,
+      "loss": 0.2269,
+      "step": 1250
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.17948779463768005,
+      "learning_rate": 0.0004849226411895716,
+      "loss": 0.1949,
+      "step": 1275
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.14728158712387085,
+      "learning_rate": 0.0004692318435466265,
+      "loss": 0.2019,
+      "step": 1300
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.17756135761737823,
+      "learning_rate": 0.0004535714103594162,
+      "loss": 0.1973,
+      "step": 1325
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.17123408615589142,
+      "learning_rate": 0.0004379567965847896,
+      "loss": 0.2029,
+      "step": 1350
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.15491564571857452,
+      "learning_rate": 0.000422403411961367,
+      "loss": 0.2064,
+      "step": 1375
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.16682790219783783,
+      "learning_rate": 0.00040692660580198903,
+      "loss": 0.2042,
+      "step": 1400
+    },
+    {
+      "epoch": 1.1400000000000001,
+      "grad_norm": 0.16679638624191284,
+      "learning_rate": 0.00039154165184579736,
+      "loss": 0.196,
+      "step": 1425
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.16937030851840973,
+      "learning_rate": 0.00037626373318489886,
+      "loss": 0.2032,
+      "step": 1450
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.15793775022029877,
+      "learning_rate": 0.00036110792728048633,
+      "loss": 0.1999,
+      "step": 1475
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.1879836767911911,
+      "learning_rate": 0.00034608919108320487,
+      "loss": 0.2011,
+      "step": 1500
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.15118607878684998,
+      "learning_rate": 0.0003312223462724472,
+      "loss": 0.2025,
+      "step": 1525
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.1501808613538742,
+      "learning_rate": 0.0003165220646291454,
+      "loss": 0.1955,
+      "step": 1550
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.15337912738323212,
+      "learning_rate": 0.000302002853556495,
+      "loss": 0.1949,
+      "step": 1575
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.15560321509838104,
+      "learning_rate": 0.0002876790417628994,
+      "loss": 0.1897,
+      "step": 1600
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.16006162762641907,
+      "learning_rate": 0.00027356476512126383,
+      "loss": 0.1901,
+      "step": 1625
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.17619337141513824,
+      "learning_rate": 0.0002596739527185961,
+      "loss": 0.1856,
+      "step": 1650
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.14214114844799042,
+      "learning_rate": 0.0002460203131096801,
+      "loss": 0.1898,
+      "step": 1675
+    },
+    {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.15580157935619354,
+      "learning_rate": 0.00023261732078838537,
+      "loss": 0.1887,
+      "step": 1700
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.15826581418514252,
+      "learning_rate": 0.00021947820288997067,
+      "loss": 0.1892,
+      "step": 1725
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.17906101047992706,
+      "learning_rate": 0.00020661592613749636,
+      "loss": 0.1889,
+      "step": 1750
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.15785814821720123,
+      "learning_rate": 0.00019404318404523603,
+      "loss": 0.1897,
+      "step": 1775
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.16421250998973846,
+      "learning_rate": 0.00018177238439170883,
+      "loss": 0.1783,
+      "step": 1800
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 0.14710333943367004,
+      "learning_rate": 0.00016981563697470158,
+      "loss": 0.1844,
+      "step": 1825
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.19257409870624542,
+      "learning_rate": 0.00015818474166035906,
+      "loss": 0.1881,
+      "step": 1850
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.1730620265007019,
+      "learning_rate": 0.00014689117673814133,
+      "loss": 0.1884,
+      "step": 1875
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.1598033905029297,
+      "learning_rate": 0.00013594608759313833,
+      "loss": 0.1869,
+      "step": 1900
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 0.16913259029388428,
+      "learning_rate": 0.00012536027570691938,
+      "loss": 0.1853,
+      "step": 1925
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.15878750383853912,
+      "learning_rate": 0.00011514418799777554,
+      "loss": 0.1762,
+      "step": 1950
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 0.17556917667388916,
+      "learning_rate": 0.0001053079065108728,
+      "loss": 0.1884,
+      "step": 1975
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.1578860580921173,
+      "learning_rate": 9.586113846848982e-05,
+      "loss": 0.1863,
+      "step": 2000
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 0.14570850133895874,
+      "learning_rate": 8.68132066901623e-05,
+      "loss": 0.1752,
+      "step": 2025
+    },
+    {
+      "epoch": 1.6400000000000001,
+      "grad_norm": 0.15816844999790192,
+      "learning_rate": 7.81730403921856e-05,
+      "loss": 0.1833,
+      "step": 2050
+    },
+    {
+      "epoch": 1.6600000000000001,
+      "grad_norm": 0.15224431455135345,
+      "learning_rate": 6.994916637555571e-05,
+      "loss": 0.1872,
+      "step": 2075
+    },
+    {
+      "epoch": 1.6800000000000002,
+      "grad_norm": 0.17057636380195618,
+      "learning_rate": 6.214970061104686e-05,
+      "loss": 0.1841,
+      "step": 2100
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.15019550919532776,
+      "learning_rate": 5.4782340229727555e-05,
+      "loss": 0.1827,
+      "step": 2125
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.16247619688510895,
+      "learning_rate": 4.785435592682219e-05,
+      "loss": 0.1755,
+      "step": 2150
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 0.14806580543518066,
+      "learning_rate": 4.137258478641176e-05,
+      "loss": 0.1914,
+      "step": 2175
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.159623384475708,
+      "learning_rate": 3.534342353405834e-05,
+      "loss": 0.1821,
+      "step": 2200
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 0.16583149135112762,
+      "learning_rate": 2.9772822224008513e-05,
+      "loss": 0.1773,
+      "step": 2225
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.16739916801452637,
+      "learning_rate": 2.4666278367208418e-05,
+      "loss": 0.1854,
+      "step": 2250
+    },
+    {
+      "epoch": 1.8199999999999998,
+      "grad_norm": 0.1582023650407791,
+      "learning_rate": 2.0028831505924162e-05,
+      "loss": 0.1766,
+      "step": 2275
+    },
+    {
+      "epoch": 1.8399999999999999,
+      "grad_norm": 0.17545472085475922,
+      "learning_rate": 1.586505824032214e-05,
+      "loss": 0.1803,
+      "step": 2300
+    },
+    {
+      "epoch": 1.8599999999999999,
+      "grad_norm": 0.16175879538059235,
+      "learning_rate": 1.2179067711917014e-05,
+      "loss": 0.1836,
+      "step": 2325
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.1553802192211151,
+      "learning_rate": 8.974497548345395e-06,
+      "loss": 0.1752,
+      "step": 2350
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.18709959089756012,
+      "learning_rate": 6.254510273466185e-06,
+      "loss": 0.1979,
+      "step": 2375
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.16829761862754822,
+      "learning_rate": 4.021790186331753e-06,
+      "loss": 0.185,
+      "step": 2400
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 0.18192219734191895,
+      "learning_rate": 2.2785407121084233e-06,
+      "loss": 0.1776,
+      "step": 2425
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.14456026256084442,
+      "learning_rate": 1.026482227562242e-06,
+      "loss": 0.1798,
+      "step": 2450
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 0.16220049560070038,
+      "learning_rate": 2.668503632545782e-07,
+      "loss": 0.1844,
+      "step": 2475
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.14179222285747528,
+      "learning_rate": 3.947841241136452e-10,
+      "loss": 0.1811,
+      "step": 2500
+    },
+    {
+      "epoch": 2.0,
+      "step": 2500,
+      "total_flos": 1.62588235137024e+18,
+      "train_loss": 0.23000563049316405,
+      "train_runtime": 2190.0291,
+      "train_samples_per_second": 36.529,
+      "train_steps_per_second": 1.142
+    }
+  ],
+  "logging_steps": 25,
+  "max_steps": 2500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 0,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.62588235137024e+18,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}

nl_tasks/exprep/run_ex23_3ep/ft/adapter_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "T": 1.0,
+  "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+  "bias": "none",
+  "drop_out": 0.0,
+  "inference_mode": false,
+  "layers_to_transform": null,
+  "modules_to_save": null,
+  "num_rotations": 1,
+  "peft_type": "ROTATION",
+  "r": 16,
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "target_modules_to_skip": null,
+  "task_type": "CAUSAL_LM"
+}

nl_tasks/exprep/run_ex23_3ep/ft/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<unk>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

nl_tasks/exprep/run_ex23_3ep/ft/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

nl_tasks/exprep/run_ex23_3ep/ft/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

nl_tasks/exprep/run_ex23_3ep/ft/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "legacy": false,
+  "model_max_length": 512,
+  "pad_token": "<unk>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

nl_tasks/exprep/run_ex23_3ep/ft2/adapter_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "T": 1.0,
+  "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+  "bias": "none",
+  "drop_out": 0.0,
+  "inference_mode": true,
+  "layers_to_transform": null,
+  "modules_to_save": null,
+  "num_rotations": 1,
+  "peft_type": "ROTATION",
+  "r": 16,
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "target_modules_to_skip": null,
+  "task_type": "CAUSAL_LM"
+}

nl_tasks/exprep/run_ex23_3ep/ft2/adapter_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7d974f91a03a6c0817e827b06f4abd84cbc16d81f95bfe0311ee0d480ebebd8a
+size 33602915

nl_tasks/exprep/run_ex23_3ep/output.txt ADDED Viewed

	@@ -0,0 +1,4 @@


1	+
2	+ MATH math MAX TOKEN = 1408, length==== 5000, math acc %====, 8.4
3	+
4	+ gsm8k MAX TOKEN = 1024, length==== 1319, gsm8k acc %====, 49.65883244882487

nl_tasks/exprep/run_ex23_3ep/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1093 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 100,
+  "global_step": 3750,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.2009446620941162,
+      "learning_rate": 0.0009998989386555814,
+      "loss": 0.5927,
+      "step": 25
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.2013162076473236,
+      "learning_rate": 0.0009995787805744778,
+      "loss": 0.3393,
+      "step": 50
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.2067354917526245,
+      "learning_rate": 0.000999039490728981,
+      "loss": 0.3324,
+      "step": 75
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.2190377116203308,
+      "learning_rate": 0.000998281305669441,
+      "loss": 0.3145,
+      "step": 100
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.2233731746673584,
+      "learning_rate": 0.0009973045579608833,
+      "loss": 0.3059,
+      "step": 125
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.22983631491661072,
+      "learning_rate": 0.0009961096760371347,
+      "loss": 0.3007,
+      "step": 150
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.201463520526886,
+      "learning_rate": 0.0009946971840128981,
+      "loss": 0.2997,
+      "step": 175
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.3527778685092926,
+      "learning_rate": 0.0009930677014538588,
+      "loss": 0.3008,
+      "step": 200
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.25101253390312195,
+      "learning_rate": 0.0009912219431049217,
+      "loss": 0.2969,
+      "step": 225
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.2339434176683426,
+      "learning_rate": 0.0009891607185767018,
+      "loss": 0.2929,
+      "step": 250
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.27121809124946594,
+      "learning_rate": 0.0009868849319904012,
+      "loss": 0.2907,
+      "step": 275
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.2264082282781601,
+      "learning_rate": 0.000984395581581232,
+      "loss": 0.2996,
+      "step": 300
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.21018308401107788,
+      "learning_rate": 0.000981693759260558,
+      "loss": 0.2932,
+      "step": 325
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.23615872859954834,
+      "learning_rate": 0.0009787806501369446,
+      "loss": 0.2946,
+      "step": 350
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.24151696264743805,
+      "learning_rate": 0.0009756575319963324,
+      "loss": 0.2842,
+      "step": 375
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.22058282792568207,
+      "learning_rate": 0.0009723257747415584,
+      "loss": 0.2852,
+      "step": 400
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.19068972766399384,
+      "learning_rate": 0.00096878683979147,
+      "loss": 0.2743,
+      "step": 425
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.22631610929965973,
+      "learning_rate": 0.000965042279439899,
+      "loss": 0.2782,
+      "step": 450
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.2208271324634552,
+      "learning_rate": 0.0009610937361747747,
+      "loss": 0.2731,
+      "step": 475
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.22609329223632812,
+      "learning_rate": 0.0009569429419576737,
+      "loss": 0.2727,
+      "step": 500
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.19579938054084778,
+      "learning_rate": 0.0009525917174641245,
+      "loss": 0.2727,
+      "step": 525
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.20854559540748596,
+      "learning_rate": 0.0009480419712849994,
+      "loss": 0.2606,
+      "step": 550
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.21399244666099548,
+      "learning_rate": 0.0009432956990893433,
+      "loss": 0.2789,
+      "step": 575
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.21186676621437073,
+      "learning_rate": 0.0009383549827490066,
+      "loss": 0.2631,
+      "step": 600
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.2264874279499054,
+      "learning_rate": 0.0009332219894254686,
+      "loss": 0.2782,
+      "step": 625
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.20289474725723267,
+      "learning_rate": 0.0009278989706192479,
+      "loss": 0.2675,
+      "step": 650
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.165157750248909,
+      "learning_rate": 0.0009223882611823205,
+      "loss": 0.2581,
+      "step": 675
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.21674442291259766,
+      "learning_rate": 0.0009166922782939757,
+      "loss": 0.2587,
+      "step": 700
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.20473162829875946,
+      "learning_rate": 0.0009108135204005628,
+      "loss": 0.265,
+      "step": 725
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.21469838917255402,
+      "learning_rate": 0.0009047545661195884,
+      "loss": 0.2607,
+      "step": 750
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.18564464151859283,
+      "learning_rate": 0.0008985180731086505,
+      "loss": 0.2502,
+      "step": 775
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.19200780987739563,
+      "learning_rate": 0.0008921067768997017,
+      "loss": 0.2546,
+      "step": 800
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.2028547078371048,
+      "learning_rate": 0.0008855234896991544,
+      "loss": 0.2474,
+      "step": 825
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.18282544612884521,
+      "learning_rate": 0.0008787710991543547,
+      "loss": 0.2495,
+      "step": 850
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.16127045452594757,
+      "learning_rate": 0.0008718525670869639,
+      "loss": 0.2564,
+      "step": 875
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.17876853048801422,
+      "learning_rate": 0.0008647709281938065,
+      "loss": 0.2523,
+      "step": 900
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.18054188787937164,
+      "learning_rate": 0.0008575292887157515,
+      "loss": 0.2481,
+      "step": 925
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.18751810491085052,
+      "learning_rate": 0.0008501308250752123,
+      "loss": 0.2461,
+      "step": 950
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.15981389582157135,
+      "learning_rate": 0.0008425787824828631,
+      "loss": 0.2428,
+      "step": 975
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.21012845635414124,
+      "learning_rate": 0.0008348764735141823,
+      "loss": 0.2514,
+      "step": 1000
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.20988446474075317,
+      "learning_rate": 0.0008270272766564472,
+      "loss": 0.2498,
+      "step": 1025
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.1669292151927948,
+      "learning_rate": 0.000819034634826818,
+      "loss": 0.2382,
+      "step": 1050
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.18733477592468262,
+      "learning_rate": 0.0008109020538621606,
+      "loss": 0.2363,
+      "step": 1075
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.17588213086128235,
+      "learning_rate": 0.0008026331009812703,
+      "loss": 0.2367,
+      "step": 1100
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.15529929101467133,
+      "learning_rate": 0.0007942314032201719,
+      "loss": 0.2443,
+      "step": 1125
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.16888514161109924,
+      "learning_rate": 0.0007857006458411826,
+      "loss": 0.238,
+      "step": 1150
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.17457760870456696,
+      "learning_rate": 0.0007770445707164325,
+      "loss": 0.241,
+      "step": 1175
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.1416597217321396,
+      "learning_rate": 0.0007682669746865577,
+      "loss": 0.2397,
+      "step": 1200
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.19417856633663177,
+      "learning_rate": 0.0007593717078952787,
+      "loss": 0.2427,
+      "step": 1225
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.18347010016441345,
+      "learning_rate": 0.0007503626721006019,
+      "loss": 0.2315,
+      "step": 1250
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.1707679182291031,
+      "learning_rate": 0.0007412438189633781,
+      "loss": 0.2006,
+      "step": 1275
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.14253617823123932,
+      "learning_rate": 0.0007320191483139742,
+      "loss": 0.2094,
+      "step": 1300
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.16302773356437683,
+      "learning_rate": 0.0007226927063978153,
+      "loss": 0.2046,
+      "step": 1325
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.17237479984760284,
+      "learning_rate": 0.0007132685841005674,
+      "loss": 0.2106,
+      "step": 1350
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.14080215990543365,
+      "learning_rate": 0.0007037509151537404,
+      "loss": 0.2139,
+      "step": 1375
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.15111273527145386,
+      "learning_rate": 0.0006941438743214963,
+      "loss": 0.2118,
+      "step": 1400
+    },
+    {
+      "epoch": 1.1400000000000001,
+      "grad_norm": 0.16434533894062042,
+      "learning_rate": 0.0006844516755694598,
+      "loss": 0.2036,
+      "step": 1425
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.1573331654071808,
+      "learning_rate": 0.0006746785702163335,
+      "loss": 0.2109,
+      "step": 1450
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.14934468269348145,
+      "learning_rate": 0.0006648288450691298,
+      "loss": 0.2088,
+      "step": 1475
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.1662275493144989,
+      "learning_rate": 0.0006549068205428343,
+      "loss": 0.2091,
+      "step": 1500
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.15690551698207855,
+      "learning_rate": 0.0006449168487653305,
+      "loss": 0.2125,
+      "step": 1525
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.13499705493450165,
+      "learning_rate": 0.0006348633116684117,
+      "loss": 0.2049,
+      "step": 1550
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.14175556600093842,
+      "learning_rate": 0.0006247506190657209,
+      "loss": 0.2033,
+      "step": 1575
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.14965271949768066,
+      "learning_rate": 0.0006145832067184614,
+      "loss": 0.1996,
+      "step": 1600
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.15024539828300476,
+      "learning_rate": 0.0006043655343897249,
+      "loss": 0.2009,
+      "step": 1625
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.16027085483074188,
+      "learning_rate": 0.0005941020838882917,
+      "loss": 0.1952,
+      "step": 1650
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.15198029577732086,
+      "learning_rate": 0.000583797357102762,
+      "loss": 0.1996,
+      "step": 1675
+    },
+    {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.14285215735435486,
+      "learning_rate": 0.0005734558740268789,
+      "loss": 0.1987,
+      "step": 1700
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.14066115021705627,
+      "learning_rate": 0.000563082170776908,
+      "loss": 0.1985,
+      "step": 1725
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.16289477050304413,
+      "learning_rate": 0.0005526807976019493,
+      "loss": 0.1998,
+      "step": 1750
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.12963451445102692,
+      "learning_rate": 0.0005422563168880455,
+      "loss": 0.1999,
+      "step": 1775
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.16663536429405212,
+      "learning_rate": 0.0005318133011569704,
+      "loss": 0.1892,
+      "step": 1800
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 0.1475083827972412,
+      "learning_rate": 0.0005213563310605686,
+      "loss": 0.1952,
+      "step": 1825
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.1764904111623764,
+      "learning_rate": 0.00051088999337153,
+      "loss": 0.1981,
+      "step": 1850
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.144892156124115,
+      "learning_rate": 0.0005004188789714811,
+      "loss": 0.1991,
+      "step": 1875
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.15162664651870728,
+      "learning_rate": 0.0004899475808372714,
+      "loss": 0.1968,
+      "step": 1900
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 0.15749579668045044,
+      "learning_rate": 0.0004794806920263417,
+      "loss": 0.1956,
+      "step": 1925
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.1367015242576599,
+      "learning_rate": 0.0004690228036620589,
+      "loss": 0.1863,
+      "step": 1950
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 0.16991527378559113,
+      "learning_rate": 0.0004585785029198959,
+      "loss": 0.1989,
+      "step": 1975
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.14471600949764252,
+      "learning_rate": 0.00044815237101534535,
+      "loss": 0.1986,
+      "step": 2000
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 0.130776047706604,
+      "learning_rate": 0.0004377489811944478,
+      "loss": 0.1849,
+      "step": 2025
+    },
+    {
+      "epoch": 1.6400000000000001,
+      "grad_norm": 0.1394679993391037,
+      "learning_rate": 0.00042737289672781367,
+      "loss": 0.1942,
+      "step": 2050
+    },
+    {
+      "epoch": 1.6600000000000001,
+      "grad_norm": 0.13585664331912994,
+      "learning_rate": 0.0004170286689090228,
+      "loss": 0.1971,
+      "step": 2075
+    },
+    {
+      "epoch": 1.6800000000000002,
+      "grad_norm": 0.15923821926116943,
+      "learning_rate": 0.0004067208350582768,
+      "loss": 0.1946,
+      "step": 2100
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.13468892872333527,
+      "learning_rate": 0.0003964539165321794,
+      "loss": 0.1931,
+      "step": 2125
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.14454448223114014,
+      "learning_rate": 0.00038623241674052113,
+      "loss": 0.1852,
+      "step": 2150
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 0.12442506849765778,
+      "learning_rate": 0.00037606081917093416,
+      "loss": 0.2022,
+      "step": 2175
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.1347043663263321,
+      "learning_rate": 0.0003659435854222869,
+      "loss": 0.1928,
+      "step": 2200
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 0.14994265139102936,
+      "learning_rate": 0.0003558851532476796,
+      "loss": 0.1857,
+      "step": 2225
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.13082821667194366,
+      "learning_rate": 0.0003458899346078979,
+      "loss": 0.1961,
+      "step": 2250
+    },
+    {
+      "epoch": 1.8199999999999998,
+      "grad_norm": 0.13206897675991058,
+      "learning_rate": 0.00033596231373618247,
+      "loss": 0.1866,
+      "step": 2275
+    },
+    {
+      "epoch": 1.8399999999999999,
+      "grad_norm": 0.15052682161331177,
+      "learning_rate": 0.0003261066452151587,
+      "loss": 0.1895,
+      "step": 2300
+    },
+    {
+      "epoch": 1.8599999999999999,
+      "grad_norm": 0.14751003682613373,
+      "learning_rate": 0.0003163272520667726,
+      "loss": 0.1935,
+      "step": 2325
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.13202938437461853,
+      "learning_rate": 0.00030662842385607126,
+      "loss": 0.1848,
+      "step": 2350
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.16639895737171173,
+      "learning_rate": 0.0002970144148096568,
+      "loss": 0.2072,
+      "step": 2375
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.1549963802099228,
+      "learning_rate": 0.0002874894419496431,
+      "loss": 0.193,
+      "step": 2400
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 0.15094412863254547,
+      "learning_rate": 0.00027805768324393014,
+      "loss": 0.1854,
+      "step": 2425
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.12347867339849472,
+      "learning_rate": 0.0002687232757736082,
+      "loss": 0.1868,
+      "step": 2450
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 0.1314866691827774,
+      "learning_rate": 0.0002594903139182996,
+      "loss": 0.1927,
+      "step": 2475
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.12323761731386185,
+      "learning_rate": 0.0002503628475602256,
+      "loss": 0.1888,
+      "step": 2500
+    },
+    {
+      "epoch": 2.02,
+      "grad_norm": 0.14616355299949646,
+      "learning_rate": 0.00024134488030779655,
+      "loss": 0.159,
+      "step": 2525
+    },
+    {
+      "epoch": 2.04,
+      "grad_norm": 0.16476809978485107,
+      "learning_rate": 0.00023244036773949656,
+      "loss": 0.1527,
+      "step": 2550
+    },
+    {
+      "epoch": 2.06,
+      "grad_norm": 0.1476244181394577,
+      "learning_rate": 0.00022365321566883433,
+      "loss": 0.1579,
+      "step": 2575
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 0.15825843811035156,
+      "learning_rate": 0.0002149872784311262,
+      "loss": 0.1511,
+      "step": 2600
+    },
+    {
+      "epoch": 2.1,
+      "grad_norm": 0.13213805854320526,
+      "learning_rate": 0.00020644635719285705,
+      "loss": 0.1577,
+      "step": 2625
+    },
+    {
+      "epoch": 2.12,
+      "grad_norm": 0.12865176796913147,
+      "learning_rate": 0.0001980341982843616,
+      "loss": 0.1523,
+      "step": 2650
+    },
+    {
+      "epoch": 2.14,
+      "grad_norm": 0.14385676383972168,
+      "learning_rate": 0.0001897544915565616,
+      "loss": 0.1582,
+      "step": 2675
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 0.15633924305438995,
+      "learning_rate": 0.0001816108687624749,
+      "loss": 0.1551,
+      "step": 2700
+    },
+    {
+      "epoch": 2.18,
+      "grad_norm": 0.13521474599838257,
+      "learning_rate": 0.00017360690196420813,
+      "loss": 0.1597,
+      "step": 2725
+    },
+    {
+      "epoch": 2.2,
+      "grad_norm": 0.14471761882305145,
+      "learning_rate": 0.0001657461019661326,
+      "loss": 0.1562,
+      "step": 2750
+    },
+    {
+      "epoch": 2.22,
+      "grad_norm": 0.13948746025562286,
+      "learning_rate": 0.0001580319167749294,
+      "loss": 0.1518,
+      "step": 2775
+    },
+    {
+      "epoch": 2.24,
+      "grad_norm": 0.14780576527118683,
+      "learning_rate": 0.00015046773008717967,
+      "loss": 0.1529,
+      "step": 2800
+    },
+    {
+      "epoch": 2.26,
+      "grad_norm": 0.13603238761425018,
+      "learning_rate": 0.00014305685980516293,
+      "loss": 0.1597,
+      "step": 2825
+    },
+    {
+      "epoch": 2.2800000000000002,
+      "grad_norm": 0.14286421239376068,
+      "learning_rate": 0.00013580255658151685,
+      "loss": 0.1503,
+      "step": 2850
+    },
+    {
+      "epoch": 2.3,
+      "grad_norm": 0.1361972689628601,
+      "learning_rate": 0.00012870800239339237,
+      "loss": 0.1535,
+      "step": 2875
+    },
+    {
+      "epoch": 2.32,
+      "grad_norm": 0.14035378396511078,
+      "learning_rate": 0.00012177630914673327,
+      "loss": 0.16,
+      "step": 2900
+    },
+    {
+      "epoch": 2.34,
+      "grad_norm": 0.12597130239009857,
+      "learning_rate": 0.00011501051731129224,
+      "loss": 0.1501,
+      "step": 2925
+    },
+    {
+      "epoch": 2.36,
+      "grad_norm": 0.1465565711259842,
+      "learning_rate": 0.00010841359458697985,
+      "loss": 0.1565,
+      "step": 2950
+    },
+    {
+      "epoch": 2.38,
+      "grad_norm": 0.14047019183635712,
+      "learning_rate": 0.00010198843460213336,
+      "loss": 0.1515,
+      "step": 2975
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 0.1407793015241623,
+      "learning_rate": 9.573785564427562e-05,
+      "loss": 0.1517,
+      "step": 3000
+    },
+    {
+      "epoch": 2.42,
+      "grad_norm": 0.15099190175533295,
+      "learning_rate": 8.966459942392108e-05,
+      "loss": 0.1528,
+      "step": 3025
+    },
+    {
+      "epoch": 2.44,
+      "grad_norm": 0.14255790412425995,
+      "learning_rate": 8.3771329871971e-05,
+      "loss": 0.1554,
+      "step": 3050
+    },
+    {
+      "epoch": 2.46,
+      "grad_norm": 0.14905491471290588,
+      "learning_rate": 7.806063197122521e-05,
+      "loss": 0.1548,
+      "step": 3075
+    },
+    {
+      "epoch": 2.48,
+      "grad_norm": 0.15821439027786255,
+      "learning_rate": 7.253501062252338e-05,
+      "loss": 0.1503,
+      "step": 3100
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 0.15232159197330475,
+      "learning_rate": 6.719688954601267e-05,
+      "loss": 0.1568,
+      "step": 3125
+    },
+    {
+      "epoch": 2.52,
+      "grad_norm": 0.14521776139736176,
+      "learning_rate": 6.204861021802333e-05,
+      "loss": 0.1506,
+      "step": 3150
+    },
+    {
+      "epoch": 2.54,
+      "grad_norm": 0.14902536571025848,
+      "learning_rate": 5.709243084402127e-05,
+      "loss": 0.1533,
+      "step": 3175
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 0.13219137489795685,
+      "learning_rate": 5.2330525368083193e-05,
+      "loss": 0.1509,
+      "step": 3200
+    },
+    {
+      "epoch": 2.58,
+      "grad_norm": 0.13296589255332947,
+      "learning_rate": 4.776498251933292e-05,
+      "loss": 0.1482,
+      "step": 3225
+    },
+    {
+      "epoch": 2.6,
+      "grad_norm": 0.15772312879562378,
+      "learning_rate": 4.3397804895756956e-05,
+      "loss": 0.1489,
+      "step": 3250
+    },
+    {
+      "epoch": 2.62,
+      "grad_norm": 0.14949175715446472,
+      "learning_rate": 3.923090808579727e-05,
+      "loss": 0.1505,
+      "step": 3275
+    },
+    {
+      "epoch": 2.64,
+      "grad_norm": 0.14660567045211792,
+      "learning_rate": 3.5266119828111955e-05,
+      "loss": 0.1503,
+      "step": 3300
+    },
+    {
+      "epoch": 2.66,
+      "grad_norm": 0.1468629688024521,
+      "learning_rate": 3.150517920986851e-05,
+      "loss": 0.1538,
+      "step": 3325
+    },
+    {
+      "epoch": 2.68,
+      "grad_norm": 0.1596725881099701,
+      "learning_rate": 2.794973590392219e-05,
+      "loss": 0.156,
+      "step": 3350
+    },
+    {
+      "epoch": 2.7,
+      "grad_norm": 0.14169612526893616,
+      "learning_rate": 2.460134944521547e-05,
+      "loss": 0.1512,
+      "step": 3375
+    },
+    {
+      "epoch": 2.7199999999999998,
+      "grad_norm": 0.16790783405303955,
+      "learning_rate": 2.1461488546714426e-05,
+      "loss": 0.1608,
+      "step": 3400
+    },
+    {
+      "epoch": 2.74,
+      "grad_norm": 0.12738870084285736,
+      "learning_rate": 1.853153045518252e-05,
+      "loss": 0.153,
+      "step": 3425
+    },
+    {
+      "epoch": 2.76,
+      "grad_norm": 0.13329529762268066,
+      "learning_rate": 1.581276034707463e-05,
+      "loss": 0.1439,
+      "step": 3450
+    },
+    {
+      "epoch": 2.7800000000000002,
+      "grad_norm": 0.15731698274612427,
+      "learning_rate": 1.3306370764816389e-05,
+      "loss": 0.1485,
+      "step": 3475
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 0.14650867879390717,
+      "learning_rate": 1.1013461093715594e-05,
+      "loss": 0.1493,
+      "step": 3500
+    },
+    {
+      "epoch": 2.82,
+      "grad_norm": 0.1446894407272339,
+      "learning_rate": 8.935037079735309e-06,
+      "loss": 0.1519,
+      "step": 3525
+    },
+    {
+      "epoch": 2.84,
+      "grad_norm": 0.15621596574783325,
+      "learning_rate": 7.072010388340655e-06,
+      "loss": 0.147,
+      "step": 3550
+    },
+    {
+      "epoch": 2.86,
+      "grad_norm": 0.14237834513187408,
+      "learning_rate": 5.425198204612069e-06,
+      "loss": 0.1456,
+      "step": 3575
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 0.15466976165771484,
+      "learning_rate": 3.995322874800922e-06,
+      "loss": 0.1537,
+      "step": 3600
+    },
+    {
+      "epoch": 2.9,
+      "grad_norm": 0.14236176013946533,
+      "learning_rate": 2.7830115894847407e-06,
+      "loss": 0.1528,
+      "step": 3625
+    },
+    {
+      "epoch": 2.92,
+      "grad_norm": 0.16872696578502655,
+      "learning_rate": 1.7887961084605553e-06,
+      "loss": 0.1509,
+      "step": 3650
+    },
+    {
+      "epoch": 2.94,
+      "grad_norm": 0.14598286151885986,
+      "learning_rate": 1.013112527497473e-06,
+      "loss": 0.1466,
+      "step": 3675
+    },
+    {
+      "epoch": 2.96,
+      "grad_norm": 0.1556728333234787,
+      "learning_rate": 4.563010870506368e-07,
+      "loss": 0.1482,
+      "step": 3700
+    },
+    {
+      "epoch": 2.98,
+      "grad_norm": 0.13495859503746033,
+      "learning_rate": 1.1860602302066203e-07,
+      "loss": 0.1472,
+      "step": 3725
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 0.1348726600408554,
+      "learning_rate": 1.7545962355258739e-10,
+      "loss": 0.1518,
+      "step": 3750
+    },
+    {
+      "epoch": 3.0,
+      "step": 3750,
+      "total_flos": 2.43882352705536e+18,
+      "train_loss": 0.20868528219858806,
+      "train_runtime": 3281.7804,
+      "train_samples_per_second": 36.566,
+      "train_steps_per_second": 1.143
+    }
+  ],
+  "logging_steps": 25,
+  "max_steps": 3750,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 0,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.43882352705536e+18,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}

nl_tasks/exprep/run_ex24_3ep/ft/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<unk>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

nl_tasks/exprep/run_ex24_3ep/ft/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "legacy": false,
+  "model_max_length": 512,
+  "pad_token": "<unk>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

nl_tasks/exprep/run_ex24_3ep/output.txt ADDED Viewed

	@@ -0,0 +1,4 @@


1	+
2	+ MATH math MAX TOKEN = 1408, length==== 5000, math acc %====, 8.32
3	+
4	+ gsm8k MAX TOKEN = 1024, length==== 1319, gsm8k acc %====, 50.49279757391963

nl_tasks/exprep/run_ex24_3ep/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1093 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 100,
+  "global_step": 3750,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.21860866248607635,
+      "learning_rate": 0.0009998989386555814,
+      "loss": 0.435,
+      "step": 25
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.1902492791414261,
+      "learning_rate": 0.0009995787805744778,
+      "loss": 0.3384,
+      "step": 50
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.2332111895084381,
+      "learning_rate": 0.000999039490728981,
+      "loss": 0.332,
+      "step": 75
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.21917028725147247,
+      "learning_rate": 0.000998281305669441,
+      "loss": 0.3147,
+      "step": 100
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.2263868749141693,
+      "learning_rate": 0.0009973045579608833,
+      "loss": 0.3063,
+      "step": 125
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.21854978799819946,
+      "learning_rate": 0.0009961096760371347,
+      "loss": 0.3011,
+      "step": 150
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.21285760402679443,
+      "learning_rate": 0.0009946971840128981,
+      "loss": 0.3008,
+      "step": 175
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.22584690153598785,
+      "learning_rate": 0.0009930677014538588,
+      "loss": 0.3011,
+      "step": 200
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.2559281587600708,
+      "learning_rate": 0.0009912219431049217,
+      "loss": 0.2959,
+      "step": 225
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.2262907326221466,
+      "learning_rate": 0.0009891607185767018,
+      "loss": 0.292,
+      "step": 250
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.22938783466815948,
+      "learning_rate": 0.0009868849319904012,
+      "loss": 0.2913,
+      "step": 275
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.22400376200675964,
+      "learning_rate": 0.000984395581581232,
+      "loss": 0.3003,
+      "step": 300
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.20974433422088623,
+      "learning_rate": 0.000981693759260558,
+      "loss": 0.2925,
+      "step": 325
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.26473161578178406,
+      "learning_rate": 0.0009787806501369446,
+      "loss": 0.2937,
+      "step": 350
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.2237643599510193,
+      "learning_rate": 0.0009756575319963324,
+      "loss": 0.2845,
+      "step": 375
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.23942594230175018,
+      "learning_rate": 0.0009723257747415584,
+      "loss": 0.2846,
+      "step": 400
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.18676090240478516,
+      "learning_rate": 0.00096878683979147,
+      "loss": 0.2743,
+      "step": 425
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.2044542133808136,
+      "learning_rate": 0.000965042279439899,
+      "loss": 0.2765,
+      "step": 450
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.22793416678905487,
+      "learning_rate": 0.0009610937361747747,
+      "loss": 0.2751,
+      "step": 475
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.22194074094295502,
+      "learning_rate": 0.0009569429419576737,
+      "loss": 0.2721,
+      "step": 500
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.19625091552734375,
+      "learning_rate": 0.0009525917174641245,
+      "loss": 0.274,
+      "step": 525
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.21780937910079956,
+      "learning_rate": 0.0009480419712849994,
+      "loss": 0.2618,
+      "step": 550
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.20032677054405212,
+      "learning_rate": 0.0009432956990893433,
+      "loss": 0.2781,
+      "step": 575
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.1942606121301651,
+      "learning_rate": 0.0009383549827490066,
+      "loss": 0.2633,
+      "step": 600
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.20562711358070374,
+      "learning_rate": 0.0009332219894254686,
+      "loss": 0.2775,
+      "step": 625
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.9733890295028687,
+      "learning_rate": 0.0009278989706192479,
+      "loss": 0.2679,
+      "step": 650
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.17540724575519562,
+      "learning_rate": 0.0009223882611823205,
+      "loss": 0.2611,
+      "step": 675
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.24498209357261658,
+      "learning_rate": 0.0009166922782939757,
+      "loss": 0.2624,
+      "step": 700
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.20900648832321167,
+      "learning_rate": 0.0009108135204005628,
+      "loss": 0.2661,
+      "step": 725
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.2016984522342682,
+      "learning_rate": 0.0009047545661195884,
+      "loss": 0.2622,
+      "step": 750
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.19367337226867676,
+      "learning_rate": 0.0008985180731086505,
+      "loss": 0.2505,
+      "step": 775
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.1933523416519165,
+      "learning_rate": 0.0008921067768997017,
+      "loss": 0.255,
+      "step": 800
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.2022523581981659,
+      "learning_rate": 0.0008855234896991544,
+      "loss": 0.2486,
+      "step": 825
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.18164286017417908,
+      "learning_rate": 0.0008787710991543547,
+      "loss": 0.2491,
+      "step": 850
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.15638867020606995,
+      "learning_rate": 0.0008718525670869639,
+      "loss": 0.2564,
+      "step": 875
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.1735961139202118,
+      "learning_rate": 0.0008647709281938065,
+      "loss": 0.2522,
+      "step": 900
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.18064841628074646,
+      "learning_rate": 0.0008575292887157515,
+      "loss": 0.2489,
+      "step": 925
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.1719369888305664,
+      "learning_rate": 0.0008501308250752123,
+      "loss": 0.2469,
+      "step": 950
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.16980446875095367,
+      "learning_rate": 0.0008425787824828631,
+      "loss": 0.2428,
+      "step": 975
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.2074533998966217,
+      "learning_rate": 0.0008348764735141823,
+      "loss": 0.2486,
+      "step": 1000
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.19303739070892334,
+      "learning_rate": 0.0008270272766564472,
+      "loss": 0.2474,
+      "step": 1025
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.16771027445793152,
+      "learning_rate": 0.000819034634826818,
+      "loss": 0.2394,
+      "step": 1050
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.18652425706386566,
+      "learning_rate": 0.0008109020538621606,
+      "loss": 0.2349,
+      "step": 1075
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.18362785875797272,
+      "learning_rate": 0.0008026331009812703,
+      "loss": 0.2368,
+      "step": 1100
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.17980653047561646,
+      "learning_rate": 0.0007942314032201719,
+      "loss": 0.2438,
+      "step": 1125
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.15695154666900635,
+      "learning_rate": 0.0007857006458411826,
+      "loss": 0.2379,
+      "step": 1150
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.161969855427742,
+      "learning_rate": 0.0007770445707164325,
+      "loss": 0.2408,
+      "step": 1175
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.1580193191766739,
+      "learning_rate": 0.0007682669746865577,
+      "loss": 0.2405,
+      "step": 1200
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.16899335384368896,
+      "learning_rate": 0.0007593717078952787,
+      "loss": 0.243,
+      "step": 1225
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.17789579927921295,
+      "learning_rate": 0.0007503626721006019,
+      "loss": 0.2314,
+      "step": 1250
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.17090484499931335,
+      "learning_rate": 0.0007412438189633781,
+      "loss": 0.2001,
+      "step": 1275
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.13509897887706757,
+      "learning_rate": 0.0007320191483139742,
+      "loss": 0.2074,
+      "step": 1300
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.18126724660396576,
+      "learning_rate": 0.0007226927063978153,
+      "loss": 0.2038,
+      "step": 1325
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.2104470133781433,
+      "learning_rate": 0.0007132685841005674,
+      "loss": 0.21,
+      "step": 1350
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.1306389570236206,
+      "learning_rate": 0.0007037509151537404,
+      "loss": 0.2128,
+      "step": 1375
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.1495552808046341,
+      "learning_rate": 0.0006941438743214963,
+      "loss": 0.2115,
+      "step": 1400
+    },
+    {
+      "epoch": 1.1400000000000001,
+      "grad_norm": 0.16927556693553925,
+      "learning_rate": 0.0006844516755694598,
+      "loss": 0.2032,
+      "step": 1425
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.16095060110092163,
+      "learning_rate": 0.0006746785702163335,
+      "loss": 0.2111,
+      "step": 1450
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.1520528346300125,
+      "learning_rate": 0.0006648288450691298,
+      "loss": 0.2077,
+      "step": 1475
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.17617478966712952,
+      "learning_rate": 0.0006549068205428343,
+      "loss": 0.2088,
+      "step": 1500
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.17085497081279755,
+      "learning_rate": 0.0006449168487653305,
+      "loss": 0.2125,
+      "step": 1525
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.14262431859970093,
+      "learning_rate": 0.0006348633116684117,
+      "loss": 0.2053,
+      "step": 1550
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.15984103083610535,
+      "learning_rate": 0.0006247506190657209,
+      "loss": 0.2039,
+      "step": 1575
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.1432267129421234,
+      "learning_rate": 0.0006145832067184614,
+      "loss": 0.1994,
+      "step": 1600
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.1414625495672226,
+      "learning_rate": 0.0006043655343897249,
+      "loss": 0.1997,
+      "step": 1625
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.1499958485364914,
+      "learning_rate": 0.0005941020838882917,
+      "loss": 0.195,
+      "step": 1650
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.12970398366451263,
+      "learning_rate": 0.000583797357102762,
+      "loss": 0.1992,
+      "step": 1675
+    },
+    {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.13817213475704193,
+      "learning_rate": 0.0005734558740268789,
+      "loss": 0.1974,
+      "step": 1700
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.15048491954803467,
+      "learning_rate": 0.000563082170776908,
+      "loss": 0.1986,
+      "step": 1725
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.1596592664718628,
+      "learning_rate": 0.0005526807976019493,
+      "loss": 0.1989,
+      "step": 1750
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.1446324735879898,
+      "learning_rate": 0.0005422563168880455,
+      "loss": 0.2013,
+      "step": 1775
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.15277068316936493,
+      "learning_rate": 0.0005318133011569704,
+      "loss": 0.1894,
+      "step": 1800
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 0.13572098314762115,
+      "learning_rate": 0.0005213563310605686,
+      "loss": 0.1955,
+      "step": 1825
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.174700066447258,
+      "learning_rate": 0.00051088999337153,
+      "loss": 0.199,
+      "step": 1850
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.14586439728736877,
+      "learning_rate": 0.0005004188789714811,
+      "loss": 0.1993,
+      "step": 1875
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.15438227355480194,
+      "learning_rate": 0.0004899475808372714,
+      "loss": 0.1963,
+      "step": 1900
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 0.1574651598930359,
+      "learning_rate": 0.0004794806920263417,
+      "loss": 0.1961,
+      "step": 1925
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.13583463430404663,
+      "learning_rate": 0.0004690228036620589,
+      "loss": 0.1865,
+      "step": 1950
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 0.16844166815280914,
+      "learning_rate": 0.0004585785029198959,
+      "loss": 0.1989,
+      "step": 1975
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.1481410562992096,
+      "learning_rate": 0.00044815237101534535,
+      "loss": 0.198,
+      "step": 2000
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 0.13595865666866302,
+      "learning_rate": 0.0004377489811944478,
+      "loss": 0.1852,
+      "step": 2025
+    },
+    {
+      "epoch": 1.6400000000000001,
+      "grad_norm": 0.13887101411819458,
+      "learning_rate": 0.00042737289672781367,
+      "loss": 0.1941,
+      "step": 2050
+    },
+    {
+      "epoch": 1.6600000000000001,
+      "grad_norm": 0.1371852606534958,
+      "learning_rate": 0.0004170286689090228,
+      "loss": 0.1969,
+      "step": 2075
+    },
+    {
+      "epoch": 1.6800000000000002,
+      "grad_norm": 0.1503254920244217,
+      "learning_rate": 0.0004067208350582768,
+      "loss": 0.1948,
+      "step": 2100
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.14083200693130493,
+      "learning_rate": 0.0003964539165321794,
+      "loss": 0.1926,
+      "step": 2125
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.13528388738632202,
+      "learning_rate": 0.00038623241674052113,
+      "loss": 0.1857,
+      "step": 2150
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 0.13011986017227173,
+      "learning_rate": 0.00037606081917093416,
+      "loss": 0.2014,
+      "step": 2175
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.13956771790981293,
+      "learning_rate": 0.0003659435854222869,
+      "loss": 0.1927,
+      "step": 2200
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 0.1562594622373581,
+      "learning_rate": 0.0003558851532476796,
+      "loss": 0.1856,
+      "step": 2225
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.13830530643463135,
+      "learning_rate": 0.0003458899346078979,
+      "loss": 0.1955,
+      "step": 2250
+    },
+    {
+      "epoch": 1.8199999999999998,
+      "grad_norm": 0.1343943476676941,
+      "learning_rate": 0.00033596231373618247,
+      "loss": 0.1864,
+      "step": 2275
+    },
+    {
+      "epoch": 1.8399999999999999,
+      "grad_norm": 0.15038716793060303,
+      "learning_rate": 0.0003261066452151587,
+      "loss": 0.1902,
+      "step": 2300
+    },
+    {
+      "epoch": 1.8599999999999999,
+      "grad_norm": 0.1379692554473877,
+      "learning_rate": 0.0003163272520667726,
+      "loss": 0.1935,
+      "step": 2325
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.1431427299976349,
+      "learning_rate": 0.00030662842385607126,
+      "loss": 0.1844,
+      "step": 2350
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.16267234086990356,
+      "learning_rate": 0.0002970144148096568,
+      "loss": 0.2061,
+      "step": 2375
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.14323991537094116,
+      "learning_rate": 0.0002874894419496431,
+      "loss": 0.192,
+      "step": 2400
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 0.1614535003900528,
+      "learning_rate": 0.00027805768324393014,
+      "loss": 0.186,
+      "step": 2425
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.1243644580245018,
+      "learning_rate": 0.0002687232757736082,
+      "loss": 0.1872,
+      "step": 2450
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 0.13049174845218658,
+      "learning_rate": 0.0002594903139182996,
+      "loss": 0.1922,
+      "step": 2475
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.11963597685098648,
+      "learning_rate": 0.0002503628475602256,
+      "loss": 0.1877,
+      "step": 2500
+    },
+    {
+      "epoch": 2.02,
+      "grad_norm": 0.13807468116283417,
+      "learning_rate": 0.00024134488030779655,
+      "loss": 0.1585,
+      "step": 2525
+    },
+    {
+      "epoch": 2.04,
+      "grad_norm": 0.15679411590099335,
+      "learning_rate": 0.00023244036773949656,
+      "loss": 0.152,
+      "step": 2550
+    },
+    {
+      "epoch": 2.06,
+      "grad_norm": 0.13696733117103577,
+      "learning_rate": 0.00022365321566883433,
+      "loss": 0.1586,
+      "step": 2575
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 0.16110782325267792,
+      "learning_rate": 0.0002149872784311262,
+      "loss": 0.1501,
+      "step": 2600
+    },
+    {
+      "epoch": 2.1,
+      "grad_norm": 0.1372232884168625,
+      "learning_rate": 0.00020644635719285705,
+      "loss": 0.1568,
+      "step": 2625
+    },
+    {
+      "epoch": 2.12,
+      "grad_norm": 0.13187378644943237,
+      "learning_rate": 0.0001980341982843616,
+      "loss": 0.1521,
+      "step": 2650
+    },
+    {
+      "epoch": 2.14,
+      "grad_norm": 0.13496607542037964,
+      "learning_rate": 0.0001897544915565616,
+      "loss": 0.1579,
+      "step": 2675
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 0.15261121094226837,
+      "learning_rate": 0.0001816108687624749,
+      "loss": 0.155,
+      "step": 2700
+    },
+    {
+      "epoch": 2.18,
+      "grad_norm": 0.14363905787467957,
+      "learning_rate": 0.00017360690196420813,
+      "loss": 0.1594,
+      "step": 2725
+    },
+    {
+      "epoch": 2.2,
+      "grad_norm": 0.14778032898902893,
+      "learning_rate": 0.0001657461019661326,
+      "loss": 0.1563,
+      "step": 2750
+    },
+    {
+      "epoch": 2.22,
+      "grad_norm": 0.1397523283958435,
+      "learning_rate": 0.0001580319167749294,
+      "loss": 0.151,
+      "step": 2775
+    },
+    {
+      "epoch": 2.24,
+      "grad_norm": 0.13365571200847626,
+      "learning_rate": 0.00015046773008717967,
+      "loss": 0.1537,
+      "step": 2800
+    },
+    {
+      "epoch": 2.26,
+      "grad_norm": 0.1419825255870819,
+      "learning_rate": 0.00014305685980516293,
+      "loss": 0.1591,
+      "step": 2825
+    },
+    {
+      "epoch": 2.2800000000000002,
+      "grad_norm": 0.14078690111637115,
+      "learning_rate": 0.00013580255658151685,
+      "loss": 0.149,
+      "step": 2850
+    },
+    {
+      "epoch": 2.3,
+      "grad_norm": 0.1345941126346588,
+      "learning_rate": 0.00012870800239339237,
+      "loss": 0.1539,
+      "step": 2875
+    },
+    {
+      "epoch": 2.32,
+      "grad_norm": 0.12599675357341766,
+      "learning_rate": 0.00012177630914673327,
+      "loss": 0.16,
+      "step": 2900
+    },
+    {
+      "epoch": 2.34,
+      "grad_norm": 0.12124225497245789,
+      "learning_rate": 0.00011501051731129224,
+      "loss": 0.1497,
+      "step": 2925
+    },
+    {
+      "epoch": 2.36,
+      "grad_norm": 0.14786431193351746,
+      "learning_rate": 0.00010841359458697985,
+      "loss": 0.1562,
+      "step": 2950
+    },
+    {
+      "epoch": 2.38,
+      "grad_norm": 0.14894621074199677,
+      "learning_rate": 0.00010198843460213336,
+      "loss": 0.1505,
+      "step": 2975
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 0.13606901466846466,
+      "learning_rate": 9.573785564427562e-05,
+      "loss": 0.1518,
+      "step": 3000
+    },
+    {
+      "epoch": 2.42,
+      "grad_norm": 0.1460190862417221,
+      "learning_rate": 8.966459942392108e-05,
+      "loss": 0.1519,
+      "step": 3025
+    },
+    {
+      "epoch": 2.44,
+      "grad_norm": 0.14195576310157776,
+      "learning_rate": 8.3771329871971e-05,
+      "loss": 0.1552,
+      "step": 3050
+    },
+    {
+      "epoch": 2.46,
+      "grad_norm": 0.1606581211090088,
+      "learning_rate": 7.806063197122521e-05,
+      "loss": 0.1552,
+      "step": 3075
+    },
+    {
+      "epoch": 2.48,
+      "grad_norm": 0.17038539052009583,
+      "learning_rate": 7.253501062252338e-05,
+      "loss": 0.1506,
+      "step": 3100
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 0.15008100867271423,
+      "learning_rate": 6.719688954601267e-05,
+      "loss": 0.1569,
+      "step": 3125
+    },
+    {
+      "epoch": 2.52,
+      "grad_norm": 0.1496666818857193,
+      "learning_rate": 6.204861021802333e-05,
+      "loss": 0.1497,
+      "step": 3150
+    },
+    {
+      "epoch": 2.54,
+      "grad_norm": 0.15540142357349396,
+      "learning_rate": 5.709243084402127e-05,
+      "loss": 0.1535,
+      "step": 3175
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 0.13507093489170074,
+      "learning_rate": 5.2330525368083193e-05,
+      "loss": 0.1513,
+      "step": 3200
+    },
+    {
+      "epoch": 2.58,
+      "grad_norm": 0.12770436704158783,
+      "learning_rate": 4.776498251933292e-05,
+      "loss": 0.1474,
+      "step": 3225
+    },
+    {
+      "epoch": 2.6,
+      "grad_norm": 0.15767820179462433,
+      "learning_rate": 4.3397804895756956e-05,
+      "loss": 0.1483,
+      "step": 3250
+    },
+    {
+      "epoch": 2.62,
+      "grad_norm": 0.15003468096256256,
+      "learning_rate": 3.923090808579727e-05,
+      "loss": 0.1499,
+      "step": 3275
+    },
+    {
+      "epoch": 2.64,
+      "grad_norm": 0.13899756968021393,
+      "learning_rate": 3.5266119828111955e-05,
+      "loss": 0.1494,
+      "step": 3300
+    },
+    {
+      "epoch": 2.66,
+      "grad_norm": 0.1829822063446045,
+      "learning_rate": 3.150517920986851e-05,
+      "loss": 0.1536,
+      "step": 3325
+    },
+    {
+      "epoch": 2.68,
+      "grad_norm": 0.14924921095371246,
+      "learning_rate": 2.794973590392219e-05,
+      "loss": 0.1551,
+      "step": 3350
+    },
+    {
+      "epoch": 2.7,
+      "grad_norm": 0.1427408903837204,
+      "learning_rate": 2.460134944521547e-05,
+      "loss": 0.1513,
+      "step": 3375
+    },
+    {
+      "epoch": 2.7199999999999998,
+      "grad_norm": 0.1617504507303238,
+      "learning_rate": 2.1461488546714426e-05,
+      "loss": 0.1604,
+      "step": 3400
+    },
+    {
+      "epoch": 2.74,
+      "grad_norm": 0.11996972560882568,
+      "learning_rate": 1.853153045518252e-05,
+      "loss": 0.1532,
+      "step": 3425
+    },
+    {
+      "epoch": 2.76,
+      "grad_norm": 0.13687242567539215,
+      "learning_rate": 1.581276034707463e-05,
+      "loss": 0.1435,
+      "step": 3450
+    },
+    {
+      "epoch": 2.7800000000000002,
+      "grad_norm": 0.15579059720039368,
+      "learning_rate": 1.3306370764816389e-05,
+      "loss": 0.1479,
+      "step": 3475
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 0.18821687996387482,
+      "learning_rate": 1.1013461093715594e-05,
+      "loss": 0.1485,
+      "step": 3500
+    },
+    {
+      "epoch": 2.82,
+      "grad_norm": 0.1499922275543213,
+      "learning_rate": 8.935037079735309e-06,
+      "loss": 0.1511,
+      "step": 3525
+    },
+    {
+      "epoch": 2.84,
+      "grad_norm": 0.16106846928596497,
+      "learning_rate": 7.072010388340655e-06,
+      "loss": 0.1474,
+      "step": 3550
+    },
+    {
+      "epoch": 2.86,
+      "grad_norm": 0.14959703385829926,
+      "learning_rate": 5.425198204612069e-06,
+      "loss": 0.1459,
+      "step": 3575
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 0.1529090404510498,
+      "learning_rate": 3.995322874800922e-06,
+      "loss": 0.1528,
+      "step": 3600
+    },
+    {
+      "epoch": 2.9,
+      "grad_norm": 0.1568026840686798,
+      "learning_rate": 2.7830115894847407e-06,
+      "loss": 0.1522,
+      "step": 3625
+    },
+    {
+      "epoch": 2.92,
+      "grad_norm": 0.17276383936405182,
+      "learning_rate": 1.7887961084605553e-06,
+      "loss": 0.1508,
+      "step": 3650
+    },
+    {
+      "epoch": 2.94,
+      "grad_norm": 0.13572101294994354,
+      "learning_rate": 1.013112527497473e-06,
+      "loss": 0.1464,
+      "step": 3675
+    },
+    {
+      "epoch": 2.96,
+      "grad_norm": 0.1407610923051834,
+      "learning_rate": 4.563010870506368e-07,
+      "loss": 0.1477,
+      "step": 3700
+    },
+    {
+      "epoch": 2.98,
+      "grad_norm": 0.13577981293201447,
+      "learning_rate": 1.1860602302066203e-07,
+      "loss": 0.1466,
+      "step": 3725
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 0.13064716756343842,
+      "learning_rate": 1.7545962355258739e-10,
+      "loss": 0.1512,
+      "step": 3750
+    },
+    {
+      "epoch": 3.0,
+      "step": 3750,
+      "total_flos": 2.43882352705536e+18,
+      "train_loss": 0.20749481468200684,
+      "train_runtime": 3284.9286,
+      "train_samples_per_second": 36.53,
+      "train_steps_per_second": 1.142
+    }
+  ],
+  "logging_steps": 25,
+  "max_steps": 3750,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 0,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.43882352705536e+18,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}

nl_tasks/run_exps/ft/adapter_config.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+  "T": 1.0,
+  "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+  "bias": "none",
+  "inference_mode": false,
+  "layers_to_transform": null,
+  "modules_to_save": null,
+  "num_rotations": 4,
+  "peft_type": "ROTATION",
+  "r": 4,
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "target_modules_to_skip": null,
+  "task_type": "CAUSAL_LM"
+}

nl_tasks/run_exps/ft/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

nl_tasks/run_exps/ft/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

nl_tasks/run_exps/ft/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

nl_tasks/run_exps/ft/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32000": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "</s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "legacy": false,
+  "model_max_length": 512,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "</s>",
+  "use_default_system_prompt": false
+}

nl_tasks/run_exps/ft/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:368cd4dd820d88773de4a09b9db97770bd1e60d2167a7d4fdef5c6dc4925cdb7
+size 6481

nl_tasks/run_exps/ft/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

nl_tasks/run_exps/ft2/adapter_config.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+  "T": 1.0,
+  "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+  "bias": "none",
+  "inference_mode": true,
+  "layers_to_transform": null,
+  "modules_to_save": null,
+  "num_rotations": 4,
+  "peft_type": "ROTATION",
+  "r": 4,
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "target_modules_to_skip": null,
+  "task_type": "CAUSAL_LM"
+}

nl_tasks/run_exps/ft2/adapter_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fe7ebf73d215d485e5bca2853a23f022a473de81b4bb99283cc66ce0a57c5665
+size 33602659

nl_tasks/run_exps/trainer_state.json ADDED Viewed

	@@ -0,0 +1,73 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.016406890894175553,
+  "eval_steps": 10,
+  "global_step": 20,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008203445447087777,
+      "grad_norm": 0.23545275628566742,
+      "learning_rate": 0.00013420201433256689,
+      "loss": 0.6594,
+      "step": 10
+    },
+    {
+      "epoch": 0.008203445447087777,
+      "eval_loss": 0.48451828956604004,
+      "eval_runtime": 19.7025,
+      "eval_samples_per_second": 50.755,
+      "eval_steps_per_second": 0.812,
+      "step": 10
+    },
+    {
+      "epoch": 0.016406890894175553,
+      "grad_norm": 0.146519273519516,
+      "learning_rate": 1.5192246987791981e-06,
+      "loss": 0.4624,
+      "step": 20
+    },
+    {
+      "epoch": 0.016406890894175553,
+      "eval_loss": 0.44331175088882446,
+      "eval_runtime": 19.2624,
+      "eval_samples_per_second": 51.915,
+      "eval_steps_per_second": 0.831,
+      "step": 20
+    },
+    {
+      "epoch": 0.016406890894175553,
+      "step": 20,
+      "total_flos": 1.30070668640256e+16,
+      "train_loss": 0.5609269142150879,
+      "train_runtime": 110.7,
+      "train_samples_per_second": 5.781,
+      "train_steps_per_second": 0.181
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 20,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.30070668640256e+16,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}