nvan15 commited on
Commit
5d64278
·
verified ·
1 Parent(s): 70ff38e

Batch upload part 12

Browse files
Files changed (50) hide show
  1. nl_tasks/exprep/run_ex20_2ep/ft/adapter_config.json +19 -0
  2. nl_tasks/exprep/run_ex20_2ep/ft/special_tokens_map.json +24 -0
  3. nl_tasks/exprep/run_ex20_2ep/ft/tokenizer.json +0 -0
  4. nl_tasks/exprep/run_ex20_2ep/ft/tokenizer.model +3 -0
  5. nl_tasks/exprep/run_ex20_2ep/ft/tokenizer_config.json +43 -0
  6. nl_tasks/exprep/run_ex20_2ep/ft2/adapter_config.json +19 -0
  7. nl_tasks/exprep/run_ex20_2ep/ft2/adapter_model.bin +3 -0
  8. nl_tasks/exprep/run_ex20_2ep/output.txt +4 -0
  9. nl_tasks/exprep/run_ex20_2ep/trainer_state.json +743 -0
  10. nl_tasks/exprep/run_ex21_2ep/ft/adapter_config.json +19 -0
  11. nl_tasks/exprep/run_ex21_2ep/ft/special_tokens_map.json +24 -0
  12. nl_tasks/exprep/run_ex21_2ep/ft/tokenizer.json +0 -0
  13. nl_tasks/exprep/run_ex21_2ep/ft/tokenizer.model +3 -0
  14. nl_tasks/exprep/run_ex21_2ep/ft/tokenizer_config.json +43 -0
  15. nl_tasks/exprep/run_ex21_2ep/ft2/adapter_config.json +19 -0
  16. nl_tasks/exprep/run_ex21_2ep/ft2/adapter_model.bin +3 -0
  17. nl_tasks/exprep/run_ex21_2ep/output.txt +4 -0
  18. nl_tasks/exprep/run_ex21_2ep/trainer_state.json +743 -0
  19. nl_tasks/exprep/run_ex22_2ep/ft/adapter_config.json +19 -0
  20. nl_tasks/exprep/run_ex22_2ep/ft/special_tokens_map.json +24 -0
  21. nl_tasks/exprep/run_ex22_2ep/ft/tokenizer.json +0 -0
  22. nl_tasks/exprep/run_ex22_2ep/ft/tokenizer.model +3 -0
  23. nl_tasks/exprep/run_ex22_2ep/ft/tokenizer_config.json +43 -0
  24. nl_tasks/exprep/run_ex22_2ep/ft2/adapter_config.json +19 -0
  25. nl_tasks/exprep/run_ex22_2ep/ft2/adapter_model.bin +3 -0
  26. nl_tasks/exprep/run_ex22_2ep/output.txt +4 -0
  27. nl_tasks/exprep/run_ex22_2ep/trainer_state.json +743 -0
  28. nl_tasks/exprep/run_ex23_3ep/ft/adapter_config.json +19 -0
  29. nl_tasks/exprep/run_ex23_3ep/ft/special_tokens_map.json +24 -0
  30. nl_tasks/exprep/run_ex23_3ep/ft/tokenizer.json +0 -0
  31. nl_tasks/exprep/run_ex23_3ep/ft/tokenizer.model +3 -0
  32. nl_tasks/exprep/run_ex23_3ep/ft/tokenizer_config.json +43 -0
  33. nl_tasks/exprep/run_ex23_3ep/ft2/adapter_config.json +19 -0
  34. nl_tasks/exprep/run_ex23_3ep/ft2/adapter_model.bin +3 -0
  35. nl_tasks/exprep/run_ex23_3ep/output.txt +4 -0
  36. nl_tasks/exprep/run_ex23_3ep/trainer_state.json +1093 -0
  37. nl_tasks/exprep/run_ex24_3ep/ft/special_tokens_map.json +24 -0
  38. nl_tasks/exprep/run_ex24_3ep/ft/tokenizer_config.json +43 -0
  39. nl_tasks/exprep/run_ex24_3ep/output.txt +4 -0
  40. nl_tasks/exprep/run_ex24_3ep/trainer_state.json +1093 -0
  41. nl_tasks/run_exps/ft/adapter_config.json +18 -0
  42. nl_tasks/run_exps/ft/merges.txt +0 -0
  43. nl_tasks/run_exps/ft/special_tokens_map.json +30 -0
  44. nl_tasks/run_exps/ft/tokenizer.json +0 -0
  45. nl_tasks/run_exps/ft/tokenizer_config.json +51 -0
  46. nl_tasks/run_exps/ft/training_args.bin +3 -0
  47. nl_tasks/run_exps/ft/vocab.json +0 -0
  48. nl_tasks/run_exps/ft2/adapter_config.json +18 -0
  49. nl_tasks/run_exps/ft2/adapter_model.bin +3 -0
  50. nl_tasks/run_exps/trainer_state.json +73 -0
nl_tasks/exprep/run_ex20_2ep/ft/adapter_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "drop_out": 0.0,
6
+ "inference_mode": false,
7
+ "layers_to_transform": null,
8
+ "modules_to_save": null,
9
+ "num_rotations": 1,
10
+ "peft_type": "ROTATION",
11
+ "r": 16,
12
+ "revision": null,
13
+ "target_modules": [
14
+ "q_proj",
15
+ "v_proj"
16
+ ],
17
+ "target_modules_to_skip": null,
18
+ "task_type": "CAUSAL_LM"
19
+ }
nl_tasks/exprep/run_ex20_2ep/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<unk>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
nl_tasks/exprep/run_ex20_2ep/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exprep/run_ex20_2ep/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
nl_tasks/exprep/run_ex20_2ep/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 512,
37
+ "pad_token": "<unk>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
nl_tasks/exprep/run_ex20_2ep/ft2/adapter_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "drop_out": 0.0,
6
+ "inference_mode": true,
7
+ "layers_to_transform": null,
8
+ "modules_to_save": null,
9
+ "num_rotations": 1,
10
+ "peft_type": "ROTATION",
11
+ "r": 16,
12
+ "revision": null,
13
+ "target_modules": [
14
+ "q_proj",
15
+ "v_proj"
16
+ ],
17
+ "target_modules_to_skip": null,
18
+ "task_type": "CAUSAL_LM"
19
+ }
nl_tasks/exprep/run_ex20_2ep/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f79c0c1e022748a2c2946c6babb09613500a5e5696a7e2fb68e2b97d0b7020e
3
+ size 33602915
nl_tasks/exprep/run_ex20_2ep/output.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+
2
+ MATH math MAX TOKEN = 1408, length==== 5000, math acc %====, 8.04
3
+
4
+ gsm8k MAX TOKEN = 1024, length==== 1319, gsm8k acc %====, 49.50720242608037
nl_tasks/exprep/run_ex20_2ep/trainer_state.json ADDED
@@ -0,0 +1,743 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 100,
7
+ "global_step": 2500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.02,
14
+ "grad_norm": 0.2626207172870636,
15
+ "learning_rate": 0.0009997726215503421,
16
+ "loss": 0.8046,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 0.04,
21
+ "grad_norm": 0.18716809153556824,
22
+ "learning_rate": 0.0009990524226456182,
23
+ "loss": 0.3432,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 0.06,
28
+ "grad_norm": 0.21953710913658142,
29
+ "learning_rate": 0.000997839719251072,
30
+ "loss": 0.3343,
31
+ "step": 75
32
+ },
33
+ {
34
+ "epoch": 0.08,
35
+ "grad_norm": 0.19770056009292603,
36
+ "learning_rate": 0.0009961357081585429,
37
+ "loss": 0.3165,
38
+ "step": 100
39
+ },
40
+ {
41
+ "epoch": 0.1,
42
+ "grad_norm": 0.22986027598381042,
43
+ "learning_rate": 0.0009939420710212512,
44
+ "loss": 0.307,
45
+ "step": 125
46
+ },
47
+ {
48
+ "epoch": 0.12,
49
+ "grad_norm": 0.21198776364326477,
50
+ "learning_rate": 0.0009912609726942104,
51
+ "loss": 0.3003,
52
+ "step": 150
53
+ },
54
+ {
55
+ "epoch": 0.14,
56
+ "grad_norm": 0.3816479444503784,
57
+ "learning_rate": 0.0009880950590977764,
58
+ "loss": 0.2983,
59
+ "step": 175
60
+ },
61
+ {
62
+ "epoch": 0.16,
63
+ "grad_norm": 0.2344156950712204,
64
+ "learning_rate": 0.0009844474546064435,
65
+ "loss": 0.3,
66
+ "step": 200
67
+ },
68
+ {
69
+ "epoch": 0.18,
70
+ "grad_norm": 0.20885460078716278,
71
+ "learning_rate": 0.000980321758965464,
72
+ "loss": 0.2948,
73
+ "step": 225
74
+ },
75
+ {
76
+ "epoch": 0.2,
77
+ "grad_norm": 0.2171780914068222,
78
+ "learning_rate": 0.0009757220437383345,
79
+ "loss": 0.2914,
80
+ "step": 250
81
+ },
82
+ {
83
+ "epoch": 0.22,
84
+ "grad_norm": 0.20872080326080322,
85
+ "learning_rate": 0.0009706528482886534,
86
+ "loss": 0.2913,
87
+ "step": 275
88
+ },
89
+ {
90
+ "epoch": 0.24,
91
+ "grad_norm": 0.22229866683483124,
92
+ "learning_rate": 0.0009651191753003186,
93
+ "loss": 0.3002,
94
+ "step": 300
95
+ },
96
+ {
97
+ "epoch": 0.26,
98
+ "grad_norm": 0.19446779787540436,
99
+ "learning_rate": 0.0009591264858404809,
100
+ "loss": 0.2913,
101
+ "step": 325
102
+ },
103
+ {
104
+ "epoch": 0.28,
105
+ "grad_norm": 0.20700472593307495,
106
+ "learning_rate": 0.000952680693970131,
107
+ "loss": 0.2935,
108
+ "step": 350
109
+ },
110
+ {
111
+ "epoch": 0.3,
112
+ "grad_norm": 0.22668114304542542,
113
+ "learning_rate": 0.0009457881609076351,
114
+ "loss": 0.2832,
115
+ "step": 375
116
+ },
117
+ {
118
+ "epoch": 0.32,
119
+ "grad_norm": 0.37962883710861206,
120
+ "learning_rate": 0.0009384556887509802,
121
+ "loss": 0.2839,
122
+ "step": 400
123
+ },
124
+ {
125
+ "epoch": 0.34,
126
+ "grad_norm": 0.20619851350784302,
127
+ "learning_rate": 0.000930690513764925,
128
+ "loss": 0.2749,
129
+ "step": 425
130
+ },
131
+ {
132
+ "epoch": 0.36,
133
+ "grad_norm": 0.21654820442199707,
134
+ "learning_rate": 0.0009225002992396796,
135
+ "loss": 0.2781,
136
+ "step": 450
137
+ },
138
+ {
139
+ "epoch": 0.38,
140
+ "grad_norm": 0.216287299990654,
141
+ "learning_rate": 0.000913893127928164,
142
+ "loss": 0.2756,
143
+ "step": 475
144
+ },
145
+ {
146
+ "epoch": 0.4,
147
+ "grad_norm": 0.22118358314037323,
148
+ "learning_rate": 0.0009048774940693062,
149
+ "loss": 0.2719,
150
+ "step": 500
151
+ },
152
+ {
153
+ "epoch": 0.42,
154
+ "grad_norm": 0.21317917108535767,
155
+ "learning_rate": 0.0008954622950052542,
156
+ "loss": 0.2723,
157
+ "step": 525
158
+ },
159
+ {
160
+ "epoch": 0.44,
161
+ "grad_norm": 0.277706116437912,
162
+ "learning_rate": 0.0008856568224007735,
163
+ "loss": 0.2589,
164
+ "step": 550
165
+ },
166
+ {
167
+ "epoch": 0.46,
168
+ "grad_norm": 0.20005273818969727,
169
+ "learning_rate": 0.0008754707530734958,
170
+ "loss": 0.2773,
171
+ "step": 575
172
+ },
173
+ {
174
+ "epoch": 0.48,
175
+ "grad_norm": 0.1920776069164276,
176
+ "learning_rate": 0.0008649141394440677,
177
+ "loss": 0.2621,
178
+ "step": 600
179
+ },
180
+ {
181
+ "epoch": 0.5,
182
+ "grad_norm": 0.2016347497701645,
183
+ "learning_rate": 0.0008539973996156264,
184
+ "loss": 0.2767,
185
+ "step": 625
186
+ },
187
+ {
188
+ "epoch": 0.52,
189
+ "grad_norm": 0.17412111163139343,
190
+ "learning_rate": 0.0008427313070923884,
191
+ "loss": 0.2656,
192
+ "step": 650
193
+ },
194
+ {
195
+ "epoch": 0.54,
196
+ "grad_norm": 0.16364696621894836,
197
+ "learning_rate": 0.0008311269801475025,
198
+ "loss": 0.2563,
199
+ "step": 675
200
+ },
201
+ {
202
+ "epoch": 0.56,
203
+ "grad_norm": 0.20392268896102905,
204
+ "learning_rate": 0.0008191958708506557,
205
+ "loss": 0.2561,
206
+ "step": 700
207
+ },
208
+ {
209
+ "epoch": 0.58,
210
+ "grad_norm": 0.23820936679840088,
211
+ "learning_rate": 0.0008069497537662638,
212
+ "loss": 0.2628,
213
+ "step": 725
214
+ },
215
+ {
216
+ "epoch": 0.6,
217
+ "grad_norm": 0.1817774921655655,
218
+ "learning_rate": 0.0007944007143333976,
219
+ "loss": 0.2585,
220
+ "step": 750
221
+ },
222
+ {
223
+ "epoch": 0.62,
224
+ "grad_norm": 0.18769747018814087,
225
+ "learning_rate": 0.0007815611369389133,
226
+ "loss": 0.2472,
227
+ "step": 775
228
+ },
229
+ {
230
+ "epoch": 0.64,
231
+ "grad_norm": 0.19526880979537964,
232
+ "learning_rate": 0.0007684436926955582,
233
+ "loss": 0.2512,
234
+ "step": 800
235
+ },
236
+ {
237
+ "epoch": 0.66,
238
+ "grad_norm": 0.19683842360973358,
239
+ "learning_rate": 0.0007550613269371124,
240
+ "loss": 0.245,
241
+ "step": 825
242
+ },
243
+ {
244
+ "epoch": 0.68,
245
+ "grad_norm": 0.18514588475227356,
246
+ "learning_rate": 0.0007414272464429068,
247
+ "loss": 0.2469,
248
+ "step": 850
249
+ },
250
+ {
251
+ "epoch": 0.7,
252
+ "grad_norm": 0.15977084636688232,
253
+ "learning_rate": 0.0007275549064043269,
254
+ "loss": 0.2529,
255
+ "step": 875
256
+ },
257
+ {
258
+ "epoch": 0.72,
259
+ "grad_norm": 0.17137399315834045,
260
+ "learning_rate": 0.0007134579971461626,
261
+ "loss": 0.2489,
262
+ "step": 900
263
+ },
264
+ {
265
+ "epoch": 0.74,
266
+ "grad_norm": 0.19649483263492584,
267
+ "learning_rate": 0.0006991504306159115,
268
+ "loss": 0.2452,
269
+ "step": 925
270
+ },
271
+ {
272
+ "epoch": 0.76,
273
+ "grad_norm": 0.1726703643798828,
274
+ "learning_rate": 0.0006846463266543652,
275
+ "loss": 0.2429,
276
+ "step": 950
277
+ },
278
+ {
279
+ "epoch": 0.78,
280
+ "grad_norm": 0.18005579710006714,
281
+ "learning_rate": 0.0006699599990610323,
282
+ "loss": 0.2393,
283
+ "step": 975
284
+ },
285
+ {
286
+ "epoch": 0.8,
287
+ "grad_norm": 0.2112029492855072,
288
+ "learning_rate": 0.0006551059414681455,
289
+ "loss": 0.2468,
290
+ "step": 1000
291
+ },
292
+ {
293
+ "epoch": 0.82,
294
+ "grad_norm": 0.18817350268363953,
295
+ "learning_rate": 0.0006400988130371969,
296
+ "loss": 0.2457,
297
+ "step": 1025
298
+ },
299
+ {
300
+ "epoch": 0.84,
301
+ "grad_norm": 0.1755235642194748,
302
+ "learning_rate": 0.0006249534239921153,
303
+ "loss": 0.2354,
304
+ "step": 1050
305
+ },
306
+ {
307
+ "epoch": 0.86,
308
+ "grad_norm": 0.1836978793144226,
309
+ "learning_rate": 0.000609684721003363,
310
+ "loss": 0.2317,
311
+ "step": 1075
312
+ },
313
+ {
314
+ "epoch": 0.88,
315
+ "grad_norm": 0.17404377460479736,
316
+ "learning_rate": 0.0005943077724373775,
317
+ "loss": 0.2324,
318
+ "step": 1100
319
+ },
320
+ {
321
+ "epoch": 0.9,
322
+ "grad_norm": 0.183836430311203,
323
+ "learning_rate": 0.0005788377534859114,
324
+ "loss": 0.2405,
325
+ "step": 1125
326
+ },
327
+ {
328
+ "epoch": 0.92,
329
+ "grad_norm": 0.15977102518081665,
330
+ "learning_rate": 0.0005632899311899521,
331
+ "loss": 0.2339,
332
+ "step": 1150
333
+ },
334
+ {
335
+ "epoch": 0.94,
336
+ "grad_norm": 0.17550553381443024,
337
+ "learning_rate": 0.0005476796493729943,
338
+ "loss": 0.2365,
339
+ "step": 1175
340
+ },
341
+ {
342
+ "epoch": 0.96,
343
+ "grad_norm": 0.15650643408298492,
344
+ "learning_rate": 0.0005320223134985392,
345
+ "loss": 0.2364,
346
+ "step": 1200
347
+ },
348
+ {
349
+ "epoch": 0.98,
350
+ "grad_norm": 0.18774349987506866,
351
+ "learning_rate": 0.000516333375466762,
352
+ "loss": 0.2366,
353
+ "step": 1225
354
+ },
355
+ {
356
+ "epoch": 1.0,
357
+ "grad_norm": 0.1758316457271576,
358
+ "learning_rate": 0.0005006283183653513,
359
+ "loss": 0.2279,
360
+ "step": 1250
361
+ },
362
+ {
363
+ "epoch": 1.02,
364
+ "grad_norm": 0.16334125399589539,
365
+ "learning_rate": 0.0004849226411895716,
366
+ "loss": 0.1952,
367
+ "step": 1275
368
+ },
369
+ {
370
+ "epoch": 1.04,
371
+ "grad_norm": 0.14510266482830048,
372
+ "learning_rate": 0.0004692318435466265,
373
+ "loss": 0.2013,
374
+ "step": 1300
375
+ },
376
+ {
377
+ "epoch": 1.06,
378
+ "grad_norm": 0.16317233443260193,
379
+ "learning_rate": 0.0004535714103594162,
380
+ "loss": 0.198,
381
+ "step": 1325
382
+ },
383
+ {
384
+ "epoch": 1.08,
385
+ "grad_norm": 0.17068150639533997,
386
+ "learning_rate": 0.0004379567965847896,
387
+ "loss": 0.2031,
388
+ "step": 1350
389
+ },
390
+ {
391
+ "epoch": 1.1,
392
+ "grad_norm": 0.14690789580345154,
393
+ "learning_rate": 0.000422403411961367,
394
+ "loss": 0.2061,
395
+ "step": 1375
396
+ },
397
+ {
398
+ "epoch": 1.12,
399
+ "grad_norm": 0.16131256520748138,
400
+ "learning_rate": 0.00040692660580198903,
401
+ "loss": 0.2037,
402
+ "step": 1400
403
+ },
404
+ {
405
+ "epoch": 1.1400000000000001,
406
+ "grad_norm": 0.16582365334033966,
407
+ "learning_rate": 0.00039154165184579736,
408
+ "loss": 0.1955,
409
+ "step": 1425
410
+ },
411
+ {
412
+ "epoch": 1.16,
413
+ "grad_norm": 0.16103099286556244,
414
+ "learning_rate": 0.00037626373318489886,
415
+ "loss": 0.2029,
416
+ "step": 1450
417
+ },
418
+ {
419
+ "epoch": 1.18,
420
+ "grad_norm": 0.1566431075334549,
421
+ "learning_rate": 0.00036110792728048633,
422
+ "loss": 0.1999,
423
+ "step": 1475
424
+ },
425
+ {
426
+ "epoch": 1.2,
427
+ "grad_norm": 0.18320217728614807,
428
+ "learning_rate": 0.00034608919108320487,
429
+ "loss": 0.2002,
430
+ "step": 1500
431
+ },
432
+ {
433
+ "epoch": 1.22,
434
+ "grad_norm": 0.15252584218978882,
435
+ "learning_rate": 0.0003312223462724472,
436
+ "loss": 0.2026,
437
+ "step": 1525
438
+ },
439
+ {
440
+ "epoch": 1.24,
441
+ "grad_norm": 0.14527903497219086,
442
+ "learning_rate": 0.0003165220646291454,
443
+ "loss": 0.195,
444
+ "step": 1550
445
+ },
446
+ {
447
+ "epoch": 1.26,
448
+ "grad_norm": 0.16251923143863678,
449
+ "learning_rate": 0.000302002853556495,
450
+ "loss": 0.1928,
451
+ "step": 1575
452
+ },
453
+ {
454
+ "epoch": 1.28,
455
+ "grad_norm": 0.15034204721450806,
456
+ "learning_rate": 0.0002876790417628994,
457
+ "loss": 0.1896,
458
+ "step": 1600
459
+ },
460
+ {
461
+ "epoch": 1.3,
462
+ "grad_norm": 0.16318462789058685,
463
+ "learning_rate": 0.00027356476512126383,
464
+ "loss": 0.1901,
465
+ "step": 1625
466
+ },
467
+ {
468
+ "epoch": 1.32,
469
+ "grad_norm": 0.16164065897464752,
470
+ "learning_rate": 0.0002596739527185961,
471
+ "loss": 0.185,
472
+ "step": 1650
473
+ },
474
+ {
475
+ "epoch": 1.34,
476
+ "grad_norm": 0.13852697610855103,
477
+ "learning_rate": 0.0002460203131096801,
478
+ "loss": 0.1887,
479
+ "step": 1675
480
+ },
481
+ {
482
+ "epoch": 1.3599999999999999,
483
+ "grad_norm": 0.15581487119197845,
484
+ "learning_rate": 0.00023261732078838537,
485
+ "loss": 0.188,
486
+ "step": 1700
487
+ },
488
+ {
489
+ "epoch": 1.38,
490
+ "grad_norm": 0.16509482264518738,
491
+ "learning_rate": 0.00021947820288997067,
492
+ "loss": 0.1895,
493
+ "step": 1725
494
+ },
495
+ {
496
+ "epoch": 1.4,
497
+ "grad_norm": 0.1754249930381775,
498
+ "learning_rate": 0.00020661592613749636,
499
+ "loss": 0.1885,
500
+ "step": 1750
501
+ },
502
+ {
503
+ "epoch": 1.42,
504
+ "grad_norm": 0.14935770630836487,
505
+ "learning_rate": 0.00019404318404523603,
506
+ "loss": 0.1898,
507
+ "step": 1775
508
+ },
509
+ {
510
+ "epoch": 1.44,
511
+ "grad_norm": 0.16208966076374054,
512
+ "learning_rate": 0.00018177238439170883,
513
+ "loss": 0.1778,
514
+ "step": 1800
515
+ },
516
+ {
517
+ "epoch": 1.46,
518
+ "grad_norm": 0.13969115912914276,
519
+ "learning_rate": 0.00016981563697470158,
520
+ "loss": 0.1843,
521
+ "step": 1825
522
+ },
523
+ {
524
+ "epoch": 1.48,
525
+ "grad_norm": 0.18950283527374268,
526
+ "learning_rate": 0.00015818474166035906,
527
+ "loss": 0.1874,
528
+ "step": 1850
529
+ },
530
+ {
531
+ "epoch": 1.5,
532
+ "grad_norm": 0.14980795979499817,
533
+ "learning_rate": 0.00014689117673814133,
534
+ "loss": 0.1884,
535
+ "step": 1875
536
+ },
537
+ {
538
+ "epoch": 1.52,
539
+ "grad_norm": 0.16838234663009644,
540
+ "learning_rate": 0.00013594608759313833,
541
+ "loss": 0.186,
542
+ "step": 1900
543
+ },
544
+ {
545
+ "epoch": 1.54,
546
+ "grad_norm": 0.17522183060646057,
547
+ "learning_rate": 0.00012536027570691938,
548
+ "loss": 0.1856,
549
+ "step": 1925
550
+ },
551
+ {
552
+ "epoch": 1.56,
553
+ "grad_norm": 0.1533781737089157,
554
+ "learning_rate": 0.00011514418799777554,
555
+ "loss": 0.1753,
556
+ "step": 1950
557
+ },
558
+ {
559
+ "epoch": 1.58,
560
+ "grad_norm": 0.17392344772815704,
561
+ "learning_rate": 0.0001053079065108728,
562
+ "loss": 0.1882,
563
+ "step": 1975
564
+ },
565
+ {
566
+ "epoch": 1.6,
567
+ "grad_norm": 0.16130375862121582,
568
+ "learning_rate": 9.586113846848982e-05,
569
+ "loss": 0.1859,
570
+ "step": 2000
571
+ },
572
+ {
573
+ "epoch": 1.62,
574
+ "grad_norm": 0.15282462537288666,
575
+ "learning_rate": 8.68132066901623e-05,
576
+ "loss": 0.1747,
577
+ "step": 2025
578
+ },
579
+ {
580
+ "epoch": 1.6400000000000001,
581
+ "grad_norm": 0.14969567954540253,
582
+ "learning_rate": 7.81730403921856e-05,
583
+ "loss": 0.1829,
584
+ "step": 2050
585
+ },
586
+ {
587
+ "epoch": 1.6600000000000001,
588
+ "grad_norm": 0.19932319223880768,
589
+ "learning_rate": 6.994916637555571e-05,
590
+ "loss": 0.1868,
591
+ "step": 2075
592
+ },
593
+ {
594
+ "epoch": 1.6800000000000002,
595
+ "grad_norm": 0.16214902698993683,
596
+ "learning_rate": 6.214970061104686e-05,
597
+ "loss": 0.1837,
598
+ "step": 2100
599
+ },
600
+ {
601
+ "epoch": 1.7,
602
+ "grad_norm": 0.15913426876068115,
603
+ "learning_rate": 5.4782340229727555e-05,
604
+ "loss": 0.181,
605
+ "step": 2125
606
+ },
607
+ {
608
+ "epoch": 1.72,
609
+ "grad_norm": 0.15780578553676605,
610
+ "learning_rate": 4.785435592682219e-05,
611
+ "loss": 0.1747,
612
+ "step": 2150
613
+ },
614
+ {
615
+ "epoch": 1.74,
616
+ "grad_norm": 0.15023760497570038,
617
+ "learning_rate": 4.137258478641176e-05,
618
+ "loss": 0.191,
619
+ "step": 2175
620
+ },
621
+ {
622
+ "epoch": 1.76,
623
+ "grad_norm": 0.15563510358333588,
624
+ "learning_rate": 3.534342353405834e-05,
625
+ "loss": 0.1827,
626
+ "step": 2200
627
+ },
628
+ {
629
+ "epoch": 1.78,
630
+ "grad_norm": 0.1683374047279358,
631
+ "learning_rate": 2.9772822224008513e-05,
632
+ "loss": 0.1761,
633
+ "step": 2225
634
+ },
635
+ {
636
+ "epoch": 1.8,
637
+ "grad_norm": 0.14578016102313995,
638
+ "learning_rate": 2.4666278367208418e-05,
639
+ "loss": 0.1844,
640
+ "step": 2250
641
+ },
642
+ {
643
+ "epoch": 1.8199999999999998,
644
+ "grad_norm": 0.1600412130355835,
645
+ "learning_rate": 2.0028831505924162e-05,
646
+ "loss": 0.1769,
647
+ "step": 2275
648
+ },
649
+ {
650
+ "epoch": 1.8399999999999999,
651
+ "grad_norm": 0.16546602547168732,
652
+ "learning_rate": 1.586505824032214e-05,
653
+ "loss": 0.1806,
654
+ "step": 2300
655
+ },
656
+ {
657
+ "epoch": 1.8599999999999999,
658
+ "grad_norm": 0.16472336649894714,
659
+ "learning_rate": 1.2179067711917014e-05,
660
+ "loss": 0.1841,
661
+ "step": 2325
662
+ },
663
+ {
664
+ "epoch": 1.88,
665
+ "grad_norm": 0.15828381478786469,
666
+ "learning_rate": 8.974497548345395e-06,
667
+ "loss": 0.175,
668
+ "step": 2350
669
+ },
670
+ {
671
+ "epoch": 1.9,
672
+ "grad_norm": 0.17364706099033356,
673
+ "learning_rate": 6.254510273466185e-06,
674
+ "loss": 0.1975,
675
+ "step": 2375
676
+ },
677
+ {
678
+ "epoch": 1.92,
679
+ "grad_norm": 0.1747918426990509,
680
+ "learning_rate": 4.021790186331753e-06,
681
+ "loss": 0.1839,
682
+ "step": 2400
683
+ },
684
+ {
685
+ "epoch": 1.94,
686
+ "grad_norm": 0.18869030475616455,
687
+ "learning_rate": 2.2785407121084233e-06,
688
+ "loss": 0.177,
689
+ "step": 2425
690
+ },
691
+ {
692
+ "epoch": 1.96,
693
+ "grad_norm": 0.15464165806770325,
694
+ "learning_rate": 1.026482227562242e-06,
695
+ "loss": 0.1792,
696
+ "step": 2450
697
+ },
698
+ {
699
+ "epoch": 1.98,
700
+ "grad_norm": 0.1583382487297058,
701
+ "learning_rate": 2.668503632545782e-07,
702
+ "loss": 0.185,
703
+ "step": 2475
704
+ },
705
+ {
706
+ "epoch": 2.0,
707
+ "grad_norm": 0.1495935469865799,
708
+ "learning_rate": 3.947841241136452e-10,
709
+ "loss": 0.1809,
710
+ "step": 2500
711
+ },
712
+ {
713
+ "epoch": 2.0,
714
+ "step": 2500,
715
+ "total_flos": 1.62588235137024e+18,
716
+ "train_loss": 0.23306660480499267,
717
+ "train_runtime": 2262.3527,
718
+ "train_samples_per_second": 35.361,
719
+ "train_steps_per_second": 1.105
720
+ }
721
+ ],
722
+ "logging_steps": 25,
723
+ "max_steps": 2500,
724
+ "num_input_tokens_seen": 0,
725
+ "num_train_epochs": 2,
726
+ "save_steps": 0,
727
+ "stateful_callbacks": {
728
+ "TrainerControl": {
729
+ "args": {
730
+ "should_epoch_stop": false,
731
+ "should_evaluate": false,
732
+ "should_log": false,
733
+ "should_save": false,
734
+ "should_training_stop": false
735
+ },
736
+ "attributes": {}
737
+ }
738
+ },
739
+ "total_flos": 1.62588235137024e+18,
740
+ "train_batch_size": 32,
741
+ "trial_name": null,
742
+ "trial_params": null
743
+ }
nl_tasks/exprep/run_ex21_2ep/ft/adapter_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "drop_out": 0.0,
6
+ "inference_mode": false,
7
+ "layers_to_transform": null,
8
+ "modules_to_save": null,
9
+ "num_rotations": 1,
10
+ "peft_type": "ROTATION",
11
+ "r": 16,
12
+ "revision": null,
13
+ "target_modules": [
14
+ "q_proj",
15
+ "v_proj"
16
+ ],
17
+ "target_modules_to_skip": null,
18
+ "task_type": "CAUSAL_LM"
19
+ }
nl_tasks/exprep/run_ex21_2ep/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<unk>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
nl_tasks/exprep/run_ex21_2ep/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exprep/run_ex21_2ep/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
nl_tasks/exprep/run_ex21_2ep/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 512,
37
+ "pad_token": "<unk>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
nl_tasks/exprep/run_ex21_2ep/ft2/adapter_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "drop_out": 0.0,
6
+ "inference_mode": true,
7
+ "layers_to_transform": null,
8
+ "modules_to_save": null,
9
+ "num_rotations": 1,
10
+ "peft_type": "ROTATION",
11
+ "r": 16,
12
+ "revision": null,
13
+ "target_modules": [
14
+ "q_proj",
15
+ "v_proj"
16
+ ],
17
+ "target_modules_to_skip": null,
18
+ "task_type": "CAUSAL_LM"
19
+ }
nl_tasks/exprep/run_ex21_2ep/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:838406364c781d904f6e794a55ddbd15163323b48ad1436b297f5155013a0054
3
+ size 33602915
nl_tasks/exprep/run_ex21_2ep/output.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+
2
+ MATH math MAX TOKEN = 1408, length==== 5000, math acc %====, 7.84
3
+
4
+ gsm8k MAX TOKEN = 1024, length==== 1319, gsm8k acc %====, 49.88627748294162
nl_tasks/exprep/run_ex21_2ep/trainer_state.json ADDED
@@ -0,0 +1,743 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 100,
7
+ "global_step": 2500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.02,
14
+ "grad_norm": 0.4475359320640564,
15
+ "learning_rate": 0.0009997726215503421,
16
+ "loss": 0.4344,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 0.04,
21
+ "grad_norm": 0.186162531375885,
22
+ "learning_rate": 0.0009990524226456182,
23
+ "loss": 0.3436,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 0.06,
28
+ "grad_norm": 0.21256987750530243,
29
+ "learning_rate": 0.000997839719251072,
30
+ "loss": 0.3324,
31
+ "step": 75
32
+ },
33
+ {
34
+ "epoch": 0.08,
35
+ "grad_norm": 0.22403238713741302,
36
+ "learning_rate": 0.0009961357081585429,
37
+ "loss": 0.3146,
38
+ "step": 100
39
+ },
40
+ {
41
+ "epoch": 0.1,
42
+ "grad_norm": 0.26591813564300537,
43
+ "learning_rate": 0.0009939420710212512,
44
+ "loss": 0.3064,
45
+ "step": 125
46
+ },
47
+ {
48
+ "epoch": 0.12,
49
+ "grad_norm": 0.20775578916072845,
50
+ "learning_rate": 0.0009912609726942104,
51
+ "loss": 0.3012,
52
+ "step": 150
53
+ },
54
+ {
55
+ "epoch": 0.14,
56
+ "grad_norm": 0.1952112317085266,
57
+ "learning_rate": 0.0009880950590977764,
58
+ "loss": 0.2985,
59
+ "step": 175
60
+ },
61
+ {
62
+ "epoch": 0.16,
63
+ "grad_norm": 0.23744182288646698,
64
+ "learning_rate": 0.0009844474546064435,
65
+ "loss": 0.3004,
66
+ "step": 200
67
+ },
68
+ {
69
+ "epoch": 0.18,
70
+ "grad_norm": 0.25152233242988586,
71
+ "learning_rate": 0.000980321758965464,
72
+ "loss": 0.2942,
73
+ "step": 225
74
+ },
75
+ {
76
+ "epoch": 0.2,
77
+ "grad_norm": 0.2282615751028061,
78
+ "learning_rate": 0.0009757220437383345,
79
+ "loss": 0.2915,
80
+ "step": 250
81
+ },
82
+ {
83
+ "epoch": 0.22,
84
+ "grad_norm": 0.24063201248645782,
85
+ "learning_rate": 0.0009706528482886534,
86
+ "loss": 0.2902,
87
+ "step": 275
88
+ },
89
+ {
90
+ "epoch": 0.24,
91
+ "grad_norm": 0.2492981255054474,
92
+ "learning_rate": 0.0009651191753003186,
93
+ "loss": 0.302,
94
+ "step": 300
95
+ },
96
+ {
97
+ "epoch": 0.26,
98
+ "grad_norm": 0.20875616371631622,
99
+ "learning_rate": 0.0009591264858404809,
100
+ "loss": 0.2944,
101
+ "step": 325
102
+ },
103
+ {
104
+ "epoch": 0.28,
105
+ "grad_norm": 0.24029485881328583,
106
+ "learning_rate": 0.000952680693970131,
107
+ "loss": 0.297,
108
+ "step": 350
109
+ },
110
+ {
111
+ "epoch": 0.3,
112
+ "grad_norm": 0.21902728080749512,
113
+ "learning_rate": 0.0009457881609076351,
114
+ "loss": 0.2863,
115
+ "step": 375
116
+ },
117
+ {
118
+ "epoch": 0.32,
119
+ "grad_norm": 0.24253377318382263,
120
+ "learning_rate": 0.0009384556887509802,
121
+ "loss": 0.2847,
122
+ "step": 400
123
+ },
124
+ {
125
+ "epoch": 0.34,
126
+ "grad_norm": 0.19963406026363373,
127
+ "learning_rate": 0.000930690513764925,
128
+ "loss": 0.2752,
129
+ "step": 425
130
+ },
131
+ {
132
+ "epoch": 0.36,
133
+ "grad_norm": 0.2175171673297882,
134
+ "learning_rate": 0.0009225002992396796,
135
+ "loss": 0.2765,
136
+ "step": 450
137
+ },
138
+ {
139
+ "epoch": 0.38,
140
+ "grad_norm": 0.20239321887493134,
141
+ "learning_rate": 0.000913893127928164,
142
+ "loss": 0.2739,
143
+ "step": 475
144
+ },
145
+ {
146
+ "epoch": 0.4,
147
+ "grad_norm": 0.23220029473304749,
148
+ "learning_rate": 0.0009048774940693062,
149
+ "loss": 0.2706,
150
+ "step": 500
151
+ },
152
+ {
153
+ "epoch": 0.42,
154
+ "grad_norm": 0.20775073766708374,
155
+ "learning_rate": 0.0008954622950052542,
156
+ "loss": 0.272,
157
+ "step": 525
158
+ },
159
+ {
160
+ "epoch": 0.44,
161
+ "grad_norm": 0.2095000296831131,
162
+ "learning_rate": 0.0008856568224007735,
163
+ "loss": 0.2606,
164
+ "step": 550
165
+ },
166
+ {
167
+ "epoch": 0.46,
168
+ "grad_norm": 0.22149509191513062,
169
+ "learning_rate": 0.0008754707530734958,
170
+ "loss": 0.2773,
171
+ "step": 575
172
+ },
173
+ {
174
+ "epoch": 0.48,
175
+ "grad_norm": 0.20700635015964508,
176
+ "learning_rate": 0.0008649141394440677,
177
+ "loss": 0.2619,
178
+ "step": 600
179
+ },
180
+ {
181
+ "epoch": 0.5,
182
+ "grad_norm": 0.18627957999706268,
183
+ "learning_rate": 0.0008539973996156264,
184
+ "loss": 0.2754,
185
+ "step": 625
186
+ },
187
+ {
188
+ "epoch": 0.52,
189
+ "grad_norm": 0.18619582056999207,
190
+ "learning_rate": 0.0008427313070923884,
191
+ "loss": 0.2653,
192
+ "step": 650
193
+ },
194
+ {
195
+ "epoch": 0.54,
196
+ "grad_norm": 0.16383656859397888,
197
+ "learning_rate": 0.0008311269801475025,
198
+ "loss": 0.2561,
199
+ "step": 675
200
+ },
201
+ {
202
+ "epoch": 0.56,
203
+ "grad_norm": 0.2009345144033432,
204
+ "learning_rate": 0.0008191958708506557,
205
+ "loss": 0.2552,
206
+ "step": 700
207
+ },
208
+ {
209
+ "epoch": 0.58,
210
+ "grad_norm": 0.21614684164524078,
211
+ "learning_rate": 0.0008069497537662638,
212
+ "loss": 0.2624,
213
+ "step": 725
214
+ },
215
+ {
216
+ "epoch": 0.6,
217
+ "grad_norm": 0.1810019165277481,
218
+ "learning_rate": 0.0007944007143333976,
219
+ "loss": 0.2593,
220
+ "step": 750
221
+ },
222
+ {
223
+ "epoch": 0.62,
224
+ "grad_norm": 0.18752247095108032,
225
+ "learning_rate": 0.0007815611369389133,
226
+ "loss": 0.2471,
227
+ "step": 775
228
+ },
229
+ {
230
+ "epoch": 0.64,
231
+ "grad_norm": 0.22774486243724823,
232
+ "learning_rate": 0.0007684436926955582,
233
+ "loss": 0.253,
234
+ "step": 800
235
+ },
236
+ {
237
+ "epoch": 0.66,
238
+ "grad_norm": 0.6058231592178345,
239
+ "learning_rate": 0.0007550613269371124,
240
+ "loss": 0.2452,
241
+ "step": 825
242
+ },
243
+ {
244
+ "epoch": 0.68,
245
+ "grad_norm": 0.21049253642559052,
246
+ "learning_rate": 0.0007414272464429068,
247
+ "loss": 0.2473,
248
+ "step": 850
249
+ },
250
+ {
251
+ "epoch": 0.7,
252
+ "grad_norm": 0.1695825457572937,
253
+ "learning_rate": 0.0007275549064043269,
254
+ "loss": 0.2535,
255
+ "step": 875
256
+ },
257
+ {
258
+ "epoch": 0.72,
259
+ "grad_norm": 0.17602591216564178,
260
+ "learning_rate": 0.0007134579971461626,
261
+ "loss": 0.2489,
262
+ "step": 900
263
+ },
264
+ {
265
+ "epoch": 0.74,
266
+ "grad_norm": 0.18434438109397888,
267
+ "learning_rate": 0.0006991504306159115,
268
+ "loss": 0.2463,
269
+ "step": 925
270
+ },
271
+ {
272
+ "epoch": 0.76,
273
+ "grad_norm": 0.17386262118816376,
274
+ "learning_rate": 0.0006846463266543652,
275
+ "loss": 0.2432,
276
+ "step": 950
277
+ },
278
+ {
279
+ "epoch": 0.78,
280
+ "grad_norm": 0.19186212122440338,
281
+ "learning_rate": 0.0006699599990610323,
282
+ "loss": 0.2404,
283
+ "step": 975
284
+ },
285
+ {
286
+ "epoch": 0.8,
287
+ "grad_norm": 0.20263203978538513,
288
+ "learning_rate": 0.0006551059414681455,
289
+ "loss": 0.2467,
290
+ "step": 1000
291
+ },
292
+ {
293
+ "epoch": 0.82,
294
+ "grad_norm": 0.22712233662605286,
295
+ "learning_rate": 0.0006400988130371969,
296
+ "loss": 0.2454,
297
+ "step": 1025
298
+ },
299
+ {
300
+ "epoch": 0.84,
301
+ "grad_norm": 0.16690626740455627,
302
+ "learning_rate": 0.0006249534239921153,
303
+ "loss": 0.2347,
304
+ "step": 1050
305
+ },
306
+ {
307
+ "epoch": 0.86,
308
+ "grad_norm": 0.173813134431839,
309
+ "learning_rate": 0.000609684721003363,
310
+ "loss": 0.2316,
311
+ "step": 1075
312
+ },
313
+ {
314
+ "epoch": 0.88,
315
+ "grad_norm": 0.1734803467988968,
316
+ "learning_rate": 0.0005943077724373775,
317
+ "loss": 0.2321,
318
+ "step": 1100
319
+ },
320
+ {
321
+ "epoch": 0.9,
322
+ "grad_norm": 0.18156838417053223,
323
+ "learning_rate": 0.0005788377534859114,
324
+ "loss": 0.2412,
325
+ "step": 1125
326
+ },
327
+ {
328
+ "epoch": 0.92,
329
+ "grad_norm": 0.15780498087406158,
330
+ "learning_rate": 0.0005632899311899521,
331
+ "loss": 0.2343,
332
+ "step": 1150
333
+ },
334
+ {
335
+ "epoch": 0.94,
336
+ "grad_norm": 0.16868476569652557,
337
+ "learning_rate": 0.0005476796493729943,
338
+ "loss": 0.2374,
339
+ "step": 1175
340
+ },
341
+ {
342
+ "epoch": 0.96,
343
+ "grad_norm": 0.15904894471168518,
344
+ "learning_rate": 0.0005320223134985392,
345
+ "loss": 0.236,
346
+ "step": 1200
347
+ },
348
+ {
349
+ "epoch": 0.98,
350
+ "grad_norm": 0.17931579053401947,
351
+ "learning_rate": 0.000516333375466762,
352
+ "loss": 0.2378,
353
+ "step": 1225
354
+ },
355
+ {
356
+ "epoch": 1.0,
357
+ "grad_norm": 0.18797090649604797,
358
+ "learning_rate": 0.0005006283183653513,
359
+ "loss": 0.2267,
360
+ "step": 1250
361
+ },
362
+ {
363
+ "epoch": 1.02,
364
+ "grad_norm": 0.17563988268375397,
365
+ "learning_rate": 0.0004849226411895716,
366
+ "loss": 0.1948,
367
+ "step": 1275
368
+ },
369
+ {
370
+ "epoch": 1.04,
371
+ "grad_norm": 0.14821073412895203,
372
+ "learning_rate": 0.0004692318435466265,
373
+ "loss": 0.2021,
374
+ "step": 1300
375
+ },
376
+ {
377
+ "epoch": 1.06,
378
+ "grad_norm": 0.16894803941249847,
379
+ "learning_rate": 0.0004535714103594162,
380
+ "loss": 0.1982,
381
+ "step": 1325
382
+ },
383
+ {
384
+ "epoch": 1.08,
385
+ "grad_norm": 0.16558986902236938,
386
+ "learning_rate": 0.0004379567965847896,
387
+ "loss": 0.2029,
388
+ "step": 1350
389
+ },
390
+ {
391
+ "epoch": 1.1,
392
+ "grad_norm": 0.14962488412857056,
393
+ "learning_rate": 0.000422403411961367,
394
+ "loss": 0.2071,
395
+ "step": 1375
396
+ },
397
+ {
398
+ "epoch": 1.12,
399
+ "grad_norm": 0.1625063121318817,
400
+ "learning_rate": 0.00040692660580198903,
401
+ "loss": 0.2038,
402
+ "step": 1400
403
+ },
404
+ {
405
+ "epoch": 1.1400000000000001,
406
+ "grad_norm": 0.170761376619339,
407
+ "learning_rate": 0.00039154165184579736,
408
+ "loss": 0.1958,
409
+ "step": 1425
410
+ },
411
+ {
412
+ "epoch": 1.16,
413
+ "grad_norm": 0.15951049327850342,
414
+ "learning_rate": 0.00037626373318489886,
415
+ "loss": 0.2021,
416
+ "step": 1450
417
+ },
418
+ {
419
+ "epoch": 1.18,
420
+ "grad_norm": 0.14665117859840393,
421
+ "learning_rate": 0.00036110792728048633,
422
+ "loss": 0.1996,
423
+ "step": 1475
424
+ },
425
+ {
426
+ "epoch": 1.2,
427
+ "grad_norm": 0.17691372334957123,
428
+ "learning_rate": 0.00034608919108320487,
429
+ "loss": 0.2006,
430
+ "step": 1500
431
+ },
432
+ {
433
+ "epoch": 1.22,
434
+ "grad_norm": 0.16786377131938934,
435
+ "learning_rate": 0.0003312223462724472,
436
+ "loss": 0.2035,
437
+ "step": 1525
438
+ },
439
+ {
440
+ "epoch": 1.24,
441
+ "grad_norm": 0.13940677046775818,
442
+ "learning_rate": 0.0003165220646291454,
443
+ "loss": 0.1957,
444
+ "step": 1550
445
+ },
446
+ {
447
+ "epoch": 1.26,
448
+ "grad_norm": 0.1505332589149475,
449
+ "learning_rate": 0.000302002853556495,
450
+ "loss": 0.1949,
451
+ "step": 1575
452
+ },
453
+ {
454
+ "epoch": 1.28,
455
+ "grad_norm": 0.17429719865322113,
456
+ "learning_rate": 0.0002876790417628994,
457
+ "loss": 0.1898,
458
+ "step": 1600
459
+ },
460
+ {
461
+ "epoch": 1.3,
462
+ "grad_norm": 0.16113583743572235,
463
+ "learning_rate": 0.00027356476512126383,
464
+ "loss": 0.1911,
465
+ "step": 1625
466
+ },
467
+ {
468
+ "epoch": 1.32,
469
+ "grad_norm": 0.17427127063274384,
470
+ "learning_rate": 0.0002596739527185961,
471
+ "loss": 0.1858,
472
+ "step": 1650
473
+ },
474
+ {
475
+ "epoch": 1.34,
476
+ "grad_norm": 0.13851557672023773,
477
+ "learning_rate": 0.0002460203131096801,
478
+ "loss": 0.1897,
479
+ "step": 1675
480
+ },
481
+ {
482
+ "epoch": 1.3599999999999999,
483
+ "grad_norm": 0.15388120710849762,
484
+ "learning_rate": 0.00023261732078838537,
485
+ "loss": 0.1884,
486
+ "step": 1700
487
+ },
488
+ {
489
+ "epoch": 1.38,
490
+ "grad_norm": 0.16655150055885315,
491
+ "learning_rate": 0.00021947820288997067,
492
+ "loss": 0.1895,
493
+ "step": 1725
494
+ },
495
+ {
496
+ "epoch": 1.4,
497
+ "grad_norm": 0.17470037937164307,
498
+ "learning_rate": 0.00020661592613749636,
499
+ "loss": 0.1882,
500
+ "step": 1750
501
+ },
502
+ {
503
+ "epoch": 1.42,
504
+ "grad_norm": 0.15715032815933228,
505
+ "learning_rate": 0.00019404318404523603,
506
+ "loss": 0.1906,
507
+ "step": 1775
508
+ },
509
+ {
510
+ "epoch": 1.44,
511
+ "grad_norm": 0.19270935654640198,
512
+ "learning_rate": 0.00018177238439170883,
513
+ "loss": 0.1783,
514
+ "step": 1800
515
+ },
516
+ {
517
+ "epoch": 1.46,
518
+ "grad_norm": 0.14684396982192993,
519
+ "learning_rate": 0.00016981563697470158,
520
+ "loss": 0.1846,
521
+ "step": 1825
522
+ },
523
+ {
524
+ "epoch": 1.48,
525
+ "grad_norm": 0.18586526811122894,
526
+ "learning_rate": 0.00015818474166035906,
527
+ "loss": 0.1883,
528
+ "step": 1850
529
+ },
530
+ {
531
+ "epoch": 1.5,
532
+ "grad_norm": 0.169161856174469,
533
+ "learning_rate": 0.00014689117673814133,
534
+ "loss": 0.1869,
535
+ "step": 1875
536
+ },
537
+ {
538
+ "epoch": 1.52,
539
+ "grad_norm": 0.17364977300167084,
540
+ "learning_rate": 0.00013594608759313833,
541
+ "loss": 0.1862,
542
+ "step": 1900
543
+ },
544
+ {
545
+ "epoch": 1.54,
546
+ "grad_norm": 0.17725440859794617,
547
+ "learning_rate": 0.00012536027570691938,
548
+ "loss": 0.1858,
549
+ "step": 1925
550
+ },
551
+ {
552
+ "epoch": 1.56,
553
+ "grad_norm": 0.1515907198190689,
554
+ "learning_rate": 0.00011514418799777554,
555
+ "loss": 0.1758,
556
+ "step": 1950
557
+ },
558
+ {
559
+ "epoch": 1.58,
560
+ "grad_norm": 0.1727982610464096,
561
+ "learning_rate": 0.0001053079065108728,
562
+ "loss": 0.1884,
563
+ "step": 1975
564
+ },
565
+ {
566
+ "epoch": 1.6,
567
+ "grad_norm": 0.15956495702266693,
568
+ "learning_rate": 9.586113846848982e-05,
569
+ "loss": 0.1863,
570
+ "step": 2000
571
+ },
572
+ {
573
+ "epoch": 1.62,
574
+ "grad_norm": 0.14366982877254486,
575
+ "learning_rate": 8.68132066901623e-05,
576
+ "loss": 0.1744,
577
+ "step": 2025
578
+ },
579
+ {
580
+ "epoch": 1.6400000000000001,
581
+ "grad_norm": 0.15725454688072205,
582
+ "learning_rate": 7.81730403921856e-05,
583
+ "loss": 0.1827,
584
+ "step": 2050
585
+ },
586
+ {
587
+ "epoch": 1.6600000000000001,
588
+ "grad_norm": 0.13973355293273926,
589
+ "learning_rate": 6.994916637555571e-05,
590
+ "loss": 0.1865,
591
+ "step": 2075
592
+ },
593
+ {
594
+ "epoch": 1.6800000000000002,
595
+ "grad_norm": 0.16779528558254242,
596
+ "learning_rate": 6.214970061104686e-05,
597
+ "loss": 0.1843,
598
+ "step": 2100
599
+ },
600
+ {
601
+ "epoch": 1.7,
602
+ "grad_norm": 0.15546084940433502,
603
+ "learning_rate": 5.4782340229727555e-05,
604
+ "loss": 0.1828,
605
+ "step": 2125
606
+ },
607
+ {
608
+ "epoch": 1.72,
609
+ "grad_norm": 0.16239280998706818,
610
+ "learning_rate": 4.785435592682219e-05,
611
+ "loss": 0.1751,
612
+ "step": 2150
613
+ },
614
+ {
615
+ "epoch": 1.74,
616
+ "grad_norm": 0.1440260261297226,
617
+ "learning_rate": 4.137258478641176e-05,
618
+ "loss": 0.1909,
619
+ "step": 2175
620
+ },
621
+ {
622
+ "epoch": 1.76,
623
+ "grad_norm": 0.15595702826976776,
624
+ "learning_rate": 3.534342353405834e-05,
625
+ "loss": 0.1822,
626
+ "step": 2200
627
+ },
628
+ {
629
+ "epoch": 1.78,
630
+ "grad_norm": 0.15584272146224976,
631
+ "learning_rate": 2.9772822224008513e-05,
632
+ "loss": 0.1769,
633
+ "step": 2225
634
+ },
635
+ {
636
+ "epoch": 1.8,
637
+ "grad_norm": 0.15885038673877716,
638
+ "learning_rate": 2.4666278367208418e-05,
639
+ "loss": 0.1849,
640
+ "step": 2250
641
+ },
642
+ {
643
+ "epoch": 1.8199999999999998,
644
+ "grad_norm": 0.1490071564912796,
645
+ "learning_rate": 2.0028831505924162e-05,
646
+ "loss": 0.177,
647
+ "step": 2275
648
+ },
649
+ {
650
+ "epoch": 1.8399999999999999,
651
+ "grad_norm": 0.16527613997459412,
652
+ "learning_rate": 1.586505824032214e-05,
653
+ "loss": 0.1799,
654
+ "step": 2300
655
+ },
656
+ {
657
+ "epoch": 1.8599999999999999,
658
+ "grad_norm": 0.1699392944574356,
659
+ "learning_rate": 1.2179067711917014e-05,
660
+ "loss": 0.1848,
661
+ "step": 2325
662
+ },
663
+ {
664
+ "epoch": 1.88,
665
+ "grad_norm": 0.1474064588546753,
666
+ "learning_rate": 8.974497548345395e-06,
667
+ "loss": 0.1751,
668
+ "step": 2350
669
+ },
670
+ {
671
+ "epoch": 1.9,
672
+ "grad_norm": 0.17423242330551147,
673
+ "learning_rate": 6.254510273466185e-06,
674
+ "loss": 0.1978,
675
+ "step": 2375
676
+ },
677
+ {
678
+ "epoch": 1.92,
679
+ "grad_norm": 0.16892707347869873,
680
+ "learning_rate": 4.021790186331753e-06,
681
+ "loss": 0.1844,
682
+ "step": 2400
683
+ },
684
+ {
685
+ "epoch": 1.94,
686
+ "grad_norm": 0.19659648835659027,
687
+ "learning_rate": 2.2785407121084233e-06,
688
+ "loss": 0.1774,
689
+ "step": 2425
690
+ },
691
+ {
692
+ "epoch": 1.96,
693
+ "grad_norm": 0.14788737893104553,
694
+ "learning_rate": 1.026482227562242e-06,
695
+ "loss": 0.1794,
696
+ "step": 2450
697
+ },
698
+ {
699
+ "epoch": 1.98,
700
+ "grad_norm": 0.15226797759532928,
701
+ "learning_rate": 2.668503632545782e-07,
702
+ "loss": 0.1848,
703
+ "step": 2475
704
+ },
705
+ {
706
+ "epoch": 2.0,
707
+ "grad_norm": 0.13729043304920197,
708
+ "learning_rate": 3.947841241136452e-10,
709
+ "loss": 0.1815,
710
+ "step": 2500
711
+ },
712
+ {
713
+ "epoch": 2.0,
714
+ "step": 2500,
715
+ "total_flos": 1.62588235137024e+18,
716
+ "train_loss": 0.22959588203430176,
717
+ "train_runtime": 2193.5556,
718
+ "train_samples_per_second": 36.47,
719
+ "train_steps_per_second": 1.14
720
+ }
721
+ ],
722
+ "logging_steps": 25,
723
+ "max_steps": 2500,
724
+ "num_input_tokens_seen": 0,
725
+ "num_train_epochs": 2,
726
+ "save_steps": 0,
727
+ "stateful_callbacks": {
728
+ "TrainerControl": {
729
+ "args": {
730
+ "should_epoch_stop": false,
731
+ "should_evaluate": false,
732
+ "should_log": false,
733
+ "should_save": false,
734
+ "should_training_stop": false
735
+ },
736
+ "attributes": {}
737
+ }
738
+ },
739
+ "total_flos": 1.62588235137024e+18,
740
+ "train_batch_size": 32,
741
+ "trial_name": null,
742
+ "trial_params": null
743
+ }
nl_tasks/exprep/run_ex22_2ep/ft/adapter_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "drop_out": 0.0,
6
+ "inference_mode": false,
7
+ "layers_to_transform": null,
8
+ "modules_to_save": null,
9
+ "num_rotations": 1,
10
+ "peft_type": "ROTATION",
11
+ "r": 16,
12
+ "revision": null,
13
+ "target_modules": [
14
+ "v_proj",
15
+ "q_proj"
16
+ ],
17
+ "target_modules_to_skip": null,
18
+ "task_type": "CAUSAL_LM"
19
+ }
nl_tasks/exprep/run_ex22_2ep/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<unk>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
nl_tasks/exprep/run_ex22_2ep/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exprep/run_ex22_2ep/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
nl_tasks/exprep/run_ex22_2ep/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 512,
37
+ "pad_token": "<unk>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
nl_tasks/exprep/run_ex22_2ep/ft2/adapter_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "drop_out": 0.0,
6
+ "inference_mode": true,
7
+ "layers_to_transform": null,
8
+ "modules_to_save": null,
9
+ "num_rotations": 1,
10
+ "peft_type": "ROTATION",
11
+ "r": 16,
12
+ "revision": null,
13
+ "target_modules": [
14
+ "v_proj",
15
+ "q_proj"
16
+ ],
17
+ "target_modules_to_skip": null,
18
+ "task_type": "CAUSAL_LM"
19
+ }
nl_tasks/exprep/run_ex22_2ep/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1eb0157103be76af903b8368fbc6c8cd4cd0b7723f5114dab11ecd5f66ca403a
3
+ size 33602915
nl_tasks/exprep/run_ex22_2ep/output.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+
2
+ MATH math MAX TOKEN = 1408, length==== 5000, math acc %====, 7.82
3
+
4
+ gsm8k MAX TOKEN = 1024, length==== 1319, gsm8k acc %====, 49.73464746019712
nl_tasks/exprep/run_ex22_2ep/trainer_state.json ADDED
@@ -0,0 +1,743 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 100,
7
+ "global_step": 2500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.02,
14
+ "grad_norm": 0.18882547318935394,
15
+ "learning_rate": 0.0009997726215503421,
16
+ "loss": 0.4826,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 0.04,
21
+ "grad_norm": 0.1954464167356491,
22
+ "learning_rate": 0.0009990524226456182,
23
+ "loss": 0.3389,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 0.06,
28
+ "grad_norm": 0.2120068222284317,
29
+ "learning_rate": 0.000997839719251072,
30
+ "loss": 0.3338,
31
+ "step": 75
32
+ },
33
+ {
34
+ "epoch": 0.08,
35
+ "grad_norm": 0.22273743152618408,
36
+ "learning_rate": 0.0009961357081585429,
37
+ "loss": 0.315,
38
+ "step": 100
39
+ },
40
+ {
41
+ "epoch": 0.1,
42
+ "grad_norm": 0.34039777517318726,
43
+ "learning_rate": 0.0009939420710212512,
44
+ "loss": 0.3076,
45
+ "step": 125
46
+ },
47
+ {
48
+ "epoch": 0.12,
49
+ "grad_norm": 0.23580029606819153,
50
+ "learning_rate": 0.0009912609726942104,
51
+ "loss": 0.3012,
52
+ "step": 150
53
+ },
54
+ {
55
+ "epoch": 0.14,
56
+ "grad_norm": 0.19439291954040527,
57
+ "learning_rate": 0.0009880950590977764,
58
+ "loss": 0.2978,
59
+ "step": 175
60
+ },
61
+ {
62
+ "epoch": 0.16,
63
+ "grad_norm": 0.2402939349412918,
64
+ "learning_rate": 0.0009844474546064435,
65
+ "loss": 0.3006,
66
+ "step": 200
67
+ },
68
+ {
69
+ "epoch": 0.18,
70
+ "grad_norm": 0.2308524250984192,
71
+ "learning_rate": 0.000980321758965464,
72
+ "loss": 0.2958,
73
+ "step": 225
74
+ },
75
+ {
76
+ "epoch": 0.2,
77
+ "grad_norm": 0.22124530375003815,
78
+ "learning_rate": 0.0009757220437383345,
79
+ "loss": 0.2908,
80
+ "step": 250
81
+ },
82
+ {
83
+ "epoch": 0.22,
84
+ "grad_norm": 0.23367883265018463,
85
+ "learning_rate": 0.0009706528482886534,
86
+ "loss": 0.2902,
87
+ "step": 275
88
+ },
89
+ {
90
+ "epoch": 0.24,
91
+ "grad_norm": 0.2590882182121277,
92
+ "learning_rate": 0.0009651191753003186,
93
+ "loss": 0.3007,
94
+ "step": 300
95
+ },
96
+ {
97
+ "epoch": 0.26,
98
+ "grad_norm": 0.21626244485378265,
99
+ "learning_rate": 0.0009591264858404809,
100
+ "loss": 0.2903,
101
+ "step": 325
102
+ },
103
+ {
104
+ "epoch": 0.28,
105
+ "grad_norm": 0.24277330935001373,
106
+ "learning_rate": 0.000952680693970131,
107
+ "loss": 0.2927,
108
+ "step": 350
109
+ },
110
+ {
111
+ "epoch": 0.3,
112
+ "grad_norm": 0.21639184653759003,
113
+ "learning_rate": 0.0009457881609076351,
114
+ "loss": 0.2848,
115
+ "step": 375
116
+ },
117
+ {
118
+ "epoch": 0.32,
119
+ "grad_norm": 0.28448352217674255,
120
+ "learning_rate": 0.0009384556887509802,
121
+ "loss": 0.2853,
122
+ "step": 400
123
+ },
124
+ {
125
+ "epoch": 0.34,
126
+ "grad_norm": 0.19180361926555634,
127
+ "learning_rate": 0.000930690513764925,
128
+ "loss": 0.2745,
129
+ "step": 425
130
+ },
131
+ {
132
+ "epoch": 0.36,
133
+ "grad_norm": 0.2169312685728073,
134
+ "learning_rate": 0.0009225002992396796,
135
+ "loss": 0.2767,
136
+ "step": 450
137
+ },
138
+ {
139
+ "epoch": 0.38,
140
+ "grad_norm": 0.19704866409301758,
141
+ "learning_rate": 0.000913893127928164,
142
+ "loss": 0.2726,
143
+ "step": 475
144
+ },
145
+ {
146
+ "epoch": 0.4,
147
+ "grad_norm": 0.21796244382858276,
148
+ "learning_rate": 0.0009048774940693062,
149
+ "loss": 0.2712,
150
+ "step": 500
151
+ },
152
+ {
153
+ "epoch": 0.42,
154
+ "grad_norm": 0.1845843344926834,
155
+ "learning_rate": 0.0008954622950052542,
156
+ "loss": 0.2704,
157
+ "step": 525
158
+ },
159
+ {
160
+ "epoch": 0.44,
161
+ "grad_norm": 0.26246872544288635,
162
+ "learning_rate": 0.0008856568224007735,
163
+ "loss": 0.2589,
164
+ "step": 550
165
+ },
166
+ {
167
+ "epoch": 0.46,
168
+ "grad_norm": 0.24823100864887238,
169
+ "learning_rate": 0.0008754707530734958,
170
+ "loss": 0.278,
171
+ "step": 575
172
+ },
173
+ {
174
+ "epoch": 0.48,
175
+ "grad_norm": 0.27403074502944946,
176
+ "learning_rate": 0.0008649141394440677,
177
+ "loss": 0.2637,
178
+ "step": 600
179
+ },
180
+ {
181
+ "epoch": 0.5,
182
+ "grad_norm": 0.19122664630413055,
183
+ "learning_rate": 0.0008539973996156264,
184
+ "loss": 0.2774,
185
+ "step": 625
186
+ },
187
+ {
188
+ "epoch": 0.52,
189
+ "grad_norm": 0.19332978129386902,
190
+ "learning_rate": 0.0008427313070923884,
191
+ "loss": 0.2659,
192
+ "step": 650
193
+ },
194
+ {
195
+ "epoch": 0.54,
196
+ "grad_norm": 0.1755480021238327,
197
+ "learning_rate": 0.0008311269801475025,
198
+ "loss": 0.2566,
199
+ "step": 675
200
+ },
201
+ {
202
+ "epoch": 0.56,
203
+ "grad_norm": 0.19162705540657043,
204
+ "learning_rate": 0.0008191958708506557,
205
+ "loss": 0.2563,
206
+ "step": 700
207
+ },
208
+ {
209
+ "epoch": 0.58,
210
+ "grad_norm": 0.2055402398109436,
211
+ "learning_rate": 0.0008069497537662638,
212
+ "loss": 0.2628,
213
+ "step": 725
214
+ },
215
+ {
216
+ "epoch": 0.6,
217
+ "grad_norm": 0.2031162679195404,
218
+ "learning_rate": 0.0007944007143333976,
219
+ "loss": 0.2602,
220
+ "step": 750
221
+ },
222
+ {
223
+ "epoch": 0.62,
224
+ "grad_norm": 0.19989728927612305,
225
+ "learning_rate": 0.0007815611369389133,
226
+ "loss": 0.2486,
227
+ "step": 775
228
+ },
229
+ {
230
+ "epoch": 0.64,
231
+ "grad_norm": 0.20189844071865082,
232
+ "learning_rate": 0.0007684436926955582,
233
+ "loss": 0.2521,
234
+ "step": 800
235
+ },
236
+ {
237
+ "epoch": 0.66,
238
+ "grad_norm": 0.22558659315109253,
239
+ "learning_rate": 0.0007550613269371124,
240
+ "loss": 0.2455,
241
+ "step": 825
242
+ },
243
+ {
244
+ "epoch": 0.68,
245
+ "grad_norm": 0.18982113897800446,
246
+ "learning_rate": 0.0007414272464429068,
247
+ "loss": 0.2474,
248
+ "step": 850
249
+ },
250
+ {
251
+ "epoch": 0.7,
252
+ "grad_norm": 0.16264374554157257,
253
+ "learning_rate": 0.0007275549064043269,
254
+ "loss": 0.2537,
255
+ "step": 875
256
+ },
257
+ {
258
+ "epoch": 0.72,
259
+ "grad_norm": 0.17321237921714783,
260
+ "learning_rate": 0.0007134579971461626,
261
+ "loss": 0.2488,
262
+ "step": 900
263
+ },
264
+ {
265
+ "epoch": 0.74,
266
+ "grad_norm": 0.17718718945980072,
267
+ "learning_rate": 0.0006991504306159115,
268
+ "loss": 0.2463,
269
+ "step": 925
270
+ },
271
+ {
272
+ "epoch": 0.76,
273
+ "grad_norm": 0.18864493072032928,
274
+ "learning_rate": 0.0006846463266543652,
275
+ "loss": 0.2434,
276
+ "step": 950
277
+ },
278
+ {
279
+ "epoch": 0.78,
280
+ "grad_norm": 0.1813870519399643,
281
+ "learning_rate": 0.0006699599990610323,
282
+ "loss": 0.2386,
283
+ "step": 975
284
+ },
285
+ {
286
+ "epoch": 0.8,
287
+ "grad_norm": 0.21876706182956696,
288
+ "learning_rate": 0.0006551059414681455,
289
+ "loss": 0.2458,
290
+ "step": 1000
291
+ },
292
+ {
293
+ "epoch": 0.82,
294
+ "grad_norm": 0.18162848055362701,
295
+ "learning_rate": 0.0006400988130371969,
296
+ "loss": 0.2447,
297
+ "step": 1025
298
+ },
299
+ {
300
+ "epoch": 0.84,
301
+ "grad_norm": 0.1748015582561493,
302
+ "learning_rate": 0.0006249534239921153,
303
+ "loss": 0.2349,
304
+ "step": 1050
305
+ },
306
+ {
307
+ "epoch": 0.86,
308
+ "grad_norm": 0.1879139095544815,
309
+ "learning_rate": 0.000609684721003363,
310
+ "loss": 0.2326,
311
+ "step": 1075
312
+ },
313
+ {
314
+ "epoch": 0.88,
315
+ "grad_norm": 0.17181971669197083,
316
+ "learning_rate": 0.0005943077724373775,
317
+ "loss": 0.2313,
318
+ "step": 1100
319
+ },
320
+ {
321
+ "epoch": 0.9,
322
+ "grad_norm": 0.17912223935127258,
323
+ "learning_rate": 0.0005788377534859114,
324
+ "loss": 0.2407,
325
+ "step": 1125
326
+ },
327
+ {
328
+ "epoch": 0.92,
329
+ "grad_norm": 0.15652291476726532,
330
+ "learning_rate": 0.0005632899311899521,
331
+ "loss": 0.2337,
332
+ "step": 1150
333
+ },
334
+ {
335
+ "epoch": 0.94,
336
+ "grad_norm": 0.17756380140781403,
337
+ "learning_rate": 0.0005476796493729943,
338
+ "loss": 0.2376,
339
+ "step": 1175
340
+ },
341
+ {
342
+ "epoch": 0.96,
343
+ "grad_norm": 0.162031352519989,
344
+ "learning_rate": 0.0005320223134985392,
345
+ "loss": 0.2361,
346
+ "step": 1200
347
+ },
348
+ {
349
+ "epoch": 0.98,
350
+ "grad_norm": 0.17621825635433197,
351
+ "learning_rate": 0.000516333375466762,
352
+ "loss": 0.2381,
353
+ "step": 1225
354
+ },
355
+ {
356
+ "epoch": 1.0,
357
+ "grad_norm": 0.1739271581172943,
358
+ "learning_rate": 0.0005006283183653513,
359
+ "loss": 0.2269,
360
+ "step": 1250
361
+ },
362
+ {
363
+ "epoch": 1.02,
364
+ "grad_norm": 0.17948779463768005,
365
+ "learning_rate": 0.0004849226411895716,
366
+ "loss": 0.1949,
367
+ "step": 1275
368
+ },
369
+ {
370
+ "epoch": 1.04,
371
+ "grad_norm": 0.14728158712387085,
372
+ "learning_rate": 0.0004692318435466265,
373
+ "loss": 0.2019,
374
+ "step": 1300
375
+ },
376
+ {
377
+ "epoch": 1.06,
378
+ "grad_norm": 0.17756135761737823,
379
+ "learning_rate": 0.0004535714103594162,
380
+ "loss": 0.1973,
381
+ "step": 1325
382
+ },
383
+ {
384
+ "epoch": 1.08,
385
+ "grad_norm": 0.17123408615589142,
386
+ "learning_rate": 0.0004379567965847896,
387
+ "loss": 0.2029,
388
+ "step": 1350
389
+ },
390
+ {
391
+ "epoch": 1.1,
392
+ "grad_norm": 0.15491564571857452,
393
+ "learning_rate": 0.000422403411961367,
394
+ "loss": 0.2064,
395
+ "step": 1375
396
+ },
397
+ {
398
+ "epoch": 1.12,
399
+ "grad_norm": 0.16682790219783783,
400
+ "learning_rate": 0.00040692660580198903,
401
+ "loss": 0.2042,
402
+ "step": 1400
403
+ },
404
+ {
405
+ "epoch": 1.1400000000000001,
406
+ "grad_norm": 0.16679638624191284,
407
+ "learning_rate": 0.00039154165184579736,
408
+ "loss": 0.196,
409
+ "step": 1425
410
+ },
411
+ {
412
+ "epoch": 1.16,
413
+ "grad_norm": 0.16937030851840973,
414
+ "learning_rate": 0.00037626373318489886,
415
+ "loss": 0.2032,
416
+ "step": 1450
417
+ },
418
+ {
419
+ "epoch": 1.18,
420
+ "grad_norm": 0.15793775022029877,
421
+ "learning_rate": 0.00036110792728048633,
422
+ "loss": 0.1999,
423
+ "step": 1475
424
+ },
425
+ {
426
+ "epoch": 1.2,
427
+ "grad_norm": 0.1879836767911911,
428
+ "learning_rate": 0.00034608919108320487,
429
+ "loss": 0.2011,
430
+ "step": 1500
431
+ },
432
+ {
433
+ "epoch": 1.22,
434
+ "grad_norm": 0.15118607878684998,
435
+ "learning_rate": 0.0003312223462724472,
436
+ "loss": 0.2025,
437
+ "step": 1525
438
+ },
439
+ {
440
+ "epoch": 1.24,
441
+ "grad_norm": 0.1501808613538742,
442
+ "learning_rate": 0.0003165220646291454,
443
+ "loss": 0.1955,
444
+ "step": 1550
445
+ },
446
+ {
447
+ "epoch": 1.26,
448
+ "grad_norm": 0.15337912738323212,
449
+ "learning_rate": 0.000302002853556495,
450
+ "loss": 0.1949,
451
+ "step": 1575
452
+ },
453
+ {
454
+ "epoch": 1.28,
455
+ "grad_norm": 0.15560321509838104,
456
+ "learning_rate": 0.0002876790417628994,
457
+ "loss": 0.1897,
458
+ "step": 1600
459
+ },
460
+ {
461
+ "epoch": 1.3,
462
+ "grad_norm": 0.16006162762641907,
463
+ "learning_rate": 0.00027356476512126383,
464
+ "loss": 0.1901,
465
+ "step": 1625
466
+ },
467
+ {
468
+ "epoch": 1.32,
469
+ "grad_norm": 0.17619337141513824,
470
+ "learning_rate": 0.0002596739527185961,
471
+ "loss": 0.1856,
472
+ "step": 1650
473
+ },
474
+ {
475
+ "epoch": 1.34,
476
+ "grad_norm": 0.14214114844799042,
477
+ "learning_rate": 0.0002460203131096801,
478
+ "loss": 0.1898,
479
+ "step": 1675
480
+ },
481
+ {
482
+ "epoch": 1.3599999999999999,
483
+ "grad_norm": 0.15580157935619354,
484
+ "learning_rate": 0.00023261732078838537,
485
+ "loss": 0.1887,
486
+ "step": 1700
487
+ },
488
+ {
489
+ "epoch": 1.38,
490
+ "grad_norm": 0.15826581418514252,
491
+ "learning_rate": 0.00021947820288997067,
492
+ "loss": 0.1892,
493
+ "step": 1725
494
+ },
495
+ {
496
+ "epoch": 1.4,
497
+ "grad_norm": 0.17906101047992706,
498
+ "learning_rate": 0.00020661592613749636,
499
+ "loss": 0.1889,
500
+ "step": 1750
501
+ },
502
+ {
503
+ "epoch": 1.42,
504
+ "grad_norm": 0.15785814821720123,
505
+ "learning_rate": 0.00019404318404523603,
506
+ "loss": 0.1897,
507
+ "step": 1775
508
+ },
509
+ {
510
+ "epoch": 1.44,
511
+ "grad_norm": 0.16421250998973846,
512
+ "learning_rate": 0.00018177238439170883,
513
+ "loss": 0.1783,
514
+ "step": 1800
515
+ },
516
+ {
517
+ "epoch": 1.46,
518
+ "grad_norm": 0.14710333943367004,
519
+ "learning_rate": 0.00016981563697470158,
520
+ "loss": 0.1844,
521
+ "step": 1825
522
+ },
523
+ {
524
+ "epoch": 1.48,
525
+ "grad_norm": 0.19257409870624542,
526
+ "learning_rate": 0.00015818474166035906,
527
+ "loss": 0.1881,
528
+ "step": 1850
529
+ },
530
+ {
531
+ "epoch": 1.5,
532
+ "grad_norm": 0.1730620265007019,
533
+ "learning_rate": 0.00014689117673814133,
534
+ "loss": 0.1884,
535
+ "step": 1875
536
+ },
537
+ {
538
+ "epoch": 1.52,
539
+ "grad_norm": 0.1598033905029297,
540
+ "learning_rate": 0.00013594608759313833,
541
+ "loss": 0.1869,
542
+ "step": 1900
543
+ },
544
+ {
545
+ "epoch": 1.54,
546
+ "grad_norm": 0.16913259029388428,
547
+ "learning_rate": 0.00012536027570691938,
548
+ "loss": 0.1853,
549
+ "step": 1925
550
+ },
551
+ {
552
+ "epoch": 1.56,
553
+ "grad_norm": 0.15878750383853912,
554
+ "learning_rate": 0.00011514418799777554,
555
+ "loss": 0.1762,
556
+ "step": 1950
557
+ },
558
+ {
559
+ "epoch": 1.58,
560
+ "grad_norm": 0.17556917667388916,
561
+ "learning_rate": 0.0001053079065108728,
562
+ "loss": 0.1884,
563
+ "step": 1975
564
+ },
565
+ {
566
+ "epoch": 1.6,
567
+ "grad_norm": 0.1578860580921173,
568
+ "learning_rate": 9.586113846848982e-05,
569
+ "loss": 0.1863,
570
+ "step": 2000
571
+ },
572
+ {
573
+ "epoch": 1.62,
574
+ "grad_norm": 0.14570850133895874,
575
+ "learning_rate": 8.68132066901623e-05,
576
+ "loss": 0.1752,
577
+ "step": 2025
578
+ },
579
+ {
580
+ "epoch": 1.6400000000000001,
581
+ "grad_norm": 0.15816844999790192,
582
+ "learning_rate": 7.81730403921856e-05,
583
+ "loss": 0.1833,
584
+ "step": 2050
585
+ },
586
+ {
587
+ "epoch": 1.6600000000000001,
588
+ "grad_norm": 0.15224431455135345,
589
+ "learning_rate": 6.994916637555571e-05,
590
+ "loss": 0.1872,
591
+ "step": 2075
592
+ },
593
+ {
594
+ "epoch": 1.6800000000000002,
595
+ "grad_norm": 0.17057636380195618,
596
+ "learning_rate": 6.214970061104686e-05,
597
+ "loss": 0.1841,
598
+ "step": 2100
599
+ },
600
+ {
601
+ "epoch": 1.7,
602
+ "grad_norm": 0.15019550919532776,
603
+ "learning_rate": 5.4782340229727555e-05,
604
+ "loss": 0.1827,
605
+ "step": 2125
606
+ },
607
+ {
608
+ "epoch": 1.72,
609
+ "grad_norm": 0.16247619688510895,
610
+ "learning_rate": 4.785435592682219e-05,
611
+ "loss": 0.1755,
612
+ "step": 2150
613
+ },
614
+ {
615
+ "epoch": 1.74,
616
+ "grad_norm": 0.14806580543518066,
617
+ "learning_rate": 4.137258478641176e-05,
618
+ "loss": 0.1914,
619
+ "step": 2175
620
+ },
621
+ {
622
+ "epoch": 1.76,
623
+ "grad_norm": 0.159623384475708,
624
+ "learning_rate": 3.534342353405834e-05,
625
+ "loss": 0.1821,
626
+ "step": 2200
627
+ },
628
+ {
629
+ "epoch": 1.78,
630
+ "grad_norm": 0.16583149135112762,
631
+ "learning_rate": 2.9772822224008513e-05,
632
+ "loss": 0.1773,
633
+ "step": 2225
634
+ },
635
+ {
636
+ "epoch": 1.8,
637
+ "grad_norm": 0.16739916801452637,
638
+ "learning_rate": 2.4666278367208418e-05,
639
+ "loss": 0.1854,
640
+ "step": 2250
641
+ },
642
+ {
643
+ "epoch": 1.8199999999999998,
644
+ "grad_norm": 0.1582023650407791,
645
+ "learning_rate": 2.0028831505924162e-05,
646
+ "loss": 0.1766,
647
+ "step": 2275
648
+ },
649
+ {
650
+ "epoch": 1.8399999999999999,
651
+ "grad_norm": 0.17545472085475922,
652
+ "learning_rate": 1.586505824032214e-05,
653
+ "loss": 0.1803,
654
+ "step": 2300
655
+ },
656
+ {
657
+ "epoch": 1.8599999999999999,
658
+ "grad_norm": 0.16175879538059235,
659
+ "learning_rate": 1.2179067711917014e-05,
660
+ "loss": 0.1836,
661
+ "step": 2325
662
+ },
663
+ {
664
+ "epoch": 1.88,
665
+ "grad_norm": 0.1553802192211151,
666
+ "learning_rate": 8.974497548345395e-06,
667
+ "loss": 0.1752,
668
+ "step": 2350
669
+ },
670
+ {
671
+ "epoch": 1.9,
672
+ "grad_norm": 0.18709959089756012,
673
+ "learning_rate": 6.254510273466185e-06,
674
+ "loss": 0.1979,
675
+ "step": 2375
676
+ },
677
+ {
678
+ "epoch": 1.92,
679
+ "grad_norm": 0.16829761862754822,
680
+ "learning_rate": 4.021790186331753e-06,
681
+ "loss": 0.185,
682
+ "step": 2400
683
+ },
684
+ {
685
+ "epoch": 1.94,
686
+ "grad_norm": 0.18192219734191895,
687
+ "learning_rate": 2.2785407121084233e-06,
688
+ "loss": 0.1776,
689
+ "step": 2425
690
+ },
691
+ {
692
+ "epoch": 1.96,
693
+ "grad_norm": 0.14456026256084442,
694
+ "learning_rate": 1.026482227562242e-06,
695
+ "loss": 0.1798,
696
+ "step": 2450
697
+ },
698
+ {
699
+ "epoch": 1.98,
700
+ "grad_norm": 0.16220049560070038,
701
+ "learning_rate": 2.668503632545782e-07,
702
+ "loss": 0.1844,
703
+ "step": 2475
704
+ },
705
+ {
706
+ "epoch": 2.0,
707
+ "grad_norm": 0.14179222285747528,
708
+ "learning_rate": 3.947841241136452e-10,
709
+ "loss": 0.1811,
710
+ "step": 2500
711
+ },
712
+ {
713
+ "epoch": 2.0,
714
+ "step": 2500,
715
+ "total_flos": 1.62588235137024e+18,
716
+ "train_loss": 0.23000563049316405,
717
+ "train_runtime": 2190.0291,
718
+ "train_samples_per_second": 36.529,
719
+ "train_steps_per_second": 1.142
720
+ }
721
+ ],
722
+ "logging_steps": 25,
723
+ "max_steps": 2500,
724
+ "num_input_tokens_seen": 0,
725
+ "num_train_epochs": 2,
726
+ "save_steps": 0,
727
+ "stateful_callbacks": {
728
+ "TrainerControl": {
729
+ "args": {
730
+ "should_epoch_stop": false,
731
+ "should_evaluate": false,
732
+ "should_log": false,
733
+ "should_save": false,
734
+ "should_training_stop": false
735
+ },
736
+ "attributes": {}
737
+ }
738
+ },
739
+ "total_flos": 1.62588235137024e+18,
740
+ "train_batch_size": 32,
741
+ "trial_name": null,
742
+ "trial_params": null
743
+ }
nl_tasks/exprep/run_ex23_3ep/ft/adapter_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "drop_out": 0.0,
6
+ "inference_mode": false,
7
+ "layers_to_transform": null,
8
+ "modules_to_save": null,
9
+ "num_rotations": 1,
10
+ "peft_type": "ROTATION",
11
+ "r": 16,
12
+ "revision": null,
13
+ "target_modules": [
14
+ "v_proj",
15
+ "q_proj"
16
+ ],
17
+ "target_modules_to_skip": null,
18
+ "task_type": "CAUSAL_LM"
19
+ }
nl_tasks/exprep/run_ex23_3ep/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<unk>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
nl_tasks/exprep/run_ex23_3ep/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exprep/run_ex23_3ep/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
nl_tasks/exprep/run_ex23_3ep/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 512,
37
+ "pad_token": "<unk>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
nl_tasks/exprep/run_ex23_3ep/ft2/adapter_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "drop_out": 0.0,
6
+ "inference_mode": true,
7
+ "layers_to_transform": null,
8
+ "modules_to_save": null,
9
+ "num_rotations": 1,
10
+ "peft_type": "ROTATION",
11
+ "r": 16,
12
+ "revision": null,
13
+ "target_modules": [
14
+ "v_proj",
15
+ "q_proj"
16
+ ],
17
+ "target_modules_to_skip": null,
18
+ "task_type": "CAUSAL_LM"
19
+ }
nl_tasks/exprep/run_ex23_3ep/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d974f91a03a6c0817e827b06f4abd84cbc16d81f95bfe0311ee0d480ebebd8a
3
+ size 33602915
nl_tasks/exprep/run_ex23_3ep/output.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+
2
+ MATH math MAX TOKEN = 1408, length==== 5000, math acc %====, 8.4
3
+
4
+ gsm8k MAX TOKEN = 1024, length==== 1319, gsm8k acc %====, 49.65883244882487
nl_tasks/exprep/run_ex23_3ep/trainer_state.json ADDED
@@ -0,0 +1,1093 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 100,
7
+ "global_step": 3750,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.02,
14
+ "grad_norm": 0.2009446620941162,
15
+ "learning_rate": 0.0009998989386555814,
16
+ "loss": 0.5927,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 0.04,
21
+ "grad_norm": 0.2013162076473236,
22
+ "learning_rate": 0.0009995787805744778,
23
+ "loss": 0.3393,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 0.06,
28
+ "grad_norm": 0.2067354917526245,
29
+ "learning_rate": 0.000999039490728981,
30
+ "loss": 0.3324,
31
+ "step": 75
32
+ },
33
+ {
34
+ "epoch": 0.08,
35
+ "grad_norm": 0.2190377116203308,
36
+ "learning_rate": 0.000998281305669441,
37
+ "loss": 0.3145,
38
+ "step": 100
39
+ },
40
+ {
41
+ "epoch": 0.1,
42
+ "grad_norm": 0.2233731746673584,
43
+ "learning_rate": 0.0009973045579608833,
44
+ "loss": 0.3059,
45
+ "step": 125
46
+ },
47
+ {
48
+ "epoch": 0.12,
49
+ "grad_norm": 0.22983631491661072,
50
+ "learning_rate": 0.0009961096760371347,
51
+ "loss": 0.3007,
52
+ "step": 150
53
+ },
54
+ {
55
+ "epoch": 0.14,
56
+ "grad_norm": 0.201463520526886,
57
+ "learning_rate": 0.0009946971840128981,
58
+ "loss": 0.2997,
59
+ "step": 175
60
+ },
61
+ {
62
+ "epoch": 0.16,
63
+ "grad_norm": 0.3527778685092926,
64
+ "learning_rate": 0.0009930677014538588,
65
+ "loss": 0.3008,
66
+ "step": 200
67
+ },
68
+ {
69
+ "epoch": 0.18,
70
+ "grad_norm": 0.25101253390312195,
71
+ "learning_rate": 0.0009912219431049217,
72
+ "loss": 0.2969,
73
+ "step": 225
74
+ },
75
+ {
76
+ "epoch": 0.2,
77
+ "grad_norm": 0.2339434176683426,
78
+ "learning_rate": 0.0009891607185767018,
79
+ "loss": 0.2929,
80
+ "step": 250
81
+ },
82
+ {
83
+ "epoch": 0.22,
84
+ "grad_norm": 0.27121809124946594,
85
+ "learning_rate": 0.0009868849319904012,
86
+ "loss": 0.2907,
87
+ "step": 275
88
+ },
89
+ {
90
+ "epoch": 0.24,
91
+ "grad_norm": 0.2264082282781601,
92
+ "learning_rate": 0.000984395581581232,
93
+ "loss": 0.2996,
94
+ "step": 300
95
+ },
96
+ {
97
+ "epoch": 0.26,
98
+ "grad_norm": 0.21018308401107788,
99
+ "learning_rate": 0.000981693759260558,
100
+ "loss": 0.2932,
101
+ "step": 325
102
+ },
103
+ {
104
+ "epoch": 0.28,
105
+ "grad_norm": 0.23615872859954834,
106
+ "learning_rate": 0.0009787806501369446,
107
+ "loss": 0.2946,
108
+ "step": 350
109
+ },
110
+ {
111
+ "epoch": 0.3,
112
+ "grad_norm": 0.24151696264743805,
113
+ "learning_rate": 0.0009756575319963324,
114
+ "loss": 0.2842,
115
+ "step": 375
116
+ },
117
+ {
118
+ "epoch": 0.32,
119
+ "grad_norm": 0.22058282792568207,
120
+ "learning_rate": 0.0009723257747415584,
121
+ "loss": 0.2852,
122
+ "step": 400
123
+ },
124
+ {
125
+ "epoch": 0.34,
126
+ "grad_norm": 0.19068972766399384,
127
+ "learning_rate": 0.00096878683979147,
128
+ "loss": 0.2743,
129
+ "step": 425
130
+ },
131
+ {
132
+ "epoch": 0.36,
133
+ "grad_norm": 0.22631610929965973,
134
+ "learning_rate": 0.000965042279439899,
135
+ "loss": 0.2782,
136
+ "step": 450
137
+ },
138
+ {
139
+ "epoch": 0.38,
140
+ "grad_norm": 0.2208271324634552,
141
+ "learning_rate": 0.0009610937361747747,
142
+ "loss": 0.2731,
143
+ "step": 475
144
+ },
145
+ {
146
+ "epoch": 0.4,
147
+ "grad_norm": 0.22609329223632812,
148
+ "learning_rate": 0.0009569429419576737,
149
+ "loss": 0.2727,
150
+ "step": 500
151
+ },
152
+ {
153
+ "epoch": 0.42,
154
+ "grad_norm": 0.19579938054084778,
155
+ "learning_rate": 0.0009525917174641245,
156
+ "loss": 0.2727,
157
+ "step": 525
158
+ },
159
+ {
160
+ "epoch": 0.44,
161
+ "grad_norm": 0.20854559540748596,
162
+ "learning_rate": 0.0009480419712849994,
163
+ "loss": 0.2606,
164
+ "step": 550
165
+ },
166
+ {
167
+ "epoch": 0.46,
168
+ "grad_norm": 0.21399244666099548,
169
+ "learning_rate": 0.0009432956990893433,
170
+ "loss": 0.2789,
171
+ "step": 575
172
+ },
173
+ {
174
+ "epoch": 0.48,
175
+ "grad_norm": 0.21186676621437073,
176
+ "learning_rate": 0.0009383549827490066,
177
+ "loss": 0.2631,
178
+ "step": 600
179
+ },
180
+ {
181
+ "epoch": 0.5,
182
+ "grad_norm": 0.2264874279499054,
183
+ "learning_rate": 0.0009332219894254686,
184
+ "loss": 0.2782,
185
+ "step": 625
186
+ },
187
+ {
188
+ "epoch": 0.52,
189
+ "grad_norm": 0.20289474725723267,
190
+ "learning_rate": 0.0009278989706192479,
191
+ "loss": 0.2675,
192
+ "step": 650
193
+ },
194
+ {
195
+ "epoch": 0.54,
196
+ "grad_norm": 0.165157750248909,
197
+ "learning_rate": 0.0009223882611823205,
198
+ "loss": 0.2581,
199
+ "step": 675
200
+ },
201
+ {
202
+ "epoch": 0.56,
203
+ "grad_norm": 0.21674442291259766,
204
+ "learning_rate": 0.0009166922782939757,
205
+ "loss": 0.2587,
206
+ "step": 700
207
+ },
208
+ {
209
+ "epoch": 0.58,
210
+ "grad_norm": 0.20473162829875946,
211
+ "learning_rate": 0.0009108135204005628,
212
+ "loss": 0.265,
213
+ "step": 725
214
+ },
215
+ {
216
+ "epoch": 0.6,
217
+ "grad_norm": 0.21469838917255402,
218
+ "learning_rate": 0.0009047545661195884,
219
+ "loss": 0.2607,
220
+ "step": 750
221
+ },
222
+ {
223
+ "epoch": 0.62,
224
+ "grad_norm": 0.18564464151859283,
225
+ "learning_rate": 0.0008985180731086505,
226
+ "loss": 0.2502,
227
+ "step": 775
228
+ },
229
+ {
230
+ "epoch": 0.64,
231
+ "grad_norm": 0.19200780987739563,
232
+ "learning_rate": 0.0008921067768997017,
233
+ "loss": 0.2546,
234
+ "step": 800
235
+ },
236
+ {
237
+ "epoch": 0.66,
238
+ "grad_norm": 0.2028547078371048,
239
+ "learning_rate": 0.0008855234896991544,
240
+ "loss": 0.2474,
241
+ "step": 825
242
+ },
243
+ {
244
+ "epoch": 0.68,
245
+ "grad_norm": 0.18282544612884521,
246
+ "learning_rate": 0.0008787710991543547,
247
+ "loss": 0.2495,
248
+ "step": 850
249
+ },
250
+ {
251
+ "epoch": 0.7,
252
+ "grad_norm": 0.16127045452594757,
253
+ "learning_rate": 0.0008718525670869639,
254
+ "loss": 0.2564,
255
+ "step": 875
256
+ },
257
+ {
258
+ "epoch": 0.72,
259
+ "grad_norm": 0.17876853048801422,
260
+ "learning_rate": 0.0008647709281938065,
261
+ "loss": 0.2523,
262
+ "step": 900
263
+ },
264
+ {
265
+ "epoch": 0.74,
266
+ "grad_norm": 0.18054188787937164,
267
+ "learning_rate": 0.0008575292887157515,
268
+ "loss": 0.2481,
269
+ "step": 925
270
+ },
271
+ {
272
+ "epoch": 0.76,
273
+ "grad_norm": 0.18751810491085052,
274
+ "learning_rate": 0.0008501308250752123,
275
+ "loss": 0.2461,
276
+ "step": 950
277
+ },
278
+ {
279
+ "epoch": 0.78,
280
+ "grad_norm": 0.15981389582157135,
281
+ "learning_rate": 0.0008425787824828631,
282
+ "loss": 0.2428,
283
+ "step": 975
284
+ },
285
+ {
286
+ "epoch": 0.8,
287
+ "grad_norm": 0.21012845635414124,
288
+ "learning_rate": 0.0008348764735141823,
289
+ "loss": 0.2514,
290
+ "step": 1000
291
+ },
292
+ {
293
+ "epoch": 0.82,
294
+ "grad_norm": 0.20988446474075317,
295
+ "learning_rate": 0.0008270272766564472,
296
+ "loss": 0.2498,
297
+ "step": 1025
298
+ },
299
+ {
300
+ "epoch": 0.84,
301
+ "grad_norm": 0.1669292151927948,
302
+ "learning_rate": 0.000819034634826818,
303
+ "loss": 0.2382,
304
+ "step": 1050
305
+ },
306
+ {
307
+ "epoch": 0.86,
308
+ "grad_norm": 0.18733477592468262,
309
+ "learning_rate": 0.0008109020538621606,
310
+ "loss": 0.2363,
311
+ "step": 1075
312
+ },
313
+ {
314
+ "epoch": 0.88,
315
+ "grad_norm": 0.17588213086128235,
316
+ "learning_rate": 0.0008026331009812703,
317
+ "loss": 0.2367,
318
+ "step": 1100
319
+ },
320
+ {
321
+ "epoch": 0.9,
322
+ "grad_norm": 0.15529929101467133,
323
+ "learning_rate": 0.0007942314032201719,
324
+ "loss": 0.2443,
325
+ "step": 1125
326
+ },
327
+ {
328
+ "epoch": 0.92,
329
+ "grad_norm": 0.16888514161109924,
330
+ "learning_rate": 0.0007857006458411826,
331
+ "loss": 0.238,
332
+ "step": 1150
333
+ },
334
+ {
335
+ "epoch": 0.94,
336
+ "grad_norm": 0.17457760870456696,
337
+ "learning_rate": 0.0007770445707164325,
338
+ "loss": 0.241,
339
+ "step": 1175
340
+ },
341
+ {
342
+ "epoch": 0.96,
343
+ "grad_norm": 0.1416597217321396,
344
+ "learning_rate": 0.0007682669746865577,
345
+ "loss": 0.2397,
346
+ "step": 1200
347
+ },
348
+ {
349
+ "epoch": 0.98,
350
+ "grad_norm": 0.19417856633663177,
351
+ "learning_rate": 0.0007593717078952787,
352
+ "loss": 0.2427,
353
+ "step": 1225
354
+ },
355
+ {
356
+ "epoch": 1.0,
357
+ "grad_norm": 0.18347010016441345,
358
+ "learning_rate": 0.0007503626721006019,
359
+ "loss": 0.2315,
360
+ "step": 1250
361
+ },
362
+ {
363
+ "epoch": 1.02,
364
+ "grad_norm": 0.1707679182291031,
365
+ "learning_rate": 0.0007412438189633781,
366
+ "loss": 0.2006,
367
+ "step": 1275
368
+ },
369
+ {
370
+ "epoch": 1.04,
371
+ "grad_norm": 0.14253617823123932,
372
+ "learning_rate": 0.0007320191483139742,
373
+ "loss": 0.2094,
374
+ "step": 1300
375
+ },
376
+ {
377
+ "epoch": 1.06,
378
+ "grad_norm": 0.16302773356437683,
379
+ "learning_rate": 0.0007226927063978153,
380
+ "loss": 0.2046,
381
+ "step": 1325
382
+ },
383
+ {
384
+ "epoch": 1.08,
385
+ "grad_norm": 0.17237479984760284,
386
+ "learning_rate": 0.0007132685841005674,
387
+ "loss": 0.2106,
388
+ "step": 1350
389
+ },
390
+ {
391
+ "epoch": 1.1,
392
+ "grad_norm": 0.14080215990543365,
393
+ "learning_rate": 0.0007037509151537404,
394
+ "loss": 0.2139,
395
+ "step": 1375
396
+ },
397
+ {
398
+ "epoch": 1.12,
399
+ "grad_norm": 0.15111273527145386,
400
+ "learning_rate": 0.0006941438743214963,
401
+ "loss": 0.2118,
402
+ "step": 1400
403
+ },
404
+ {
405
+ "epoch": 1.1400000000000001,
406
+ "grad_norm": 0.16434533894062042,
407
+ "learning_rate": 0.0006844516755694598,
408
+ "loss": 0.2036,
409
+ "step": 1425
410
+ },
411
+ {
412
+ "epoch": 1.16,
413
+ "grad_norm": 0.1573331654071808,
414
+ "learning_rate": 0.0006746785702163335,
415
+ "loss": 0.2109,
416
+ "step": 1450
417
+ },
418
+ {
419
+ "epoch": 1.18,
420
+ "grad_norm": 0.14934468269348145,
421
+ "learning_rate": 0.0006648288450691298,
422
+ "loss": 0.2088,
423
+ "step": 1475
424
+ },
425
+ {
426
+ "epoch": 1.2,
427
+ "grad_norm": 0.1662275493144989,
428
+ "learning_rate": 0.0006549068205428343,
429
+ "loss": 0.2091,
430
+ "step": 1500
431
+ },
432
+ {
433
+ "epoch": 1.22,
434
+ "grad_norm": 0.15690551698207855,
435
+ "learning_rate": 0.0006449168487653305,
436
+ "loss": 0.2125,
437
+ "step": 1525
438
+ },
439
+ {
440
+ "epoch": 1.24,
441
+ "grad_norm": 0.13499705493450165,
442
+ "learning_rate": 0.0006348633116684117,
443
+ "loss": 0.2049,
444
+ "step": 1550
445
+ },
446
+ {
447
+ "epoch": 1.26,
448
+ "grad_norm": 0.14175556600093842,
449
+ "learning_rate": 0.0006247506190657209,
450
+ "loss": 0.2033,
451
+ "step": 1575
452
+ },
453
+ {
454
+ "epoch": 1.28,
455
+ "grad_norm": 0.14965271949768066,
456
+ "learning_rate": 0.0006145832067184614,
457
+ "loss": 0.1996,
458
+ "step": 1600
459
+ },
460
+ {
461
+ "epoch": 1.3,
462
+ "grad_norm": 0.15024539828300476,
463
+ "learning_rate": 0.0006043655343897249,
464
+ "loss": 0.2009,
465
+ "step": 1625
466
+ },
467
+ {
468
+ "epoch": 1.32,
469
+ "grad_norm": 0.16027085483074188,
470
+ "learning_rate": 0.0005941020838882917,
471
+ "loss": 0.1952,
472
+ "step": 1650
473
+ },
474
+ {
475
+ "epoch": 1.34,
476
+ "grad_norm": 0.15198029577732086,
477
+ "learning_rate": 0.000583797357102762,
478
+ "loss": 0.1996,
479
+ "step": 1675
480
+ },
481
+ {
482
+ "epoch": 1.3599999999999999,
483
+ "grad_norm": 0.14285215735435486,
484
+ "learning_rate": 0.0005734558740268789,
485
+ "loss": 0.1987,
486
+ "step": 1700
487
+ },
488
+ {
489
+ "epoch": 1.38,
490
+ "grad_norm": 0.14066115021705627,
491
+ "learning_rate": 0.000563082170776908,
492
+ "loss": 0.1985,
493
+ "step": 1725
494
+ },
495
+ {
496
+ "epoch": 1.4,
497
+ "grad_norm": 0.16289477050304413,
498
+ "learning_rate": 0.0005526807976019493,
499
+ "loss": 0.1998,
500
+ "step": 1750
501
+ },
502
+ {
503
+ "epoch": 1.42,
504
+ "grad_norm": 0.12963451445102692,
505
+ "learning_rate": 0.0005422563168880455,
506
+ "loss": 0.1999,
507
+ "step": 1775
508
+ },
509
+ {
510
+ "epoch": 1.44,
511
+ "grad_norm": 0.16663536429405212,
512
+ "learning_rate": 0.0005318133011569704,
513
+ "loss": 0.1892,
514
+ "step": 1800
515
+ },
516
+ {
517
+ "epoch": 1.46,
518
+ "grad_norm": 0.1475083827972412,
519
+ "learning_rate": 0.0005213563310605686,
520
+ "loss": 0.1952,
521
+ "step": 1825
522
+ },
523
+ {
524
+ "epoch": 1.48,
525
+ "grad_norm": 0.1764904111623764,
526
+ "learning_rate": 0.00051088999337153,
527
+ "loss": 0.1981,
528
+ "step": 1850
529
+ },
530
+ {
531
+ "epoch": 1.5,
532
+ "grad_norm": 0.144892156124115,
533
+ "learning_rate": 0.0005004188789714811,
534
+ "loss": 0.1991,
535
+ "step": 1875
536
+ },
537
+ {
538
+ "epoch": 1.52,
539
+ "grad_norm": 0.15162664651870728,
540
+ "learning_rate": 0.0004899475808372714,
541
+ "loss": 0.1968,
542
+ "step": 1900
543
+ },
544
+ {
545
+ "epoch": 1.54,
546
+ "grad_norm": 0.15749579668045044,
547
+ "learning_rate": 0.0004794806920263417,
548
+ "loss": 0.1956,
549
+ "step": 1925
550
+ },
551
+ {
552
+ "epoch": 1.56,
553
+ "grad_norm": 0.1367015242576599,
554
+ "learning_rate": 0.0004690228036620589,
555
+ "loss": 0.1863,
556
+ "step": 1950
557
+ },
558
+ {
559
+ "epoch": 1.58,
560
+ "grad_norm": 0.16991527378559113,
561
+ "learning_rate": 0.0004585785029198959,
562
+ "loss": 0.1989,
563
+ "step": 1975
564
+ },
565
+ {
566
+ "epoch": 1.6,
567
+ "grad_norm": 0.14471600949764252,
568
+ "learning_rate": 0.00044815237101534535,
569
+ "loss": 0.1986,
570
+ "step": 2000
571
+ },
572
+ {
573
+ "epoch": 1.62,
574
+ "grad_norm": 0.130776047706604,
575
+ "learning_rate": 0.0004377489811944478,
576
+ "loss": 0.1849,
577
+ "step": 2025
578
+ },
579
+ {
580
+ "epoch": 1.6400000000000001,
581
+ "grad_norm": 0.1394679993391037,
582
+ "learning_rate": 0.00042737289672781367,
583
+ "loss": 0.1942,
584
+ "step": 2050
585
+ },
586
+ {
587
+ "epoch": 1.6600000000000001,
588
+ "grad_norm": 0.13585664331912994,
589
+ "learning_rate": 0.0004170286689090228,
590
+ "loss": 0.1971,
591
+ "step": 2075
592
+ },
593
+ {
594
+ "epoch": 1.6800000000000002,
595
+ "grad_norm": 0.15923821926116943,
596
+ "learning_rate": 0.0004067208350582768,
597
+ "loss": 0.1946,
598
+ "step": 2100
599
+ },
600
+ {
601
+ "epoch": 1.7,
602
+ "grad_norm": 0.13468892872333527,
603
+ "learning_rate": 0.0003964539165321794,
604
+ "loss": 0.1931,
605
+ "step": 2125
606
+ },
607
+ {
608
+ "epoch": 1.72,
609
+ "grad_norm": 0.14454448223114014,
610
+ "learning_rate": 0.00038623241674052113,
611
+ "loss": 0.1852,
612
+ "step": 2150
613
+ },
614
+ {
615
+ "epoch": 1.74,
616
+ "grad_norm": 0.12442506849765778,
617
+ "learning_rate": 0.00037606081917093416,
618
+ "loss": 0.2022,
619
+ "step": 2175
620
+ },
621
+ {
622
+ "epoch": 1.76,
623
+ "grad_norm": 0.1347043663263321,
624
+ "learning_rate": 0.0003659435854222869,
625
+ "loss": 0.1928,
626
+ "step": 2200
627
+ },
628
+ {
629
+ "epoch": 1.78,
630
+ "grad_norm": 0.14994265139102936,
631
+ "learning_rate": 0.0003558851532476796,
632
+ "loss": 0.1857,
633
+ "step": 2225
634
+ },
635
+ {
636
+ "epoch": 1.8,
637
+ "grad_norm": 0.13082821667194366,
638
+ "learning_rate": 0.0003458899346078979,
639
+ "loss": 0.1961,
640
+ "step": 2250
641
+ },
642
+ {
643
+ "epoch": 1.8199999999999998,
644
+ "grad_norm": 0.13206897675991058,
645
+ "learning_rate": 0.00033596231373618247,
646
+ "loss": 0.1866,
647
+ "step": 2275
648
+ },
649
+ {
650
+ "epoch": 1.8399999999999999,
651
+ "grad_norm": 0.15052682161331177,
652
+ "learning_rate": 0.0003261066452151587,
653
+ "loss": 0.1895,
654
+ "step": 2300
655
+ },
656
+ {
657
+ "epoch": 1.8599999999999999,
658
+ "grad_norm": 0.14751003682613373,
659
+ "learning_rate": 0.0003163272520667726,
660
+ "loss": 0.1935,
661
+ "step": 2325
662
+ },
663
+ {
664
+ "epoch": 1.88,
665
+ "grad_norm": 0.13202938437461853,
666
+ "learning_rate": 0.00030662842385607126,
667
+ "loss": 0.1848,
668
+ "step": 2350
669
+ },
670
+ {
671
+ "epoch": 1.9,
672
+ "grad_norm": 0.16639895737171173,
673
+ "learning_rate": 0.0002970144148096568,
674
+ "loss": 0.2072,
675
+ "step": 2375
676
+ },
677
+ {
678
+ "epoch": 1.92,
679
+ "grad_norm": 0.1549963802099228,
680
+ "learning_rate": 0.0002874894419496431,
681
+ "loss": 0.193,
682
+ "step": 2400
683
+ },
684
+ {
685
+ "epoch": 1.94,
686
+ "grad_norm": 0.15094412863254547,
687
+ "learning_rate": 0.00027805768324393014,
688
+ "loss": 0.1854,
689
+ "step": 2425
690
+ },
691
+ {
692
+ "epoch": 1.96,
693
+ "grad_norm": 0.12347867339849472,
694
+ "learning_rate": 0.0002687232757736082,
695
+ "loss": 0.1868,
696
+ "step": 2450
697
+ },
698
+ {
699
+ "epoch": 1.98,
700
+ "grad_norm": 0.1314866691827774,
701
+ "learning_rate": 0.0002594903139182996,
702
+ "loss": 0.1927,
703
+ "step": 2475
704
+ },
705
+ {
706
+ "epoch": 2.0,
707
+ "grad_norm": 0.12323761731386185,
708
+ "learning_rate": 0.0002503628475602256,
709
+ "loss": 0.1888,
710
+ "step": 2500
711
+ },
712
+ {
713
+ "epoch": 2.02,
714
+ "grad_norm": 0.14616355299949646,
715
+ "learning_rate": 0.00024134488030779655,
716
+ "loss": 0.159,
717
+ "step": 2525
718
+ },
719
+ {
720
+ "epoch": 2.04,
721
+ "grad_norm": 0.16476809978485107,
722
+ "learning_rate": 0.00023244036773949656,
723
+ "loss": 0.1527,
724
+ "step": 2550
725
+ },
726
+ {
727
+ "epoch": 2.06,
728
+ "grad_norm": 0.1476244181394577,
729
+ "learning_rate": 0.00022365321566883433,
730
+ "loss": 0.1579,
731
+ "step": 2575
732
+ },
733
+ {
734
+ "epoch": 2.08,
735
+ "grad_norm": 0.15825843811035156,
736
+ "learning_rate": 0.0002149872784311262,
737
+ "loss": 0.1511,
738
+ "step": 2600
739
+ },
740
+ {
741
+ "epoch": 2.1,
742
+ "grad_norm": 0.13213805854320526,
743
+ "learning_rate": 0.00020644635719285705,
744
+ "loss": 0.1577,
745
+ "step": 2625
746
+ },
747
+ {
748
+ "epoch": 2.12,
749
+ "grad_norm": 0.12865176796913147,
750
+ "learning_rate": 0.0001980341982843616,
751
+ "loss": 0.1523,
752
+ "step": 2650
753
+ },
754
+ {
755
+ "epoch": 2.14,
756
+ "grad_norm": 0.14385676383972168,
757
+ "learning_rate": 0.0001897544915565616,
758
+ "loss": 0.1582,
759
+ "step": 2675
760
+ },
761
+ {
762
+ "epoch": 2.16,
763
+ "grad_norm": 0.15633924305438995,
764
+ "learning_rate": 0.0001816108687624749,
765
+ "loss": 0.1551,
766
+ "step": 2700
767
+ },
768
+ {
769
+ "epoch": 2.18,
770
+ "grad_norm": 0.13521474599838257,
771
+ "learning_rate": 0.00017360690196420813,
772
+ "loss": 0.1597,
773
+ "step": 2725
774
+ },
775
+ {
776
+ "epoch": 2.2,
777
+ "grad_norm": 0.14471761882305145,
778
+ "learning_rate": 0.0001657461019661326,
779
+ "loss": 0.1562,
780
+ "step": 2750
781
+ },
782
+ {
783
+ "epoch": 2.22,
784
+ "grad_norm": 0.13948746025562286,
785
+ "learning_rate": 0.0001580319167749294,
786
+ "loss": 0.1518,
787
+ "step": 2775
788
+ },
789
+ {
790
+ "epoch": 2.24,
791
+ "grad_norm": 0.14780576527118683,
792
+ "learning_rate": 0.00015046773008717967,
793
+ "loss": 0.1529,
794
+ "step": 2800
795
+ },
796
+ {
797
+ "epoch": 2.26,
798
+ "grad_norm": 0.13603238761425018,
799
+ "learning_rate": 0.00014305685980516293,
800
+ "loss": 0.1597,
801
+ "step": 2825
802
+ },
803
+ {
804
+ "epoch": 2.2800000000000002,
805
+ "grad_norm": 0.14286421239376068,
806
+ "learning_rate": 0.00013580255658151685,
807
+ "loss": 0.1503,
808
+ "step": 2850
809
+ },
810
+ {
811
+ "epoch": 2.3,
812
+ "grad_norm": 0.1361972689628601,
813
+ "learning_rate": 0.00012870800239339237,
814
+ "loss": 0.1535,
815
+ "step": 2875
816
+ },
817
+ {
818
+ "epoch": 2.32,
819
+ "grad_norm": 0.14035378396511078,
820
+ "learning_rate": 0.00012177630914673327,
821
+ "loss": 0.16,
822
+ "step": 2900
823
+ },
824
+ {
825
+ "epoch": 2.34,
826
+ "grad_norm": 0.12597130239009857,
827
+ "learning_rate": 0.00011501051731129224,
828
+ "loss": 0.1501,
829
+ "step": 2925
830
+ },
831
+ {
832
+ "epoch": 2.36,
833
+ "grad_norm": 0.1465565711259842,
834
+ "learning_rate": 0.00010841359458697985,
835
+ "loss": 0.1565,
836
+ "step": 2950
837
+ },
838
+ {
839
+ "epoch": 2.38,
840
+ "grad_norm": 0.14047019183635712,
841
+ "learning_rate": 0.00010198843460213336,
842
+ "loss": 0.1515,
843
+ "step": 2975
844
+ },
845
+ {
846
+ "epoch": 2.4,
847
+ "grad_norm": 0.1407793015241623,
848
+ "learning_rate": 9.573785564427562e-05,
849
+ "loss": 0.1517,
850
+ "step": 3000
851
+ },
852
+ {
853
+ "epoch": 2.42,
854
+ "grad_norm": 0.15099190175533295,
855
+ "learning_rate": 8.966459942392108e-05,
856
+ "loss": 0.1528,
857
+ "step": 3025
858
+ },
859
+ {
860
+ "epoch": 2.44,
861
+ "grad_norm": 0.14255790412425995,
862
+ "learning_rate": 8.3771329871971e-05,
863
+ "loss": 0.1554,
864
+ "step": 3050
865
+ },
866
+ {
867
+ "epoch": 2.46,
868
+ "grad_norm": 0.14905491471290588,
869
+ "learning_rate": 7.806063197122521e-05,
870
+ "loss": 0.1548,
871
+ "step": 3075
872
+ },
873
+ {
874
+ "epoch": 2.48,
875
+ "grad_norm": 0.15821439027786255,
876
+ "learning_rate": 7.253501062252338e-05,
877
+ "loss": 0.1503,
878
+ "step": 3100
879
+ },
880
+ {
881
+ "epoch": 2.5,
882
+ "grad_norm": 0.15232159197330475,
883
+ "learning_rate": 6.719688954601267e-05,
884
+ "loss": 0.1568,
885
+ "step": 3125
886
+ },
887
+ {
888
+ "epoch": 2.52,
889
+ "grad_norm": 0.14521776139736176,
890
+ "learning_rate": 6.204861021802333e-05,
891
+ "loss": 0.1506,
892
+ "step": 3150
893
+ },
894
+ {
895
+ "epoch": 2.54,
896
+ "grad_norm": 0.14902536571025848,
897
+ "learning_rate": 5.709243084402127e-05,
898
+ "loss": 0.1533,
899
+ "step": 3175
900
+ },
901
+ {
902
+ "epoch": 2.56,
903
+ "grad_norm": 0.13219137489795685,
904
+ "learning_rate": 5.2330525368083193e-05,
905
+ "loss": 0.1509,
906
+ "step": 3200
907
+ },
908
+ {
909
+ "epoch": 2.58,
910
+ "grad_norm": 0.13296589255332947,
911
+ "learning_rate": 4.776498251933292e-05,
912
+ "loss": 0.1482,
913
+ "step": 3225
914
+ },
915
+ {
916
+ "epoch": 2.6,
917
+ "grad_norm": 0.15772312879562378,
918
+ "learning_rate": 4.3397804895756956e-05,
919
+ "loss": 0.1489,
920
+ "step": 3250
921
+ },
922
+ {
923
+ "epoch": 2.62,
924
+ "grad_norm": 0.14949175715446472,
925
+ "learning_rate": 3.923090808579727e-05,
926
+ "loss": 0.1505,
927
+ "step": 3275
928
+ },
929
+ {
930
+ "epoch": 2.64,
931
+ "grad_norm": 0.14660567045211792,
932
+ "learning_rate": 3.5266119828111955e-05,
933
+ "loss": 0.1503,
934
+ "step": 3300
935
+ },
936
+ {
937
+ "epoch": 2.66,
938
+ "grad_norm": 0.1468629688024521,
939
+ "learning_rate": 3.150517920986851e-05,
940
+ "loss": 0.1538,
941
+ "step": 3325
942
+ },
943
+ {
944
+ "epoch": 2.68,
945
+ "grad_norm": 0.1596725881099701,
946
+ "learning_rate": 2.794973590392219e-05,
947
+ "loss": 0.156,
948
+ "step": 3350
949
+ },
950
+ {
951
+ "epoch": 2.7,
952
+ "grad_norm": 0.14169612526893616,
953
+ "learning_rate": 2.460134944521547e-05,
954
+ "loss": 0.1512,
955
+ "step": 3375
956
+ },
957
+ {
958
+ "epoch": 2.7199999999999998,
959
+ "grad_norm": 0.16790783405303955,
960
+ "learning_rate": 2.1461488546714426e-05,
961
+ "loss": 0.1608,
962
+ "step": 3400
963
+ },
964
+ {
965
+ "epoch": 2.74,
966
+ "grad_norm": 0.12738870084285736,
967
+ "learning_rate": 1.853153045518252e-05,
968
+ "loss": 0.153,
969
+ "step": 3425
970
+ },
971
+ {
972
+ "epoch": 2.76,
973
+ "grad_norm": 0.13329529762268066,
974
+ "learning_rate": 1.581276034707463e-05,
975
+ "loss": 0.1439,
976
+ "step": 3450
977
+ },
978
+ {
979
+ "epoch": 2.7800000000000002,
980
+ "grad_norm": 0.15731698274612427,
981
+ "learning_rate": 1.3306370764816389e-05,
982
+ "loss": 0.1485,
983
+ "step": 3475
984
+ },
985
+ {
986
+ "epoch": 2.8,
987
+ "grad_norm": 0.14650867879390717,
988
+ "learning_rate": 1.1013461093715594e-05,
989
+ "loss": 0.1493,
990
+ "step": 3500
991
+ },
992
+ {
993
+ "epoch": 2.82,
994
+ "grad_norm": 0.1446894407272339,
995
+ "learning_rate": 8.935037079735309e-06,
996
+ "loss": 0.1519,
997
+ "step": 3525
998
+ },
999
+ {
1000
+ "epoch": 2.84,
1001
+ "grad_norm": 0.15621596574783325,
1002
+ "learning_rate": 7.072010388340655e-06,
1003
+ "loss": 0.147,
1004
+ "step": 3550
1005
+ },
1006
+ {
1007
+ "epoch": 2.86,
1008
+ "grad_norm": 0.14237834513187408,
1009
+ "learning_rate": 5.425198204612069e-06,
1010
+ "loss": 0.1456,
1011
+ "step": 3575
1012
+ },
1013
+ {
1014
+ "epoch": 2.88,
1015
+ "grad_norm": 0.15466976165771484,
1016
+ "learning_rate": 3.995322874800922e-06,
1017
+ "loss": 0.1537,
1018
+ "step": 3600
1019
+ },
1020
+ {
1021
+ "epoch": 2.9,
1022
+ "grad_norm": 0.14236176013946533,
1023
+ "learning_rate": 2.7830115894847407e-06,
1024
+ "loss": 0.1528,
1025
+ "step": 3625
1026
+ },
1027
+ {
1028
+ "epoch": 2.92,
1029
+ "grad_norm": 0.16872696578502655,
1030
+ "learning_rate": 1.7887961084605553e-06,
1031
+ "loss": 0.1509,
1032
+ "step": 3650
1033
+ },
1034
+ {
1035
+ "epoch": 2.94,
1036
+ "grad_norm": 0.14598286151885986,
1037
+ "learning_rate": 1.013112527497473e-06,
1038
+ "loss": 0.1466,
1039
+ "step": 3675
1040
+ },
1041
+ {
1042
+ "epoch": 2.96,
1043
+ "grad_norm": 0.1556728333234787,
1044
+ "learning_rate": 4.563010870506368e-07,
1045
+ "loss": 0.1482,
1046
+ "step": 3700
1047
+ },
1048
+ {
1049
+ "epoch": 2.98,
1050
+ "grad_norm": 0.13495859503746033,
1051
+ "learning_rate": 1.1860602302066203e-07,
1052
+ "loss": 0.1472,
1053
+ "step": 3725
1054
+ },
1055
+ {
1056
+ "epoch": 3.0,
1057
+ "grad_norm": 0.1348726600408554,
1058
+ "learning_rate": 1.7545962355258739e-10,
1059
+ "loss": 0.1518,
1060
+ "step": 3750
1061
+ },
1062
+ {
1063
+ "epoch": 3.0,
1064
+ "step": 3750,
1065
+ "total_flos": 2.43882352705536e+18,
1066
+ "train_loss": 0.20868528219858806,
1067
+ "train_runtime": 3281.7804,
1068
+ "train_samples_per_second": 36.566,
1069
+ "train_steps_per_second": 1.143
1070
+ }
1071
+ ],
1072
+ "logging_steps": 25,
1073
+ "max_steps": 3750,
1074
+ "num_input_tokens_seen": 0,
1075
+ "num_train_epochs": 3,
1076
+ "save_steps": 0,
1077
+ "stateful_callbacks": {
1078
+ "TrainerControl": {
1079
+ "args": {
1080
+ "should_epoch_stop": false,
1081
+ "should_evaluate": false,
1082
+ "should_log": false,
1083
+ "should_save": false,
1084
+ "should_training_stop": false
1085
+ },
1086
+ "attributes": {}
1087
+ }
1088
+ },
1089
+ "total_flos": 2.43882352705536e+18,
1090
+ "train_batch_size": 32,
1091
+ "trial_name": null,
1092
+ "trial_params": null
1093
+ }
nl_tasks/exprep/run_ex24_3ep/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<unk>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
nl_tasks/exprep/run_ex24_3ep/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 512,
37
+ "pad_token": "<unk>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
nl_tasks/exprep/run_ex24_3ep/output.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+
2
+ MATH math MAX TOKEN = 1408, length==== 5000, math acc %====, 8.32
3
+
4
+ gsm8k MAX TOKEN = 1024, length==== 1319, gsm8k acc %====, 50.49279757391963
nl_tasks/exprep/run_ex24_3ep/trainer_state.json ADDED
@@ -0,0 +1,1093 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 100,
7
+ "global_step": 3750,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.02,
14
+ "grad_norm": 0.21860866248607635,
15
+ "learning_rate": 0.0009998989386555814,
16
+ "loss": 0.435,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 0.04,
21
+ "grad_norm": 0.1902492791414261,
22
+ "learning_rate": 0.0009995787805744778,
23
+ "loss": 0.3384,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 0.06,
28
+ "grad_norm": 0.2332111895084381,
29
+ "learning_rate": 0.000999039490728981,
30
+ "loss": 0.332,
31
+ "step": 75
32
+ },
33
+ {
34
+ "epoch": 0.08,
35
+ "grad_norm": 0.21917028725147247,
36
+ "learning_rate": 0.000998281305669441,
37
+ "loss": 0.3147,
38
+ "step": 100
39
+ },
40
+ {
41
+ "epoch": 0.1,
42
+ "grad_norm": 0.2263868749141693,
43
+ "learning_rate": 0.0009973045579608833,
44
+ "loss": 0.3063,
45
+ "step": 125
46
+ },
47
+ {
48
+ "epoch": 0.12,
49
+ "grad_norm": 0.21854978799819946,
50
+ "learning_rate": 0.0009961096760371347,
51
+ "loss": 0.3011,
52
+ "step": 150
53
+ },
54
+ {
55
+ "epoch": 0.14,
56
+ "grad_norm": 0.21285760402679443,
57
+ "learning_rate": 0.0009946971840128981,
58
+ "loss": 0.3008,
59
+ "step": 175
60
+ },
61
+ {
62
+ "epoch": 0.16,
63
+ "grad_norm": 0.22584690153598785,
64
+ "learning_rate": 0.0009930677014538588,
65
+ "loss": 0.3011,
66
+ "step": 200
67
+ },
68
+ {
69
+ "epoch": 0.18,
70
+ "grad_norm": 0.2559281587600708,
71
+ "learning_rate": 0.0009912219431049217,
72
+ "loss": 0.2959,
73
+ "step": 225
74
+ },
75
+ {
76
+ "epoch": 0.2,
77
+ "grad_norm": 0.2262907326221466,
78
+ "learning_rate": 0.0009891607185767018,
79
+ "loss": 0.292,
80
+ "step": 250
81
+ },
82
+ {
83
+ "epoch": 0.22,
84
+ "grad_norm": 0.22938783466815948,
85
+ "learning_rate": 0.0009868849319904012,
86
+ "loss": 0.2913,
87
+ "step": 275
88
+ },
89
+ {
90
+ "epoch": 0.24,
91
+ "grad_norm": 0.22400376200675964,
92
+ "learning_rate": 0.000984395581581232,
93
+ "loss": 0.3003,
94
+ "step": 300
95
+ },
96
+ {
97
+ "epoch": 0.26,
98
+ "grad_norm": 0.20974433422088623,
99
+ "learning_rate": 0.000981693759260558,
100
+ "loss": 0.2925,
101
+ "step": 325
102
+ },
103
+ {
104
+ "epoch": 0.28,
105
+ "grad_norm": 0.26473161578178406,
106
+ "learning_rate": 0.0009787806501369446,
107
+ "loss": 0.2937,
108
+ "step": 350
109
+ },
110
+ {
111
+ "epoch": 0.3,
112
+ "grad_norm": 0.2237643599510193,
113
+ "learning_rate": 0.0009756575319963324,
114
+ "loss": 0.2845,
115
+ "step": 375
116
+ },
117
+ {
118
+ "epoch": 0.32,
119
+ "grad_norm": 0.23942594230175018,
120
+ "learning_rate": 0.0009723257747415584,
121
+ "loss": 0.2846,
122
+ "step": 400
123
+ },
124
+ {
125
+ "epoch": 0.34,
126
+ "grad_norm": 0.18676090240478516,
127
+ "learning_rate": 0.00096878683979147,
128
+ "loss": 0.2743,
129
+ "step": 425
130
+ },
131
+ {
132
+ "epoch": 0.36,
133
+ "grad_norm": 0.2044542133808136,
134
+ "learning_rate": 0.000965042279439899,
135
+ "loss": 0.2765,
136
+ "step": 450
137
+ },
138
+ {
139
+ "epoch": 0.38,
140
+ "grad_norm": 0.22793416678905487,
141
+ "learning_rate": 0.0009610937361747747,
142
+ "loss": 0.2751,
143
+ "step": 475
144
+ },
145
+ {
146
+ "epoch": 0.4,
147
+ "grad_norm": 0.22194074094295502,
148
+ "learning_rate": 0.0009569429419576737,
149
+ "loss": 0.2721,
150
+ "step": 500
151
+ },
152
+ {
153
+ "epoch": 0.42,
154
+ "grad_norm": 0.19625091552734375,
155
+ "learning_rate": 0.0009525917174641245,
156
+ "loss": 0.274,
157
+ "step": 525
158
+ },
159
+ {
160
+ "epoch": 0.44,
161
+ "grad_norm": 0.21780937910079956,
162
+ "learning_rate": 0.0009480419712849994,
163
+ "loss": 0.2618,
164
+ "step": 550
165
+ },
166
+ {
167
+ "epoch": 0.46,
168
+ "grad_norm": 0.20032677054405212,
169
+ "learning_rate": 0.0009432956990893433,
170
+ "loss": 0.2781,
171
+ "step": 575
172
+ },
173
+ {
174
+ "epoch": 0.48,
175
+ "grad_norm": 0.1942606121301651,
176
+ "learning_rate": 0.0009383549827490066,
177
+ "loss": 0.2633,
178
+ "step": 600
179
+ },
180
+ {
181
+ "epoch": 0.5,
182
+ "grad_norm": 0.20562711358070374,
183
+ "learning_rate": 0.0009332219894254686,
184
+ "loss": 0.2775,
185
+ "step": 625
186
+ },
187
+ {
188
+ "epoch": 0.52,
189
+ "grad_norm": 0.9733890295028687,
190
+ "learning_rate": 0.0009278989706192479,
191
+ "loss": 0.2679,
192
+ "step": 650
193
+ },
194
+ {
195
+ "epoch": 0.54,
196
+ "grad_norm": 0.17540724575519562,
197
+ "learning_rate": 0.0009223882611823205,
198
+ "loss": 0.2611,
199
+ "step": 675
200
+ },
201
+ {
202
+ "epoch": 0.56,
203
+ "grad_norm": 0.24498209357261658,
204
+ "learning_rate": 0.0009166922782939757,
205
+ "loss": 0.2624,
206
+ "step": 700
207
+ },
208
+ {
209
+ "epoch": 0.58,
210
+ "grad_norm": 0.20900648832321167,
211
+ "learning_rate": 0.0009108135204005628,
212
+ "loss": 0.2661,
213
+ "step": 725
214
+ },
215
+ {
216
+ "epoch": 0.6,
217
+ "grad_norm": 0.2016984522342682,
218
+ "learning_rate": 0.0009047545661195884,
219
+ "loss": 0.2622,
220
+ "step": 750
221
+ },
222
+ {
223
+ "epoch": 0.62,
224
+ "grad_norm": 0.19367337226867676,
225
+ "learning_rate": 0.0008985180731086505,
226
+ "loss": 0.2505,
227
+ "step": 775
228
+ },
229
+ {
230
+ "epoch": 0.64,
231
+ "grad_norm": 0.1933523416519165,
232
+ "learning_rate": 0.0008921067768997017,
233
+ "loss": 0.255,
234
+ "step": 800
235
+ },
236
+ {
237
+ "epoch": 0.66,
238
+ "grad_norm": 0.2022523581981659,
239
+ "learning_rate": 0.0008855234896991544,
240
+ "loss": 0.2486,
241
+ "step": 825
242
+ },
243
+ {
244
+ "epoch": 0.68,
245
+ "grad_norm": 0.18164286017417908,
246
+ "learning_rate": 0.0008787710991543547,
247
+ "loss": 0.2491,
248
+ "step": 850
249
+ },
250
+ {
251
+ "epoch": 0.7,
252
+ "grad_norm": 0.15638867020606995,
253
+ "learning_rate": 0.0008718525670869639,
254
+ "loss": 0.2564,
255
+ "step": 875
256
+ },
257
+ {
258
+ "epoch": 0.72,
259
+ "grad_norm": 0.1735961139202118,
260
+ "learning_rate": 0.0008647709281938065,
261
+ "loss": 0.2522,
262
+ "step": 900
263
+ },
264
+ {
265
+ "epoch": 0.74,
266
+ "grad_norm": 0.18064841628074646,
267
+ "learning_rate": 0.0008575292887157515,
268
+ "loss": 0.2489,
269
+ "step": 925
270
+ },
271
+ {
272
+ "epoch": 0.76,
273
+ "grad_norm": 0.1719369888305664,
274
+ "learning_rate": 0.0008501308250752123,
275
+ "loss": 0.2469,
276
+ "step": 950
277
+ },
278
+ {
279
+ "epoch": 0.78,
280
+ "grad_norm": 0.16980446875095367,
281
+ "learning_rate": 0.0008425787824828631,
282
+ "loss": 0.2428,
283
+ "step": 975
284
+ },
285
+ {
286
+ "epoch": 0.8,
287
+ "grad_norm": 0.2074533998966217,
288
+ "learning_rate": 0.0008348764735141823,
289
+ "loss": 0.2486,
290
+ "step": 1000
291
+ },
292
+ {
293
+ "epoch": 0.82,
294
+ "grad_norm": 0.19303739070892334,
295
+ "learning_rate": 0.0008270272766564472,
296
+ "loss": 0.2474,
297
+ "step": 1025
298
+ },
299
+ {
300
+ "epoch": 0.84,
301
+ "grad_norm": 0.16771027445793152,
302
+ "learning_rate": 0.000819034634826818,
303
+ "loss": 0.2394,
304
+ "step": 1050
305
+ },
306
+ {
307
+ "epoch": 0.86,
308
+ "grad_norm": 0.18652425706386566,
309
+ "learning_rate": 0.0008109020538621606,
310
+ "loss": 0.2349,
311
+ "step": 1075
312
+ },
313
+ {
314
+ "epoch": 0.88,
315
+ "grad_norm": 0.18362785875797272,
316
+ "learning_rate": 0.0008026331009812703,
317
+ "loss": 0.2368,
318
+ "step": 1100
319
+ },
320
+ {
321
+ "epoch": 0.9,
322
+ "grad_norm": 0.17980653047561646,
323
+ "learning_rate": 0.0007942314032201719,
324
+ "loss": 0.2438,
325
+ "step": 1125
326
+ },
327
+ {
328
+ "epoch": 0.92,
329
+ "grad_norm": 0.15695154666900635,
330
+ "learning_rate": 0.0007857006458411826,
331
+ "loss": 0.2379,
332
+ "step": 1150
333
+ },
334
+ {
335
+ "epoch": 0.94,
336
+ "grad_norm": 0.161969855427742,
337
+ "learning_rate": 0.0007770445707164325,
338
+ "loss": 0.2408,
339
+ "step": 1175
340
+ },
341
+ {
342
+ "epoch": 0.96,
343
+ "grad_norm": 0.1580193191766739,
344
+ "learning_rate": 0.0007682669746865577,
345
+ "loss": 0.2405,
346
+ "step": 1200
347
+ },
348
+ {
349
+ "epoch": 0.98,
350
+ "grad_norm": 0.16899335384368896,
351
+ "learning_rate": 0.0007593717078952787,
352
+ "loss": 0.243,
353
+ "step": 1225
354
+ },
355
+ {
356
+ "epoch": 1.0,
357
+ "grad_norm": 0.17789579927921295,
358
+ "learning_rate": 0.0007503626721006019,
359
+ "loss": 0.2314,
360
+ "step": 1250
361
+ },
362
+ {
363
+ "epoch": 1.02,
364
+ "grad_norm": 0.17090484499931335,
365
+ "learning_rate": 0.0007412438189633781,
366
+ "loss": 0.2001,
367
+ "step": 1275
368
+ },
369
+ {
370
+ "epoch": 1.04,
371
+ "grad_norm": 0.13509897887706757,
372
+ "learning_rate": 0.0007320191483139742,
373
+ "loss": 0.2074,
374
+ "step": 1300
375
+ },
376
+ {
377
+ "epoch": 1.06,
378
+ "grad_norm": 0.18126724660396576,
379
+ "learning_rate": 0.0007226927063978153,
380
+ "loss": 0.2038,
381
+ "step": 1325
382
+ },
383
+ {
384
+ "epoch": 1.08,
385
+ "grad_norm": 0.2104470133781433,
386
+ "learning_rate": 0.0007132685841005674,
387
+ "loss": 0.21,
388
+ "step": 1350
389
+ },
390
+ {
391
+ "epoch": 1.1,
392
+ "grad_norm": 0.1306389570236206,
393
+ "learning_rate": 0.0007037509151537404,
394
+ "loss": 0.2128,
395
+ "step": 1375
396
+ },
397
+ {
398
+ "epoch": 1.12,
399
+ "grad_norm": 0.1495552808046341,
400
+ "learning_rate": 0.0006941438743214963,
401
+ "loss": 0.2115,
402
+ "step": 1400
403
+ },
404
+ {
405
+ "epoch": 1.1400000000000001,
406
+ "grad_norm": 0.16927556693553925,
407
+ "learning_rate": 0.0006844516755694598,
408
+ "loss": 0.2032,
409
+ "step": 1425
410
+ },
411
+ {
412
+ "epoch": 1.16,
413
+ "grad_norm": 0.16095060110092163,
414
+ "learning_rate": 0.0006746785702163335,
415
+ "loss": 0.2111,
416
+ "step": 1450
417
+ },
418
+ {
419
+ "epoch": 1.18,
420
+ "grad_norm": 0.1520528346300125,
421
+ "learning_rate": 0.0006648288450691298,
422
+ "loss": 0.2077,
423
+ "step": 1475
424
+ },
425
+ {
426
+ "epoch": 1.2,
427
+ "grad_norm": 0.17617478966712952,
428
+ "learning_rate": 0.0006549068205428343,
429
+ "loss": 0.2088,
430
+ "step": 1500
431
+ },
432
+ {
433
+ "epoch": 1.22,
434
+ "grad_norm": 0.17085497081279755,
435
+ "learning_rate": 0.0006449168487653305,
436
+ "loss": 0.2125,
437
+ "step": 1525
438
+ },
439
+ {
440
+ "epoch": 1.24,
441
+ "grad_norm": 0.14262431859970093,
442
+ "learning_rate": 0.0006348633116684117,
443
+ "loss": 0.2053,
444
+ "step": 1550
445
+ },
446
+ {
447
+ "epoch": 1.26,
448
+ "grad_norm": 0.15984103083610535,
449
+ "learning_rate": 0.0006247506190657209,
450
+ "loss": 0.2039,
451
+ "step": 1575
452
+ },
453
+ {
454
+ "epoch": 1.28,
455
+ "grad_norm": 0.1432267129421234,
456
+ "learning_rate": 0.0006145832067184614,
457
+ "loss": 0.1994,
458
+ "step": 1600
459
+ },
460
+ {
461
+ "epoch": 1.3,
462
+ "grad_norm": 0.1414625495672226,
463
+ "learning_rate": 0.0006043655343897249,
464
+ "loss": 0.1997,
465
+ "step": 1625
466
+ },
467
+ {
468
+ "epoch": 1.32,
469
+ "grad_norm": 0.1499958485364914,
470
+ "learning_rate": 0.0005941020838882917,
471
+ "loss": 0.195,
472
+ "step": 1650
473
+ },
474
+ {
475
+ "epoch": 1.34,
476
+ "grad_norm": 0.12970398366451263,
477
+ "learning_rate": 0.000583797357102762,
478
+ "loss": 0.1992,
479
+ "step": 1675
480
+ },
481
+ {
482
+ "epoch": 1.3599999999999999,
483
+ "grad_norm": 0.13817213475704193,
484
+ "learning_rate": 0.0005734558740268789,
485
+ "loss": 0.1974,
486
+ "step": 1700
487
+ },
488
+ {
489
+ "epoch": 1.38,
490
+ "grad_norm": 0.15048491954803467,
491
+ "learning_rate": 0.000563082170776908,
492
+ "loss": 0.1986,
493
+ "step": 1725
494
+ },
495
+ {
496
+ "epoch": 1.4,
497
+ "grad_norm": 0.1596592664718628,
498
+ "learning_rate": 0.0005526807976019493,
499
+ "loss": 0.1989,
500
+ "step": 1750
501
+ },
502
+ {
503
+ "epoch": 1.42,
504
+ "grad_norm": 0.1446324735879898,
505
+ "learning_rate": 0.0005422563168880455,
506
+ "loss": 0.2013,
507
+ "step": 1775
508
+ },
509
+ {
510
+ "epoch": 1.44,
511
+ "grad_norm": 0.15277068316936493,
512
+ "learning_rate": 0.0005318133011569704,
513
+ "loss": 0.1894,
514
+ "step": 1800
515
+ },
516
+ {
517
+ "epoch": 1.46,
518
+ "grad_norm": 0.13572098314762115,
519
+ "learning_rate": 0.0005213563310605686,
520
+ "loss": 0.1955,
521
+ "step": 1825
522
+ },
523
+ {
524
+ "epoch": 1.48,
525
+ "grad_norm": 0.174700066447258,
526
+ "learning_rate": 0.00051088999337153,
527
+ "loss": 0.199,
528
+ "step": 1850
529
+ },
530
+ {
531
+ "epoch": 1.5,
532
+ "grad_norm": 0.14586439728736877,
533
+ "learning_rate": 0.0005004188789714811,
534
+ "loss": 0.1993,
535
+ "step": 1875
536
+ },
537
+ {
538
+ "epoch": 1.52,
539
+ "grad_norm": 0.15438227355480194,
540
+ "learning_rate": 0.0004899475808372714,
541
+ "loss": 0.1963,
542
+ "step": 1900
543
+ },
544
+ {
545
+ "epoch": 1.54,
546
+ "grad_norm": 0.1574651598930359,
547
+ "learning_rate": 0.0004794806920263417,
548
+ "loss": 0.1961,
549
+ "step": 1925
550
+ },
551
+ {
552
+ "epoch": 1.56,
553
+ "grad_norm": 0.13583463430404663,
554
+ "learning_rate": 0.0004690228036620589,
555
+ "loss": 0.1865,
556
+ "step": 1950
557
+ },
558
+ {
559
+ "epoch": 1.58,
560
+ "grad_norm": 0.16844166815280914,
561
+ "learning_rate": 0.0004585785029198959,
562
+ "loss": 0.1989,
563
+ "step": 1975
564
+ },
565
+ {
566
+ "epoch": 1.6,
567
+ "grad_norm": 0.1481410562992096,
568
+ "learning_rate": 0.00044815237101534535,
569
+ "loss": 0.198,
570
+ "step": 2000
571
+ },
572
+ {
573
+ "epoch": 1.62,
574
+ "grad_norm": 0.13595865666866302,
575
+ "learning_rate": 0.0004377489811944478,
576
+ "loss": 0.1852,
577
+ "step": 2025
578
+ },
579
+ {
580
+ "epoch": 1.6400000000000001,
581
+ "grad_norm": 0.13887101411819458,
582
+ "learning_rate": 0.00042737289672781367,
583
+ "loss": 0.1941,
584
+ "step": 2050
585
+ },
586
+ {
587
+ "epoch": 1.6600000000000001,
588
+ "grad_norm": 0.1371852606534958,
589
+ "learning_rate": 0.0004170286689090228,
590
+ "loss": 0.1969,
591
+ "step": 2075
592
+ },
593
+ {
594
+ "epoch": 1.6800000000000002,
595
+ "grad_norm": 0.1503254920244217,
596
+ "learning_rate": 0.0004067208350582768,
597
+ "loss": 0.1948,
598
+ "step": 2100
599
+ },
600
+ {
601
+ "epoch": 1.7,
602
+ "grad_norm": 0.14083200693130493,
603
+ "learning_rate": 0.0003964539165321794,
604
+ "loss": 0.1926,
605
+ "step": 2125
606
+ },
607
+ {
608
+ "epoch": 1.72,
609
+ "grad_norm": 0.13528388738632202,
610
+ "learning_rate": 0.00038623241674052113,
611
+ "loss": 0.1857,
612
+ "step": 2150
613
+ },
614
+ {
615
+ "epoch": 1.74,
616
+ "grad_norm": 0.13011986017227173,
617
+ "learning_rate": 0.00037606081917093416,
618
+ "loss": 0.2014,
619
+ "step": 2175
620
+ },
621
+ {
622
+ "epoch": 1.76,
623
+ "grad_norm": 0.13956771790981293,
624
+ "learning_rate": 0.0003659435854222869,
625
+ "loss": 0.1927,
626
+ "step": 2200
627
+ },
628
+ {
629
+ "epoch": 1.78,
630
+ "grad_norm": 0.1562594622373581,
631
+ "learning_rate": 0.0003558851532476796,
632
+ "loss": 0.1856,
633
+ "step": 2225
634
+ },
635
+ {
636
+ "epoch": 1.8,
637
+ "grad_norm": 0.13830530643463135,
638
+ "learning_rate": 0.0003458899346078979,
639
+ "loss": 0.1955,
640
+ "step": 2250
641
+ },
642
+ {
643
+ "epoch": 1.8199999999999998,
644
+ "grad_norm": 0.1343943476676941,
645
+ "learning_rate": 0.00033596231373618247,
646
+ "loss": 0.1864,
647
+ "step": 2275
648
+ },
649
+ {
650
+ "epoch": 1.8399999999999999,
651
+ "grad_norm": 0.15038716793060303,
652
+ "learning_rate": 0.0003261066452151587,
653
+ "loss": 0.1902,
654
+ "step": 2300
655
+ },
656
+ {
657
+ "epoch": 1.8599999999999999,
658
+ "grad_norm": 0.1379692554473877,
659
+ "learning_rate": 0.0003163272520667726,
660
+ "loss": 0.1935,
661
+ "step": 2325
662
+ },
663
+ {
664
+ "epoch": 1.88,
665
+ "grad_norm": 0.1431427299976349,
666
+ "learning_rate": 0.00030662842385607126,
667
+ "loss": 0.1844,
668
+ "step": 2350
669
+ },
670
+ {
671
+ "epoch": 1.9,
672
+ "grad_norm": 0.16267234086990356,
673
+ "learning_rate": 0.0002970144148096568,
674
+ "loss": 0.2061,
675
+ "step": 2375
676
+ },
677
+ {
678
+ "epoch": 1.92,
679
+ "grad_norm": 0.14323991537094116,
680
+ "learning_rate": 0.0002874894419496431,
681
+ "loss": 0.192,
682
+ "step": 2400
683
+ },
684
+ {
685
+ "epoch": 1.94,
686
+ "grad_norm": 0.1614535003900528,
687
+ "learning_rate": 0.00027805768324393014,
688
+ "loss": 0.186,
689
+ "step": 2425
690
+ },
691
+ {
692
+ "epoch": 1.96,
693
+ "grad_norm": 0.1243644580245018,
694
+ "learning_rate": 0.0002687232757736082,
695
+ "loss": 0.1872,
696
+ "step": 2450
697
+ },
698
+ {
699
+ "epoch": 1.98,
700
+ "grad_norm": 0.13049174845218658,
701
+ "learning_rate": 0.0002594903139182996,
702
+ "loss": 0.1922,
703
+ "step": 2475
704
+ },
705
+ {
706
+ "epoch": 2.0,
707
+ "grad_norm": 0.11963597685098648,
708
+ "learning_rate": 0.0002503628475602256,
709
+ "loss": 0.1877,
710
+ "step": 2500
711
+ },
712
+ {
713
+ "epoch": 2.02,
714
+ "grad_norm": 0.13807468116283417,
715
+ "learning_rate": 0.00024134488030779655,
716
+ "loss": 0.1585,
717
+ "step": 2525
718
+ },
719
+ {
720
+ "epoch": 2.04,
721
+ "grad_norm": 0.15679411590099335,
722
+ "learning_rate": 0.00023244036773949656,
723
+ "loss": 0.152,
724
+ "step": 2550
725
+ },
726
+ {
727
+ "epoch": 2.06,
728
+ "grad_norm": 0.13696733117103577,
729
+ "learning_rate": 0.00022365321566883433,
730
+ "loss": 0.1586,
731
+ "step": 2575
732
+ },
733
+ {
734
+ "epoch": 2.08,
735
+ "grad_norm": 0.16110782325267792,
736
+ "learning_rate": 0.0002149872784311262,
737
+ "loss": 0.1501,
738
+ "step": 2600
739
+ },
740
+ {
741
+ "epoch": 2.1,
742
+ "grad_norm": 0.1372232884168625,
743
+ "learning_rate": 0.00020644635719285705,
744
+ "loss": 0.1568,
745
+ "step": 2625
746
+ },
747
+ {
748
+ "epoch": 2.12,
749
+ "grad_norm": 0.13187378644943237,
750
+ "learning_rate": 0.0001980341982843616,
751
+ "loss": 0.1521,
752
+ "step": 2650
753
+ },
754
+ {
755
+ "epoch": 2.14,
756
+ "grad_norm": 0.13496607542037964,
757
+ "learning_rate": 0.0001897544915565616,
758
+ "loss": 0.1579,
759
+ "step": 2675
760
+ },
761
+ {
762
+ "epoch": 2.16,
763
+ "grad_norm": 0.15261121094226837,
764
+ "learning_rate": 0.0001816108687624749,
765
+ "loss": 0.155,
766
+ "step": 2700
767
+ },
768
+ {
769
+ "epoch": 2.18,
770
+ "grad_norm": 0.14363905787467957,
771
+ "learning_rate": 0.00017360690196420813,
772
+ "loss": 0.1594,
773
+ "step": 2725
774
+ },
775
+ {
776
+ "epoch": 2.2,
777
+ "grad_norm": 0.14778032898902893,
778
+ "learning_rate": 0.0001657461019661326,
779
+ "loss": 0.1563,
780
+ "step": 2750
781
+ },
782
+ {
783
+ "epoch": 2.22,
784
+ "grad_norm": 0.1397523283958435,
785
+ "learning_rate": 0.0001580319167749294,
786
+ "loss": 0.151,
787
+ "step": 2775
788
+ },
789
+ {
790
+ "epoch": 2.24,
791
+ "grad_norm": 0.13365571200847626,
792
+ "learning_rate": 0.00015046773008717967,
793
+ "loss": 0.1537,
794
+ "step": 2800
795
+ },
796
+ {
797
+ "epoch": 2.26,
798
+ "grad_norm": 0.1419825255870819,
799
+ "learning_rate": 0.00014305685980516293,
800
+ "loss": 0.1591,
801
+ "step": 2825
802
+ },
803
+ {
804
+ "epoch": 2.2800000000000002,
805
+ "grad_norm": 0.14078690111637115,
806
+ "learning_rate": 0.00013580255658151685,
807
+ "loss": 0.149,
808
+ "step": 2850
809
+ },
810
+ {
811
+ "epoch": 2.3,
812
+ "grad_norm": 0.1345941126346588,
813
+ "learning_rate": 0.00012870800239339237,
814
+ "loss": 0.1539,
815
+ "step": 2875
816
+ },
817
+ {
818
+ "epoch": 2.32,
819
+ "grad_norm": 0.12599675357341766,
820
+ "learning_rate": 0.00012177630914673327,
821
+ "loss": 0.16,
822
+ "step": 2900
823
+ },
824
+ {
825
+ "epoch": 2.34,
826
+ "grad_norm": 0.12124225497245789,
827
+ "learning_rate": 0.00011501051731129224,
828
+ "loss": 0.1497,
829
+ "step": 2925
830
+ },
831
+ {
832
+ "epoch": 2.36,
833
+ "grad_norm": 0.14786431193351746,
834
+ "learning_rate": 0.00010841359458697985,
835
+ "loss": 0.1562,
836
+ "step": 2950
837
+ },
838
+ {
839
+ "epoch": 2.38,
840
+ "grad_norm": 0.14894621074199677,
841
+ "learning_rate": 0.00010198843460213336,
842
+ "loss": 0.1505,
843
+ "step": 2975
844
+ },
845
+ {
846
+ "epoch": 2.4,
847
+ "grad_norm": 0.13606901466846466,
848
+ "learning_rate": 9.573785564427562e-05,
849
+ "loss": 0.1518,
850
+ "step": 3000
851
+ },
852
+ {
853
+ "epoch": 2.42,
854
+ "grad_norm": 0.1460190862417221,
855
+ "learning_rate": 8.966459942392108e-05,
856
+ "loss": 0.1519,
857
+ "step": 3025
858
+ },
859
+ {
860
+ "epoch": 2.44,
861
+ "grad_norm": 0.14195576310157776,
862
+ "learning_rate": 8.3771329871971e-05,
863
+ "loss": 0.1552,
864
+ "step": 3050
865
+ },
866
+ {
867
+ "epoch": 2.46,
868
+ "grad_norm": 0.1606581211090088,
869
+ "learning_rate": 7.806063197122521e-05,
870
+ "loss": 0.1552,
871
+ "step": 3075
872
+ },
873
+ {
874
+ "epoch": 2.48,
875
+ "grad_norm": 0.17038539052009583,
876
+ "learning_rate": 7.253501062252338e-05,
877
+ "loss": 0.1506,
878
+ "step": 3100
879
+ },
880
+ {
881
+ "epoch": 2.5,
882
+ "grad_norm": 0.15008100867271423,
883
+ "learning_rate": 6.719688954601267e-05,
884
+ "loss": 0.1569,
885
+ "step": 3125
886
+ },
887
+ {
888
+ "epoch": 2.52,
889
+ "grad_norm": 0.1496666818857193,
890
+ "learning_rate": 6.204861021802333e-05,
891
+ "loss": 0.1497,
892
+ "step": 3150
893
+ },
894
+ {
895
+ "epoch": 2.54,
896
+ "grad_norm": 0.15540142357349396,
897
+ "learning_rate": 5.709243084402127e-05,
898
+ "loss": 0.1535,
899
+ "step": 3175
900
+ },
901
+ {
902
+ "epoch": 2.56,
903
+ "grad_norm": 0.13507093489170074,
904
+ "learning_rate": 5.2330525368083193e-05,
905
+ "loss": 0.1513,
906
+ "step": 3200
907
+ },
908
+ {
909
+ "epoch": 2.58,
910
+ "grad_norm": 0.12770436704158783,
911
+ "learning_rate": 4.776498251933292e-05,
912
+ "loss": 0.1474,
913
+ "step": 3225
914
+ },
915
+ {
916
+ "epoch": 2.6,
917
+ "grad_norm": 0.15767820179462433,
918
+ "learning_rate": 4.3397804895756956e-05,
919
+ "loss": 0.1483,
920
+ "step": 3250
921
+ },
922
+ {
923
+ "epoch": 2.62,
924
+ "grad_norm": 0.15003468096256256,
925
+ "learning_rate": 3.923090808579727e-05,
926
+ "loss": 0.1499,
927
+ "step": 3275
928
+ },
929
+ {
930
+ "epoch": 2.64,
931
+ "grad_norm": 0.13899756968021393,
932
+ "learning_rate": 3.5266119828111955e-05,
933
+ "loss": 0.1494,
934
+ "step": 3300
935
+ },
936
+ {
937
+ "epoch": 2.66,
938
+ "grad_norm": 0.1829822063446045,
939
+ "learning_rate": 3.150517920986851e-05,
940
+ "loss": 0.1536,
941
+ "step": 3325
942
+ },
943
+ {
944
+ "epoch": 2.68,
945
+ "grad_norm": 0.14924921095371246,
946
+ "learning_rate": 2.794973590392219e-05,
947
+ "loss": 0.1551,
948
+ "step": 3350
949
+ },
950
+ {
951
+ "epoch": 2.7,
952
+ "grad_norm": 0.1427408903837204,
953
+ "learning_rate": 2.460134944521547e-05,
954
+ "loss": 0.1513,
955
+ "step": 3375
956
+ },
957
+ {
958
+ "epoch": 2.7199999999999998,
959
+ "grad_norm": 0.1617504507303238,
960
+ "learning_rate": 2.1461488546714426e-05,
961
+ "loss": 0.1604,
962
+ "step": 3400
963
+ },
964
+ {
965
+ "epoch": 2.74,
966
+ "grad_norm": 0.11996972560882568,
967
+ "learning_rate": 1.853153045518252e-05,
968
+ "loss": 0.1532,
969
+ "step": 3425
970
+ },
971
+ {
972
+ "epoch": 2.76,
973
+ "grad_norm": 0.13687242567539215,
974
+ "learning_rate": 1.581276034707463e-05,
975
+ "loss": 0.1435,
976
+ "step": 3450
977
+ },
978
+ {
979
+ "epoch": 2.7800000000000002,
980
+ "grad_norm": 0.15579059720039368,
981
+ "learning_rate": 1.3306370764816389e-05,
982
+ "loss": 0.1479,
983
+ "step": 3475
984
+ },
985
+ {
986
+ "epoch": 2.8,
987
+ "grad_norm": 0.18821687996387482,
988
+ "learning_rate": 1.1013461093715594e-05,
989
+ "loss": 0.1485,
990
+ "step": 3500
991
+ },
992
+ {
993
+ "epoch": 2.82,
994
+ "grad_norm": 0.1499922275543213,
995
+ "learning_rate": 8.935037079735309e-06,
996
+ "loss": 0.1511,
997
+ "step": 3525
998
+ },
999
+ {
1000
+ "epoch": 2.84,
1001
+ "grad_norm": 0.16106846928596497,
1002
+ "learning_rate": 7.072010388340655e-06,
1003
+ "loss": 0.1474,
1004
+ "step": 3550
1005
+ },
1006
+ {
1007
+ "epoch": 2.86,
1008
+ "grad_norm": 0.14959703385829926,
1009
+ "learning_rate": 5.425198204612069e-06,
1010
+ "loss": 0.1459,
1011
+ "step": 3575
1012
+ },
1013
+ {
1014
+ "epoch": 2.88,
1015
+ "grad_norm": 0.1529090404510498,
1016
+ "learning_rate": 3.995322874800922e-06,
1017
+ "loss": 0.1528,
1018
+ "step": 3600
1019
+ },
1020
+ {
1021
+ "epoch": 2.9,
1022
+ "grad_norm": 0.1568026840686798,
1023
+ "learning_rate": 2.7830115894847407e-06,
1024
+ "loss": 0.1522,
1025
+ "step": 3625
1026
+ },
1027
+ {
1028
+ "epoch": 2.92,
1029
+ "grad_norm": 0.17276383936405182,
1030
+ "learning_rate": 1.7887961084605553e-06,
1031
+ "loss": 0.1508,
1032
+ "step": 3650
1033
+ },
1034
+ {
1035
+ "epoch": 2.94,
1036
+ "grad_norm": 0.13572101294994354,
1037
+ "learning_rate": 1.013112527497473e-06,
1038
+ "loss": 0.1464,
1039
+ "step": 3675
1040
+ },
1041
+ {
1042
+ "epoch": 2.96,
1043
+ "grad_norm": 0.1407610923051834,
1044
+ "learning_rate": 4.563010870506368e-07,
1045
+ "loss": 0.1477,
1046
+ "step": 3700
1047
+ },
1048
+ {
1049
+ "epoch": 2.98,
1050
+ "grad_norm": 0.13577981293201447,
1051
+ "learning_rate": 1.1860602302066203e-07,
1052
+ "loss": 0.1466,
1053
+ "step": 3725
1054
+ },
1055
+ {
1056
+ "epoch": 3.0,
1057
+ "grad_norm": 0.13064716756343842,
1058
+ "learning_rate": 1.7545962355258739e-10,
1059
+ "loss": 0.1512,
1060
+ "step": 3750
1061
+ },
1062
+ {
1063
+ "epoch": 3.0,
1064
+ "step": 3750,
1065
+ "total_flos": 2.43882352705536e+18,
1066
+ "train_loss": 0.20749481468200684,
1067
+ "train_runtime": 3284.9286,
1068
+ "train_samples_per_second": 36.53,
1069
+ "train_steps_per_second": 1.142
1070
+ }
1071
+ ],
1072
+ "logging_steps": 25,
1073
+ "max_steps": 3750,
1074
+ "num_input_tokens_seen": 0,
1075
+ "num_train_epochs": 3,
1076
+ "save_steps": 0,
1077
+ "stateful_callbacks": {
1078
+ "TrainerControl": {
1079
+ "args": {
1080
+ "should_epoch_stop": false,
1081
+ "should_evaluate": false,
1082
+ "should_log": false,
1083
+ "should_save": false,
1084
+ "should_training_stop": false
1085
+ },
1086
+ "attributes": {}
1087
+ }
1088
+ },
1089
+ "total_flos": 2.43882352705536e+18,
1090
+ "train_batch_size": 32,
1091
+ "trial_name": null,
1092
+ "trial_params": null
1093
+ }
nl_tasks/run_exps/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": false,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 4,
9
+ "peft_type": "ROTATION",
10
+ "r": 4,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "q_proj",
14
+ "v_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/run_exps/ft/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/run_exps/ft/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "</s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "</s>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
nl_tasks/run_exps/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/run_exps/ft/tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "32000": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ }
38
+ },
39
+ "bos_token": "</s>",
40
+ "clean_up_tokenization_spaces": false,
41
+ "eos_token": "</s>",
42
+ "extra_special_tokens": {},
43
+ "legacy": false,
44
+ "model_max_length": 512,
45
+ "pad_token": "[PAD]",
46
+ "padding_side": "right",
47
+ "sp_model_kwargs": {},
48
+ "tokenizer_class": "LlamaTokenizer",
49
+ "unk_token": "</s>",
50
+ "use_default_system_prompt": false
51
+ }
nl_tasks/run_exps/ft/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:368cd4dd820d88773de4a09b9db97770bd1e60d2167a7d4fdef5c6dc4925cdb7
3
+ size 6481
nl_tasks/run_exps/ft/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/run_exps/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": true,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 4,
9
+ "peft_type": "ROTATION",
10
+ "r": 4,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "q_proj",
14
+ "v_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/run_exps/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe7ebf73d215d485e5bca2853a23f022a473de81b4bb99283cc66ce0a57c5665
3
+ size 33602659
nl_tasks/run_exps/trainer_state.json ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.016406890894175553,
6
+ "eval_steps": 10,
7
+ "global_step": 20,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.008203445447087777,
14
+ "grad_norm": 0.23545275628566742,
15
+ "learning_rate": 0.00013420201433256689,
16
+ "loss": 0.6594,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.008203445447087777,
21
+ "eval_loss": 0.48451828956604004,
22
+ "eval_runtime": 19.7025,
23
+ "eval_samples_per_second": 50.755,
24
+ "eval_steps_per_second": 0.812,
25
+ "step": 10
26
+ },
27
+ {
28
+ "epoch": 0.016406890894175553,
29
+ "grad_norm": 0.146519273519516,
30
+ "learning_rate": 1.5192246987791981e-06,
31
+ "loss": 0.4624,
32
+ "step": 20
33
+ },
34
+ {
35
+ "epoch": 0.016406890894175553,
36
+ "eval_loss": 0.44331175088882446,
37
+ "eval_runtime": 19.2624,
38
+ "eval_samples_per_second": 51.915,
39
+ "eval_steps_per_second": 0.831,
40
+ "step": 20
41
+ },
42
+ {
43
+ "epoch": 0.016406890894175553,
44
+ "step": 20,
45
+ "total_flos": 1.30070668640256e+16,
46
+ "train_loss": 0.5609269142150879,
47
+ "train_runtime": 110.7,
48
+ "train_samples_per_second": 5.781,
49
+ "train_steps_per_second": 0.181
50
+ }
51
+ ],
52
+ "logging_steps": 10,
53
+ "max_steps": 20,
54
+ "num_input_tokens_seen": 0,
55
+ "num_train_epochs": 1,
56
+ "save_steps": 500,
57
+ "stateful_callbacks": {
58
+ "TrainerControl": {
59
+ "args": {
60
+ "should_epoch_stop": false,
61
+ "should_evaluate": false,
62
+ "should_log": false,
63
+ "should_save": true,
64
+ "should_training_stop": true
65
+ },
66
+ "attributes": {}
67
+ }
68
+ },
69
+ "total_flos": 1.30070668640256e+16,
70
+ "train_batch_size": 32,
71
+ "trial_name": null,
72
+ "trial_params": null
73
+ }