andrk9 committed on
Commit
190eb1c
·
verified ·
1 Parent(s): 7b7578f

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50) hide show
  1. README.md +357 -0
  2. adapter_config.json +21 -0
  3. adapter_model.bin +3 -0
  4. checkpoint-290/README.md +23 -0
  5. checkpoint-290/adapter_config.json +21 -0
  6. checkpoint-290/adapter_model.safetensors +3 -0
  7. checkpoint-290/optimizer.pt +3 -0
  8. checkpoint-290/rng_state.pth +3 -0
  9. checkpoint-290/scheduler.pt +3 -0
  10. checkpoint-290/trainer_state.json +891 -0
  11. checkpoint-290/training_args.bin +3 -0
  12. checkpoint-320/README.md +23 -0
  13. checkpoint-320/adapter_config.json +21 -0
  14. checkpoint-320/adapter_model.safetensors +3 -0
  15. checkpoint-320/optimizer.pt +3 -0
  16. checkpoint-320/rng_state.pth +3 -0
  17. checkpoint-320/scheduler.pt +3 -0
  18. checkpoint-320/trainer_state.json +981 -0
  19. checkpoint-320/training_args.bin +3 -0
  20. checkpoint-330/README.md +23 -0
  21. checkpoint-330/adapter_config.json +21 -0
  22. checkpoint-330/adapter_model.safetensors +3 -0
  23. checkpoint-330/optimizer.pt +3 -0
  24. checkpoint-330/rng_state.pth +3 -0
  25. checkpoint-330/scheduler.pt +3 -0
  26. checkpoint-330/trainer_state.json +1011 -0
  27. checkpoint-330/training_args.bin +3 -0
  28. checkpoint-340/README.md +23 -0
  29. checkpoint-340/adapter_config.json +21 -0
  30. checkpoint-340/adapter_model.safetensors +3 -0
  31. checkpoint-340/optimizer.pt +3 -0
  32. checkpoint-340/rng_state.pth +3 -0
  33. checkpoint-340/scheduler.pt +3 -0
  34. checkpoint-340/trainer_state.json +1041 -0
  35. checkpoint-340/training_args.bin +3 -0
  36. checkpoint-350/README.md +23 -0
  37. checkpoint-350/adapter_config.json +21 -0
  38. checkpoint-350/adapter_model.safetensors +3 -0
  39. checkpoint-350/optimizer.pt +3 -0
  40. checkpoint-350/rng_state.pth +3 -0
  41. checkpoint-350/scheduler.pt +3 -0
  42. checkpoint-350/trainer_state.json +1071 -0
  43. checkpoint-350/training_args.bin +3 -0
  44. config.json +42 -0
  45. logs/events.out.tfevents.1699989718.node0370.palmetto.clemson.edu.1119956.2 +3 -0
  46. logs/events.out.tfevents.1699989813.node0370.palmetto.clemson.edu.1119956.4 +3 -0
  47. logs/events.out.tfevents.1699994287.node0370.palmetto.clemson.edu.1126403.1 +3 -0
  48. logs/events.out.tfevents.1699995040.node0370.palmetto.clemson.edu.1127435.1 +3 -0
  49. logs/events.out.tfevents.1700064888.node0277.palmetto.clemson.edu.1971495.1 +3 -0
  50. logs/events.out.tfevents.1700066157.node0277.palmetto.clemson.edu.1973537.1 +3 -0
README.md ADDED
@@ -0,0 +1,357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - quant_method: bitsandbytes
9
+ - _load_in_8bit: False
10
+ - _load_in_4bit: False
11
+ - llm_int8_threshold: 6.0
12
+ - llm_int8_skip_modules: None
13
+ - llm_int8_enable_fp32_cpu_offload: False
14
+ - llm_int8_has_fp16_weight: False
15
+ - bnb_4bit_quant_type: fp4
16
+ - bnb_4bit_use_double_quant: False
17
+ - bnb_4bit_compute_dtype: float32
18
+ - load_in_4bit: False
19
+ - load_in_8bit: False
20
+
21
+ The following `bitsandbytes` quantization config was used during training:
22
+ - quant_method: bitsandbytes
23
+ - _load_in_8bit: False
24
+ - _load_in_4bit: False
25
+ - llm_int8_threshold: 6.0
26
+ - llm_int8_skip_modules: None
27
+ - llm_int8_enable_fp32_cpu_offload: False
28
+ - llm_int8_has_fp16_weight: False
29
+ - bnb_4bit_quant_type: fp4
30
+ - bnb_4bit_use_double_quant: False
31
+ - bnb_4bit_compute_dtype: float32
32
+ - load_in_4bit: False
33
+ - load_in_8bit: False
34
+
35
+ The following `bitsandbytes` quantization config was used during training:
36
+ - quant_method: bitsandbytes
37
+ - _load_in_8bit: False
38
+ - _load_in_4bit: False
39
+ - llm_int8_threshold: 6.0
40
+ - llm_int8_skip_modules: None
41
+ - llm_int8_enable_fp32_cpu_offload: False
42
+ - llm_int8_has_fp16_weight: False
43
+ - bnb_4bit_quant_type: fp4
44
+ - bnb_4bit_use_double_quant: False
45
+ - bnb_4bit_compute_dtype: float32
46
+ - load_in_4bit: False
47
+ - load_in_8bit: False
48
+
49
+ The following `bitsandbytes` quantization config was used during training:
50
+ - quant_method: bitsandbytes
51
+ - _load_in_8bit: False
52
+ - _load_in_4bit: False
53
+ - llm_int8_threshold: 6.0
54
+ - llm_int8_skip_modules: None
55
+ - llm_int8_enable_fp32_cpu_offload: False
56
+ - llm_int8_has_fp16_weight: False
57
+ - bnb_4bit_quant_type: fp4
58
+ - bnb_4bit_use_double_quant: False
59
+ - bnb_4bit_compute_dtype: float32
60
+ - load_in_4bit: False
61
+ - load_in_8bit: False
62
+
63
+ The following `bitsandbytes` quantization config was used during training:
64
+ - quant_method: bitsandbytes
65
+ - _load_in_8bit: False
66
+ - _load_in_4bit: False
67
+ - llm_int8_threshold: 6.0
68
+ - llm_int8_skip_modules: None
69
+ - llm_int8_enable_fp32_cpu_offload: False
70
+ - llm_int8_has_fp16_weight: False
71
+ - bnb_4bit_quant_type: fp4
72
+ - bnb_4bit_use_double_quant: False
73
+ - bnb_4bit_compute_dtype: float32
74
+ - load_in_4bit: False
75
+ - load_in_8bit: False
76
+
77
+ The following `bitsandbytes` quantization config was used during training:
78
+ - quant_method: bitsandbytes
79
+ - _load_in_8bit: False
80
+ - _load_in_4bit: False
81
+ - llm_int8_threshold: 6.0
82
+ - llm_int8_skip_modules: None
83
+ - llm_int8_enable_fp32_cpu_offload: False
84
+ - llm_int8_has_fp16_weight: False
85
+ - bnb_4bit_quant_type: fp4
86
+ - bnb_4bit_use_double_quant: False
87
+ - bnb_4bit_compute_dtype: float32
88
+ - load_in_4bit: False
89
+ - load_in_8bit: False
90
+
91
+ The following `bitsandbytes` quantization config was used during training:
92
+ - quant_method: bitsandbytes
93
+ - _load_in_8bit: False
94
+ - _load_in_4bit: False
95
+ - llm_int8_threshold: 6.0
96
+ - llm_int8_skip_modules: None
97
+ - llm_int8_enable_fp32_cpu_offload: False
98
+ - llm_int8_has_fp16_weight: False
99
+ - bnb_4bit_quant_type: fp4
100
+ - bnb_4bit_use_double_quant: False
101
+ - bnb_4bit_compute_dtype: float32
102
+ - load_in_4bit: False
103
+ - load_in_8bit: False
104
+
105
+ The following `bitsandbytes` quantization config was used during training:
106
+ - quant_method: bitsandbytes
107
+ - _load_in_8bit: False
108
+ - _load_in_4bit: False
109
+ - llm_int8_threshold: 6.0
110
+ - llm_int8_skip_modules: None
111
+ - llm_int8_enable_fp32_cpu_offload: False
112
+ - llm_int8_has_fp16_weight: False
113
+ - bnb_4bit_quant_type: fp4
114
+ - bnb_4bit_use_double_quant: False
115
+ - bnb_4bit_compute_dtype: float32
116
+ - load_in_4bit: False
117
+ - load_in_8bit: False
118
+
119
+ The following `bitsandbytes` quantization config was used during training:
120
+ - quant_method: bitsandbytes
121
+ - _load_in_8bit: False
122
+ - _load_in_4bit: False
123
+ - llm_int8_threshold: 6.0
124
+ - llm_int8_skip_modules: None
125
+ - llm_int8_enable_fp32_cpu_offload: False
126
+ - llm_int8_has_fp16_weight: False
127
+ - bnb_4bit_quant_type: fp4
128
+ - bnb_4bit_use_double_quant: False
129
+ - bnb_4bit_compute_dtype: float32
130
+ - load_in_4bit: False
131
+ - load_in_8bit: False
132
+
133
+ The following `bitsandbytes` quantization config was used during training:
134
+ - quant_method: bitsandbytes
135
+ - _load_in_8bit: False
136
+ - _load_in_4bit: False
137
+ - llm_int8_threshold: 6.0
138
+ - llm_int8_skip_modules: None
139
+ - llm_int8_enable_fp32_cpu_offload: False
140
+ - llm_int8_has_fp16_weight: False
141
+ - bnb_4bit_quant_type: fp4
142
+ - bnb_4bit_use_double_quant: False
143
+ - bnb_4bit_compute_dtype: float32
144
+ - load_in_4bit: False
145
+ - load_in_8bit: False
146
+
147
+ The following `bitsandbytes` quantization config was used during training:
148
+ - quant_method: bitsandbytes
149
+ - _load_in_8bit: False
150
+ - _load_in_4bit: False
151
+ - llm_int8_threshold: 6.0
152
+ - llm_int8_skip_modules: None
153
+ - llm_int8_enable_fp32_cpu_offload: False
154
+ - llm_int8_has_fp16_weight: False
155
+ - bnb_4bit_quant_type: fp4
156
+ - bnb_4bit_use_double_quant: False
157
+ - bnb_4bit_compute_dtype: float32
158
+ - load_in_4bit: False
159
+ - load_in_8bit: False
160
+
161
+ The following `bitsandbytes` quantization config was used during training:
162
+ - quant_method: bitsandbytes
163
+ - _load_in_8bit: False
164
+ - _load_in_4bit: False
165
+ - llm_int8_threshold: 6.0
166
+ - llm_int8_skip_modules: None
167
+ - llm_int8_enable_fp32_cpu_offload: False
168
+ - llm_int8_has_fp16_weight: False
169
+ - bnb_4bit_quant_type: fp4
170
+ - bnb_4bit_use_double_quant: False
171
+ - bnb_4bit_compute_dtype: float32
172
+ - load_in_4bit: False
173
+ - load_in_8bit: False
174
+
175
+ The following `bitsandbytes` quantization config was used during training:
176
+ - quant_method: bitsandbytes
177
+ - load_in_8bit: True
178
+ - load_in_4bit: False
179
+ - llm_int8_threshold: 6.0
180
+ - llm_int8_skip_modules: None
181
+ - llm_int8_enable_fp32_cpu_offload: False
182
+ - llm_int8_has_fp16_weight: False
183
+ - bnb_4bit_quant_type: fp4
184
+ - bnb_4bit_use_double_quant: False
185
+ - bnb_4bit_compute_dtype: float32
186
+
187
+ The following `bitsandbytes` quantization config was used during training:
188
+ - quant_method: bitsandbytes
189
+ - load_in_8bit: True
190
+ - load_in_4bit: False
191
+ - llm_int8_threshold: 6.0
192
+ - llm_int8_skip_modules: None
193
+ - llm_int8_enable_fp32_cpu_offload: False
194
+ - llm_int8_has_fp16_weight: False
195
+ - bnb_4bit_quant_type: fp4
196
+ - bnb_4bit_use_double_quant: False
197
+ - bnb_4bit_compute_dtype: float32
198
+
199
+ The following `bitsandbytes` quantization config was used during training:
200
+ - quant_method: bitsandbytes
201
+ - load_in_8bit: True
202
+ - load_in_4bit: False
203
+ - llm_int8_threshold: 6.0
204
+ - llm_int8_skip_modules: None
205
+ - llm_int8_enable_fp32_cpu_offload: False
206
+ - llm_int8_has_fp16_weight: False
207
+ - bnb_4bit_quant_type: fp4
208
+ - bnb_4bit_use_double_quant: False
209
+ - bnb_4bit_compute_dtype: float32
210
+
211
+ The following `bitsandbytes` quantization config was used during training:
212
+ - quant_method: bitsandbytes
213
+ - load_in_8bit: True
214
+ - load_in_4bit: False
215
+ - llm_int8_threshold: 6.0
216
+ - llm_int8_skip_modules: None
217
+ - llm_int8_enable_fp32_cpu_offload: False
218
+ - llm_int8_has_fp16_weight: False
219
+ - bnb_4bit_quant_type: fp4
220
+ - bnb_4bit_use_double_quant: False
221
+ - bnb_4bit_compute_dtype: float32
222
+
223
+ The following `bitsandbytes` quantization config was used during training:
224
+ - quant_method: bitsandbytes
225
+ - load_in_8bit: True
226
+ - load_in_4bit: False
227
+ - llm_int8_threshold: 6.0
228
+ - llm_int8_skip_modules: None
229
+ - llm_int8_enable_fp32_cpu_offload: False
230
+ - llm_int8_has_fp16_weight: False
231
+ - bnb_4bit_quant_type: fp4
232
+ - bnb_4bit_use_double_quant: False
233
+ - bnb_4bit_compute_dtype: float32
234
+
235
+ The following `bitsandbytes` quantization config was used during training:
236
+ - quant_method: bitsandbytes
237
+ - load_in_8bit: True
238
+ - load_in_4bit: False
239
+ - llm_int8_threshold: 6.0
240
+ - llm_int8_skip_modules: None
241
+ - llm_int8_enable_fp32_cpu_offload: False
242
+ - llm_int8_has_fp16_weight: False
243
+ - bnb_4bit_quant_type: fp4
244
+ - bnb_4bit_use_double_quant: False
245
+ - bnb_4bit_compute_dtype: float32
246
+
247
+ The following `bitsandbytes` quantization config was used during training:
248
+ - quant_method: bitsandbytes
249
+ - load_in_8bit: True
250
+ - load_in_4bit: False
251
+ - llm_int8_threshold: 6.0
252
+ - llm_int8_skip_modules: None
253
+ - llm_int8_enable_fp32_cpu_offload: False
254
+ - llm_int8_has_fp16_weight: False
255
+ - bnb_4bit_quant_type: fp4
256
+ - bnb_4bit_use_double_quant: False
257
+ - bnb_4bit_compute_dtype: float32
258
+
259
+ The following `bitsandbytes` quantization config was used during training:
260
+ - quant_method: bitsandbytes
261
+ - load_in_8bit: True
262
+ - load_in_4bit: False
263
+ - llm_int8_threshold: 6.0
264
+ - llm_int8_skip_modules: None
265
+ - llm_int8_enable_fp32_cpu_offload: False
266
+ - llm_int8_has_fp16_weight: False
267
+ - bnb_4bit_quant_type: fp4
268
+ - bnb_4bit_use_double_quant: False
269
+ - bnb_4bit_compute_dtype: float32
270
+
271
+ The following `bitsandbytes` quantization config was used during training:
272
+ - quant_method: bitsandbytes
273
+ - load_in_8bit: True
274
+ - load_in_4bit: False
275
+ - llm_int8_threshold: 6.0
276
+ - llm_int8_skip_modules: None
277
+ - llm_int8_enable_fp32_cpu_offload: False
278
+ - llm_int8_has_fp16_weight: False
279
+ - bnb_4bit_quant_type: fp4
280
+ - bnb_4bit_use_double_quant: False
281
+ - bnb_4bit_compute_dtype: float32
282
+
283
+ The following `bitsandbytes` quantization config was used during training:
284
+ - quant_method: bitsandbytes
285
+ - load_in_8bit: True
286
+ - load_in_4bit: False
287
+ - llm_int8_threshold: 6.0
288
+ - llm_int8_skip_modules: None
289
+ - llm_int8_enable_fp32_cpu_offload: False
290
+ - llm_int8_has_fp16_weight: False
291
+ - bnb_4bit_quant_type: fp4
292
+ - bnb_4bit_use_double_quant: False
293
+ - bnb_4bit_compute_dtype: float32
294
+
295
+ The following `bitsandbytes` quantization config was used during training:
296
+ - quant_method: bitsandbytes
297
+ - load_in_8bit: True
298
+ - load_in_4bit: False
299
+ - llm_int8_threshold: 6.0
300
+ - llm_int8_skip_modules: None
301
+ - llm_int8_enable_fp32_cpu_offload: False
302
+ - llm_int8_has_fp16_weight: False
303
+ - bnb_4bit_quant_type: fp4
304
+ - bnb_4bit_use_double_quant: False
305
+ - bnb_4bit_compute_dtype: float32
306
+
307
+ The following `bitsandbytes` quantization config was used during training:
308
+ - quant_method: bitsandbytes
309
+ - load_in_8bit: True
310
+ - load_in_4bit: False
311
+ - llm_int8_threshold: 6.0
312
+ - llm_int8_skip_modules: None
313
+ - llm_int8_enable_fp32_cpu_offload: False
314
+ - llm_int8_has_fp16_weight: False
315
+ - bnb_4bit_quant_type: fp4
316
+ - bnb_4bit_use_double_quant: False
317
+ - bnb_4bit_compute_dtype: float32
318
+
319
+ The following `bitsandbytes` quantization config was used during training:
320
+ - quant_method: bitsandbytes
321
+ - load_in_8bit: True
322
+ - load_in_4bit: False
323
+ - llm_int8_threshold: 6.0
324
+ - llm_int8_skip_modules: None
325
+ - llm_int8_enable_fp32_cpu_offload: False
326
+ - llm_int8_has_fp16_weight: False
327
+ - bnb_4bit_quant_type: fp4
328
+ - bnb_4bit_use_double_quant: False
329
+ - bnb_4bit_compute_dtype: float32
330
+ ### Framework versions
331
+
332
+ - PEFT 0.5.0
333
+ - PEFT 0.5.0
334
+ - PEFT 0.5.0
335
+ - PEFT 0.5.0
336
+ - PEFT 0.5.0
337
+ - PEFT 0.5.0
338
+ - PEFT 0.5.0
339
+ - PEFT 0.5.0
340
+ - PEFT 0.5.0
341
+ - PEFT 0.5.0
342
+ - PEFT 0.5.0
343
+ - PEFT 0.5.0
344
+ - PEFT 0.5.0
345
+ - PEFT 0.5.0
346
+ - PEFT 0.5.0
347
+ - PEFT 0.5.0
348
+ - PEFT 0.5.0
349
+ - PEFT 0.5.0
350
+ - PEFT 0.5.0
351
+ - PEFT 0.5.0
352
+ - PEFT 0.5.0
353
+ - PEFT 0.5.0
354
+ - PEFT 0.5.0
355
+ - PEFT 0.5.0
356
+
357
+ - PEFT 0.5.0
adapter_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-70b-hf",
4
+ "bias": "none",
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 32,
11
+ "lora_dropout": 0.05,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 8,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "q_proj",
18
+ "v_proj"
19
+ ],
20
+ "task_type": "CAUSAL_LM"
21
+ }
adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:120826b59370d1320fc855be8a66bc4ed0018b13167789cf4c4bdc57459bc50d
3
+ size 65652106
checkpoint-290/README.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - quant_method: bitsandbytes
9
+ - _load_in_8bit: False
10
+ - _load_in_4bit: False
11
+ - llm_int8_threshold: 6.0
12
+ - llm_int8_skip_modules: None
13
+ - llm_int8_enable_fp32_cpu_offload: False
14
+ - llm_int8_has_fp16_weight: False
15
+ - bnb_4bit_quant_type: fp4
16
+ - bnb_4bit_use_double_quant: False
17
+ - bnb_4bit_compute_dtype: float32
18
+ - load_in_4bit: False
19
+ - load_in_8bit: False
20
+ ### Framework versions
21
+
22
+
23
+ - PEFT 0.5.0
checkpoint-290/adapter_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-70b-hf",
4
+ "bias": "none",
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 32,
11
+ "lora_dropout": 0.05,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 8,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "q_proj",
18
+ "v_proj"
19
+ ],
20
+ "task_type": "CAUSAL_LM"
21
+ }
checkpoint-290/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33b52e7f27a554af95df693aebd9a2193c06c2b8b5b11c5550cfcb897cec6d90
3
+ size 65578776
checkpoint-290/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63e609378f34d3612474ffc8b5abd122c6d145243a1294dd6cda2a1067d8e0ec
3
+ size 131345914
checkpoint-290/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dde7e0fcf5e900fab5c1d3d8900eb1c1683390e863c7d5842d9b61d44f86a207
3
+ size 14244
checkpoint-290/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de03e6c88bd8caf4db5520e8793fef794ac2af1f927ce3195c6aac84b5dec25c
3
+ size 1064
checkpoint-290/trainer_state.json ADDED
@@ -0,0 +1,891 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.9927281737327576,
3
+ "best_model_checkpoint": "/scratch/kwamea/llama-output/checkpoint-290",
4
+ "epoch": 38.666666666666664,
5
+ "eval_steps": 5,
6
+ "global_step": 290,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.67,
13
+ "grad_norm": 0.243179589509964,
14
+ "learning_rate": 9.857142857142858e-05,
15
+ "loss": 1.9956,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.67,
20
+ "eval_loss": 1.9701930284500122,
21
+ "eval_runtime": 17.115,
22
+ "eval_samples_per_second": 0.409,
23
+ "eval_steps_per_second": 0.058,
24
+ "step": 5
25
+ },
26
+ {
27
+ "epoch": 1.33,
28
+ "grad_norm": 0.34590908885002136,
29
+ "learning_rate": 9.714285714285715e-05,
30
+ "loss": 1.9758,
31
+ "step": 10
32
+ },
33
+ {
34
+ "epoch": 1.33,
35
+ "eval_loss": 1.8941271305084229,
36
+ "eval_runtime": 17.0912,
37
+ "eval_samples_per_second": 0.41,
38
+ "eval_steps_per_second": 0.059,
39
+ "step": 10
40
+ },
41
+ {
42
+ "epoch": 2.0,
43
+ "grad_norm": 0.31595832109451294,
44
+ "learning_rate": 9.571428571428573e-05,
45
+ "loss": 1.849,
46
+ "step": 15
47
+ },
48
+ {
49
+ "epoch": 2.0,
50
+ "eval_loss": 1.8046789169311523,
51
+ "eval_runtime": 17.098,
52
+ "eval_samples_per_second": 0.409,
53
+ "eval_steps_per_second": 0.058,
54
+ "step": 15
55
+ },
56
+ {
57
+ "epoch": 2.67,
58
+ "grad_norm": 0.3428090512752533,
59
+ "learning_rate": 9.428571428571429e-05,
60
+ "loss": 1.789,
61
+ "step": 20
62
+ },
63
+ {
64
+ "epoch": 2.67,
65
+ "eval_loss": 1.7658358812332153,
66
+ "eval_runtime": 17.0734,
67
+ "eval_samples_per_second": 0.41,
68
+ "eval_steps_per_second": 0.059,
69
+ "step": 20
70
+ },
71
+ {
72
+ "epoch": 3.33,
73
+ "grad_norm": 0.3102028965950012,
74
+ "learning_rate": 9.285714285714286e-05,
75
+ "loss": 1.7789,
76
+ "step": 25
77
+ },
78
+ {
79
+ "epoch": 3.33,
80
+ "eval_loss": 1.7225048542022705,
81
+ "eval_runtime": 17.0972,
82
+ "eval_samples_per_second": 0.409,
83
+ "eval_steps_per_second": 0.058,
84
+ "step": 25
85
+ },
86
+ {
87
+ "epoch": 4.0,
88
+ "grad_norm": 0.38602885603904724,
89
+ "learning_rate": 9.142857142857143e-05,
90
+ "loss": 1.7003,
91
+ "step": 30
92
+ },
93
+ {
94
+ "epoch": 4.0,
95
+ "eval_loss": 1.6749440431594849,
96
+ "eval_runtime": 17.1034,
97
+ "eval_samples_per_second": 0.409,
98
+ "eval_steps_per_second": 0.058,
99
+ "step": 30
100
+ },
101
+ {
102
+ "epoch": 4.67,
103
+ "grad_norm": 0.37120407819747925,
104
+ "learning_rate": 9e-05,
105
+ "loss": 1.6424,
106
+ "step": 35
107
+ },
108
+ {
109
+ "epoch": 4.67,
110
+ "eval_loss": 1.6231099367141724,
111
+ "eval_runtime": 17.1067,
112
+ "eval_samples_per_second": 0.409,
113
+ "eval_steps_per_second": 0.058,
114
+ "step": 35
115
+ },
116
+ {
117
+ "epoch": 5.33,
118
+ "grad_norm": 0.4633428454399109,
119
+ "learning_rate": 8.857142857142857e-05,
120
+ "loss": 1.6023,
121
+ "step": 40
122
+ },
123
+ {
124
+ "epoch": 5.33,
125
+ "eval_loss": 1.5727053880691528,
126
+ "eval_runtime": 17.1002,
127
+ "eval_samples_per_second": 0.409,
128
+ "eval_steps_per_second": 0.058,
129
+ "step": 40
130
+ },
131
+ {
132
+ "epoch": 6.0,
133
+ "grad_norm": 0.5034663081169128,
134
+ "learning_rate": 8.714285714285715e-05,
135
+ "loss": 1.5322,
136
+ "step": 45
137
+ },
138
+ {
139
+ "epoch": 6.0,
140
+ "eval_loss": 1.5312587022781372,
141
+ "eval_runtime": 17.1159,
142
+ "eval_samples_per_second": 0.409,
143
+ "eval_steps_per_second": 0.058,
144
+ "step": 45
145
+ },
146
+ {
147
+ "epoch": 6.67,
148
+ "grad_norm": 0.5549929141998291,
149
+ "learning_rate": 8.571428571428571e-05,
150
+ "loss": 1.4788,
151
+ "step": 50
152
+ },
153
+ {
154
+ "epoch": 6.67,
155
+ "eval_loss": 1.492464303970337,
156
+ "eval_runtime": 17.0823,
157
+ "eval_samples_per_second": 0.41,
158
+ "eval_steps_per_second": 0.059,
159
+ "step": 50
160
+ },
161
+ {
162
+ "epoch": 7.33,
163
+ "grad_norm": 0.49194690585136414,
164
+ "learning_rate": 8.428571428571429e-05,
165
+ "loss": 1.4632,
166
+ "step": 55
167
+ },
168
+ {
169
+ "epoch": 7.33,
170
+ "eval_loss": 1.4622489213943481,
171
+ "eval_runtime": 17.1022,
172
+ "eval_samples_per_second": 0.409,
173
+ "eval_steps_per_second": 0.058,
174
+ "step": 55
175
+ },
176
+ {
177
+ "epoch": 8.0,
178
+ "grad_norm": 0.5866131782531738,
179
+ "learning_rate": 8.285714285714287e-05,
180
+ "loss": 1.3951,
181
+ "step": 60
182
+ },
183
+ {
184
+ "epoch": 8.0,
185
+ "eval_loss": 1.435951828956604,
186
+ "eval_runtime": 17.1087,
187
+ "eval_samples_per_second": 0.409,
188
+ "eval_steps_per_second": 0.058,
189
+ "step": 60
190
+ },
191
+ {
192
+ "epoch": 8.67,
193
+ "grad_norm": 0.6252542734146118,
194
+ "learning_rate": 8.142857142857143e-05,
195
+ "loss": 1.3796,
196
+ "step": 65
197
+ },
198
+ {
199
+ "epoch": 8.67,
200
+ "eval_loss": 1.413227915763855,
201
+ "eval_runtime": 17.0914,
202
+ "eval_samples_per_second": 0.41,
203
+ "eval_steps_per_second": 0.059,
204
+ "step": 65
205
+ },
206
+ {
207
+ "epoch": 9.33,
208
+ "grad_norm": 0.6751863360404968,
209
+ "learning_rate": 8e-05,
210
+ "loss": 1.3257,
211
+ "step": 70
212
+ },
213
+ {
214
+ "epoch": 9.33,
215
+ "eval_loss": 1.395649790763855,
216
+ "eval_runtime": 17.0885,
217
+ "eval_samples_per_second": 0.41,
218
+ "eval_steps_per_second": 0.059,
219
+ "step": 70
220
+ },
221
+ {
222
+ "epoch": 10.0,
223
+ "grad_norm": 0.8878222703933716,
224
+ "learning_rate": 7.857142857142858e-05,
225
+ "loss": 1.2795,
226
+ "step": 75
227
+ },
228
+ {
229
+ "epoch": 10.0,
230
+ "eval_loss": 1.3699487447738647,
231
+ "eval_runtime": 17.1031,
232
+ "eval_samples_per_second": 0.409,
233
+ "eval_steps_per_second": 0.058,
234
+ "step": 75
235
+ },
236
+ {
237
+ "epoch": 10.67,
238
+ "grad_norm": 0.8470121026039124,
239
+ "learning_rate": 7.714285714285715e-05,
240
+ "loss": 1.2449,
241
+ "step": 80
242
+ },
243
+ {
244
+ "epoch": 10.67,
245
+ "eval_loss": 1.347831130027771,
246
+ "eval_runtime": 17.0985,
247
+ "eval_samples_per_second": 0.409,
248
+ "eval_steps_per_second": 0.058,
249
+ "step": 80
250
+ },
251
+ {
252
+ "epoch": 11.33,
253
+ "grad_norm": 1.0655425786972046,
254
+ "learning_rate": 7.571428571428571e-05,
255
+ "loss": 1.1983,
256
+ "step": 85
257
+ },
258
+ {
259
+ "epoch": 11.33,
260
+ "eval_loss": 1.3311971426010132,
261
+ "eval_runtime": 17.0784,
262
+ "eval_samples_per_second": 0.41,
263
+ "eval_steps_per_second": 0.059,
264
+ "step": 85
265
+ },
266
+ {
267
+ "epoch": 12.0,
268
+ "grad_norm": 1.2651888132095337,
269
+ "learning_rate": 7.428571428571429e-05,
270
+ "loss": 1.1467,
271
+ "step": 90
272
+ },
273
+ {
274
+ "epoch": 12.0,
275
+ "eval_loss": 1.3095277547836304,
276
+ "eval_runtime": 17.0903,
277
+ "eval_samples_per_second": 0.41,
278
+ "eval_steps_per_second": 0.059,
279
+ "step": 90
280
+ },
281
+ {
282
+ "epoch": 12.67,
283
+ "grad_norm": 1.248926043510437,
284
+ "learning_rate": 7.285714285714286e-05,
285
+ "loss": 1.0922,
286
+ "step": 95
287
+ },
288
+ {
289
+ "epoch": 12.67,
290
+ "eval_loss": 1.2942878007888794,
291
+ "eval_runtime": 17.0947,
292
+ "eval_samples_per_second": 0.409,
293
+ "eval_steps_per_second": 0.058,
294
+ "step": 95
295
+ },
296
+ {
297
+ "epoch": 13.33,
298
+ "grad_norm": 1.896952509880066,
299
+ "learning_rate": 7.142857142857143e-05,
300
+ "loss": 1.0403,
301
+ "step": 100
302
+ },
303
+ {
304
+ "epoch": 13.33,
305
+ "eval_loss": 1.2803159952163696,
306
+ "eval_runtime": 17.0819,
307
+ "eval_samples_per_second": 0.41,
308
+ "eval_steps_per_second": 0.059,
309
+ "step": 100
310
+ },
311
+ {
312
+ "epoch": 14.0,
313
+ "grad_norm": 1.862244725227356,
314
+ "learning_rate": 7e-05,
315
+ "loss": 1.0049,
316
+ "step": 105
317
+ },
318
+ {
319
+ "epoch": 14.0,
320
+ "eval_loss": 1.2643567323684692,
321
+ "eval_runtime": 17.0849,
322
+ "eval_samples_per_second": 0.41,
323
+ "eval_steps_per_second": 0.059,
324
+ "step": 105
325
+ },
326
+ {
327
+ "epoch": 14.67,
328
+ "grad_norm": 1.7487821578979492,
329
+ "learning_rate": 6.857142857142858e-05,
330
+ "loss": 0.9262,
331
+ "step": 110
332
+ },
333
+ {
334
+ "epoch": 14.67,
335
+ "eval_loss": 1.2471646070480347,
336
+ "eval_runtime": 17.1278,
337
+ "eval_samples_per_second": 0.409,
338
+ "eval_steps_per_second": 0.058,
339
+ "step": 110
340
+ },
341
+ {
342
+ "epoch": 15.33,
343
+ "grad_norm": 1.838605284690857,
344
+ "learning_rate": 6.714285714285714e-05,
345
+ "loss": 0.8965,
346
+ "step": 115
347
+ },
348
+ {
349
+ "epoch": 15.33,
350
+ "eval_loss": 1.2377034425735474,
351
+ "eval_runtime": 17.0731,
352
+ "eval_samples_per_second": 0.41,
353
+ "eval_steps_per_second": 0.059,
354
+ "step": 115
355
+ },
356
+ {
357
+ "epoch": 16.0,
358
+ "grad_norm": 3.117398977279663,
359
+ "learning_rate": 6.571428571428571e-05,
360
+ "loss": 0.8581,
361
+ "step": 120
362
+ },
363
+ {
364
+ "epoch": 16.0,
365
+ "eval_loss": 1.2083133459091187,
366
+ "eval_runtime": 17.1304,
367
+ "eval_samples_per_second": 0.409,
368
+ "eval_steps_per_second": 0.058,
369
+ "step": 120
370
+ },
371
+ {
372
+ "epoch": 16.67,
373
+ "grad_norm": 2.5655250549316406,
374
+ "learning_rate": 6.428571428571429e-05,
375
+ "loss": 0.7929,
376
+ "step": 125
377
+ },
378
+ {
379
+ "epoch": 16.67,
380
+ "eval_loss": 1.1945828199386597,
381
+ "eval_runtime": 17.104,
382
+ "eval_samples_per_second": 0.409,
383
+ "eval_steps_per_second": 0.058,
384
+ "step": 125
385
+ },
386
+ {
387
+ "epoch": 17.33,
388
+ "grad_norm": 2.168546199798584,
389
+ "learning_rate": 6.285714285714286e-05,
390
+ "loss": 0.7543,
391
+ "step": 130
392
+ },
393
+ {
394
+ "epoch": 17.33,
395
+ "eval_loss": 1.1876276731491089,
396
+ "eval_runtime": 17.1046,
397
+ "eval_samples_per_second": 0.409,
398
+ "eval_steps_per_second": 0.058,
399
+ "step": 130
400
+ },
401
+ {
402
+ "epoch": 18.0,
403
+ "grad_norm": 2.5984208583831787,
404
+ "learning_rate": 6.142857142857143e-05,
405
+ "loss": 0.716,
406
+ "step": 135
407
+ },
408
+ {
409
+ "epoch": 18.0,
410
+ "eval_loss": 1.1714750528335571,
411
+ "eval_runtime": 17.0807,
412
+ "eval_samples_per_second": 0.41,
413
+ "eval_steps_per_second": 0.059,
414
+ "step": 135
415
+ },
416
+ {
417
+ "epoch": 18.67,
418
+ "grad_norm": 3.479024887084961,
419
+ "learning_rate": 6e-05,
420
+ "loss": 0.6681,
421
+ "step": 140
422
+ },
423
+ {
424
+ "epoch": 18.67,
425
+ "eval_loss": 1.169895052909851,
426
+ "eval_runtime": 17.0681,
427
+ "eval_samples_per_second": 0.41,
428
+ "eval_steps_per_second": 0.059,
429
+ "step": 140
430
+ },
431
+ {
432
+ "epoch": 19.33,
433
+ "grad_norm": 2.563386917114258,
434
+ "learning_rate": 5.8571428571428575e-05,
435
+ "loss": 0.6306,
436
+ "step": 145
437
+ },
438
+ {
439
+ "epoch": 19.33,
440
+ "eval_loss": 1.1741083860397339,
441
+ "eval_runtime": 17.0568,
442
+ "eval_samples_per_second": 0.41,
443
+ "eval_steps_per_second": 0.059,
444
+ "step": 145
445
+ },
446
+ {
447
+ "epoch": 20.0,
448
+ "grad_norm": 2.96592116355896,
449
+ "learning_rate": 5.714285714285714e-05,
450
+ "loss": 0.6183,
451
+ "step": 150
452
+ },
453
+ {
454
+ "epoch": 20.0,
455
+ "eval_loss": 1.1455965042114258,
456
+ "eval_runtime": 17.073,
457
+ "eval_samples_per_second": 0.41,
458
+ "eval_steps_per_second": 0.059,
459
+ "step": 150
460
+ },
461
+ {
462
+ "epoch": 20.67,
463
+ "grad_norm": 2.6751275062561035,
464
+ "learning_rate": 5.571428571428572e-05,
465
+ "loss": 0.5464,
466
+ "step": 155
467
+ },
468
+ {
469
+ "epoch": 20.67,
470
+ "eval_loss": 1.131102204322815,
471
+ "eval_runtime": 17.0578,
472
+ "eval_samples_per_second": 0.41,
473
+ "eval_steps_per_second": 0.059,
474
+ "step": 155
475
+ },
476
+ {
477
+ "epoch": 21.33,
478
+ "grad_norm": 2.3700051307678223,
479
+ "learning_rate": 5.428571428571428e-05,
480
+ "loss": 0.551,
481
+ "step": 160
482
+ },
483
+ {
484
+ "epoch": 21.33,
485
+ "eval_loss": 1.127384066581726,
486
+ "eval_runtime": 17.0546,
487
+ "eval_samples_per_second": 0.41,
488
+ "eval_steps_per_second": 0.059,
489
+ "step": 160
490
+ },
491
+ {
492
+ "epoch": 22.0,
493
+ "grad_norm": 3.3827567100524902,
494
+ "learning_rate": 5.285714285714286e-05,
495
+ "loss": 0.5179,
496
+ "step": 165
497
+ },
498
+ {
499
+ "epoch": 22.0,
500
+ "eval_loss": 1.111584186553955,
501
+ "eval_runtime": 17.0812,
502
+ "eval_samples_per_second": 0.41,
503
+ "eval_steps_per_second": 0.059,
504
+ "step": 165
505
+ },
506
+ {
507
+ "epoch": 22.67,
508
+ "grad_norm": 3.55790114402771,
509
+ "learning_rate": 5.142857142857143e-05,
510
+ "loss": 0.4831,
511
+ "step": 170
512
+ },
513
+ {
514
+ "epoch": 22.67,
515
+ "eval_loss": 1.0948525667190552,
516
+ "eval_runtime": 17.0547,
517
+ "eval_samples_per_second": 0.41,
518
+ "eval_steps_per_second": 0.059,
519
+ "step": 170
520
+ },
521
+ {
522
+ "epoch": 23.33,
523
+ "grad_norm": 3.0782699584960938,
524
+ "learning_rate": 5e-05,
525
+ "loss": 0.4587,
526
+ "step": 175
527
+ },
528
+ {
529
+ "epoch": 23.33,
530
+ "eval_loss": 1.0906586647033691,
531
+ "eval_runtime": 17.0666,
532
+ "eval_samples_per_second": 0.41,
533
+ "eval_steps_per_second": 0.059,
534
+ "step": 175
535
+ },
536
+ {
537
+ "epoch": 24.0,
538
+ "grad_norm": 3.3993167877197266,
539
+ "learning_rate": 4.8571428571428576e-05,
540
+ "loss": 0.4203,
541
+ "step": 180
542
+ },
543
+ {
544
+ "epoch": 24.0,
545
+ "eval_loss": 1.0688152313232422,
546
+ "eval_runtime": 17.0721,
547
+ "eval_samples_per_second": 0.41,
548
+ "eval_steps_per_second": 0.059,
549
+ "step": 180
550
+ },
551
+ {
552
+ "epoch": 24.67,
553
+ "grad_norm": 3.319303035736084,
554
+ "learning_rate": 4.714285714285714e-05,
555
+ "loss": 0.3975,
556
+ "step": 185
557
+ },
558
+ {
559
+ "epoch": 24.67,
560
+ "eval_loss": 1.0746583938598633,
561
+ "eval_runtime": 17.0709,
562
+ "eval_samples_per_second": 0.41,
563
+ "eval_steps_per_second": 0.059,
564
+ "step": 185
565
+ },
566
+ {
567
+ "epoch": 25.33,
568
+ "grad_norm": 2.4532127380371094,
569
+ "learning_rate": 4.5714285714285716e-05,
570
+ "loss": 0.3832,
571
+ "step": 190
572
+ },
573
+ {
574
+ "epoch": 25.33,
575
+ "eval_loss": 1.0772522687911987,
576
+ "eval_runtime": 17.0619,
577
+ "eval_samples_per_second": 0.41,
578
+ "eval_steps_per_second": 0.059,
579
+ "step": 190
580
+ },
581
+ {
582
+ "epoch": 26.0,
583
+ "grad_norm": 3.956822156906128,
584
+ "learning_rate": 4.428571428571428e-05,
585
+ "loss": 0.3725,
586
+ "step": 195
587
+ },
588
+ {
589
+ "epoch": 26.0,
590
+ "eval_loss": 1.0638784170150757,
591
+ "eval_runtime": 17.0807,
592
+ "eval_samples_per_second": 0.41,
593
+ "eval_steps_per_second": 0.059,
594
+ "step": 195
595
+ },
596
+ {
597
+ "epoch": 26.67,
598
+ "grad_norm": 2.76033353805542,
599
+ "learning_rate": 4.2857142857142856e-05,
600
+ "loss": 0.3473,
601
+ "step": 200
602
+ },
603
+ {
604
+ "epoch": 26.67,
605
+ "eval_loss": 1.04669988155365,
606
+ "eval_runtime": 17.0774,
607
+ "eval_samples_per_second": 0.41,
608
+ "eval_steps_per_second": 0.059,
609
+ "step": 200
610
+ },
611
+ {
612
+ "epoch": 27.33,
613
+ "grad_norm": 3.8683507442474365,
614
+ "learning_rate": 4.1428571428571437e-05,
615
+ "loss": 0.3243,
616
+ "step": 205
617
+ },
618
+ {
619
+ "epoch": 27.33,
620
+ "eval_loss": 1.0470303297042847,
621
+ "eval_runtime": 17.0718,
622
+ "eval_samples_per_second": 0.41,
623
+ "eval_steps_per_second": 0.059,
624
+ "step": 205
625
+ },
626
+ {
627
+ "epoch": 28.0,
628
+ "grad_norm": 4.535538196563721,
629
+ "learning_rate": 4e-05,
630
+ "loss": 0.3202,
631
+ "step": 210
632
+ },
633
+ {
634
+ "epoch": 28.0,
635
+ "eval_loss": 1.025539517402649,
636
+ "eval_runtime": 17.0604,
637
+ "eval_samples_per_second": 0.41,
638
+ "eval_steps_per_second": 0.059,
639
+ "step": 210
640
+ },
641
+ {
642
+ "epoch": 28.67,
643
+ "grad_norm": 2.6224355697631836,
644
+ "learning_rate": 3.857142857142858e-05,
645
+ "loss": 0.2958,
646
+ "step": 215
647
+ },
648
+ {
649
+ "epoch": 28.67,
650
+ "eval_loss": 1.0192126035690308,
651
+ "eval_runtime": 17.0657,
652
+ "eval_samples_per_second": 0.41,
653
+ "eval_steps_per_second": 0.059,
654
+ "step": 215
655
+ },
656
+ {
657
+ "epoch": 29.33,
658
+ "grad_norm": 2.5870041847229004,
659
+ "learning_rate": 3.7142857142857143e-05,
660
+ "loss": 0.2783,
661
+ "step": 220
662
+ },
663
+ {
664
+ "epoch": 29.33,
665
+ "eval_loss": 1.0211580991744995,
666
+ "eval_runtime": 17.0857,
667
+ "eval_samples_per_second": 0.41,
668
+ "eval_steps_per_second": 0.059,
669
+ "step": 220
670
+ },
671
+ {
672
+ "epoch": 30.0,
673
+ "grad_norm": 3.4565751552581787,
674
+ "learning_rate": 3.571428571428572e-05,
675
+ "loss": 0.2773,
676
+ "step": 225
677
+ },
678
+ {
679
+ "epoch": 30.0,
680
+ "eval_loss": 1.006419062614441,
681
+ "eval_runtime": 17.0807,
682
+ "eval_samples_per_second": 0.41,
683
+ "eval_steps_per_second": 0.059,
684
+ "step": 225
685
+ },
686
+ {
687
+ "epoch": 30.67,
688
+ "grad_norm": 2.4756500720977783,
689
+ "learning_rate": 3.428571428571429e-05,
690
+ "loss": 0.2482,
691
+ "step": 230
692
+ },
693
+ {
694
+ "epoch": 30.67,
695
+ "eval_loss": 1.0081219673156738,
696
+ "eval_runtime": 17.0576,
697
+ "eval_samples_per_second": 0.41,
698
+ "eval_steps_per_second": 0.059,
699
+ "step": 230
700
+ },
701
+ {
702
+ "epoch": 31.33,
703
+ "grad_norm": 2.38002610206604,
704
+ "learning_rate": 3.285714285714286e-05,
705
+ "loss": 0.2464,
706
+ "step": 235
707
+ },
708
+ {
709
+ "epoch": 31.33,
710
+ "eval_loss": 1.0151804685592651,
711
+ "eval_runtime": 17.0587,
712
+ "eval_samples_per_second": 0.41,
713
+ "eval_steps_per_second": 0.059,
714
+ "step": 235
715
+ },
716
+ {
717
+ "epoch": 32.0,
718
+ "grad_norm": 3.7081105709075928,
719
+ "learning_rate": 3.142857142857143e-05,
720
+ "loss": 0.2442,
721
+ "step": 240
722
+ },
723
+ {
724
+ "epoch": 32.0,
725
+ "eval_loss": 1.0032445192337036,
726
+ "eval_runtime": 17.1613,
727
+ "eval_samples_per_second": 0.408,
728
+ "eval_steps_per_second": 0.058,
729
+ "step": 240
730
+ },
731
+ {
732
+ "epoch": 32.67,
733
+ "grad_norm": 2.55924391746521,
734
+ "learning_rate": 3e-05,
735
+ "loss": 0.2193,
736
+ "step": 245
737
+ },
738
+ {
739
+ "epoch": 32.67,
740
+ "eval_loss": 0.9989615082740784,
741
+ "eval_runtime": 17.0447,
742
+ "eval_samples_per_second": 0.411,
743
+ "eval_steps_per_second": 0.059,
744
+ "step": 245
745
+ },
746
+ {
747
+ "epoch": 33.33,
748
+ "grad_norm": 1.9451407194137573,
749
+ "learning_rate": 2.857142857142857e-05,
750
+ "loss": 0.2101,
751
+ "step": 250
752
+ },
753
+ {
754
+ "epoch": 33.33,
755
+ "eval_loss": 1.0029457807540894,
756
+ "eval_runtime": 17.0816,
757
+ "eval_samples_per_second": 0.41,
758
+ "eval_steps_per_second": 0.059,
759
+ "step": 250
760
+ },
761
+ {
762
+ "epoch": 34.0,
763
+ "grad_norm": 2.713731527328491,
764
+ "learning_rate": 2.714285714285714e-05,
765
+ "loss": 0.2194,
766
+ "step": 255
767
+ },
768
+ {
769
+ "epoch": 34.0,
770
+ "eval_loss": 0.9959421753883362,
771
+ "eval_runtime": 17.0747,
772
+ "eval_samples_per_second": 0.41,
773
+ "eval_steps_per_second": 0.059,
774
+ "step": 255
775
+ },
776
+ {
777
+ "epoch": 34.67,
778
+ "grad_norm": 2.1633846759796143,
779
+ "learning_rate": 2.5714285714285714e-05,
780
+ "loss": 0.1958,
781
+ "step": 260
782
+ },
783
+ {
784
+ "epoch": 34.67,
785
+ "eval_loss": 0.9989770650863647,
786
+ "eval_runtime": 17.0821,
787
+ "eval_samples_per_second": 0.41,
788
+ "eval_steps_per_second": 0.059,
789
+ "step": 260
790
+ },
791
+ {
792
+ "epoch": 35.33,
793
+ "grad_norm": 3.9233529567718506,
794
+ "learning_rate": 2.4285714285714288e-05,
795
+ "loss": 0.1831,
796
+ "step": 265
797
+ },
798
+ {
799
+ "epoch": 35.33,
800
+ "eval_loss": 1.0072578191757202,
801
+ "eval_runtime": 17.0564,
802
+ "eval_samples_per_second": 0.41,
803
+ "eval_steps_per_second": 0.059,
804
+ "step": 265
805
+ },
806
+ {
807
+ "epoch": 36.0,
808
+ "grad_norm": 2.4143056869506836,
809
+ "learning_rate": 2.2857142857142858e-05,
810
+ "loss": 0.1753,
811
+ "step": 270
812
+ },
813
+ {
814
+ "epoch": 36.0,
815
+ "eval_loss": 0.9938892722129822,
816
+ "eval_runtime": 17.0668,
817
+ "eval_samples_per_second": 0.41,
818
+ "eval_steps_per_second": 0.059,
819
+ "step": 270
820
+ },
821
+ {
822
+ "epoch": 36.67,
823
+ "grad_norm": 2.706679582595825,
824
+ "learning_rate": 2.1428571428571428e-05,
825
+ "loss": 0.1698,
826
+ "step": 275
827
+ },
828
+ {
829
+ "epoch": 36.67,
830
+ "eval_loss": 0.9969200491905212,
831
+ "eval_runtime": 17.0643,
832
+ "eval_samples_per_second": 0.41,
833
+ "eval_steps_per_second": 0.059,
834
+ "step": 275
835
+ },
836
+ {
837
+ "epoch": 37.33,
838
+ "grad_norm": 1.872753620147705,
839
+ "learning_rate": 2e-05,
840
+ "loss": 0.16,
841
+ "step": 280
842
+ },
843
+ {
844
+ "epoch": 37.33,
845
+ "eval_loss": 0.9940390586853027,
846
+ "eval_runtime": 17.0728,
847
+ "eval_samples_per_second": 0.41,
848
+ "eval_steps_per_second": 0.059,
849
+ "step": 280
850
+ },
851
+ {
852
+ "epoch": 38.0,
853
+ "grad_norm": 2.7510581016540527,
854
+ "learning_rate": 1.8571428571428572e-05,
855
+ "loss": 0.1614,
856
+ "step": 285
857
+ },
858
+ {
859
+ "epoch": 38.0,
860
+ "eval_loss": 1.0066231489181519,
861
+ "eval_runtime": 17.072,
862
+ "eval_samples_per_second": 0.41,
863
+ "eval_steps_per_second": 0.059,
864
+ "step": 285
865
+ },
866
+ {
867
+ "epoch": 38.67,
868
+ "grad_norm": 1.8461092710494995,
869
+ "learning_rate": 1.7142857142857145e-05,
870
+ "loss": 0.1506,
871
+ "step": 290
872
+ },
873
+ {
874
+ "epoch": 38.67,
875
+ "eval_loss": 0.9927281737327576,
876
+ "eval_runtime": 17.0481,
877
+ "eval_samples_per_second": 0.411,
878
+ "eval_steps_per_second": 0.059,
879
+ "step": 290
880
+ }
881
+ ],
882
+ "logging_steps": 5,
883
+ "max_steps": 350,
884
+ "num_input_tokens_seen": 0,
885
+ "num_train_epochs": 50,
886
+ "save_steps": 10,
887
+ "total_flos": 1.9272976816637215e+18,
888
+ "train_batch_size": 2,
889
+ "trial_name": null,
890
+ "trial_params": null
891
+ }
checkpoint-290/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ed0ba6fb9a88dad56f61d9306f17b4e66e8767d898772faa97871a1388e82cf
3
+ size 4920
checkpoint-320/README.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - quant_method: bitsandbytes
9
+ - _load_in_8bit: False
10
+ - _load_in_4bit: False
11
+ - llm_int8_threshold: 6.0
12
+ - llm_int8_skip_modules: None
13
+ - llm_int8_enable_fp32_cpu_offload: False
14
+ - llm_int8_has_fp16_weight: False
15
+ - bnb_4bit_quant_type: fp4
16
+ - bnb_4bit_use_double_quant: False
17
+ - bnb_4bit_compute_dtype: float32
18
+ - load_in_4bit: False
19
+ - load_in_8bit: False
20
+ ### Framework versions
21
+
22
+
23
+ - PEFT 0.5.0
checkpoint-320/adapter_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-70b-hf",
4
+ "bias": "none",
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 32,
11
+ "lora_dropout": 0.05,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 8,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "q_proj",
18
+ "v_proj"
19
+ ],
20
+ "task_type": "CAUSAL_LM"
21
+ }
checkpoint-320/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:503d1844ac09449a9022937fd6160c557504e89ac035c55bc1efe99d8474e6f2
3
+ size 65578776
checkpoint-320/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f932a7f6af7ecf22ed10ae5ef43e3515ce5ae8605087025bef91b87a52248bc5
3
+ size 131345914
checkpoint-320/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33a70333a4fc39389b0589b00a370500b36754db76577b913a7a9f856a3ef8df
3
+ size 14244
checkpoint-320/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e41504e9cc891d7299640b1d039f4351835f3de1fa7153a1118aece45c9f2c20
3
+ size 1064
checkpoint-320/trainer_state.json ADDED
@@ -0,0 +1,981 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.9927281737327576,
3
+ "best_model_checkpoint": "/scratch/kwamea/llama-output/checkpoint-290",
4
+ "epoch": 42.666666666666664,
5
+ "eval_steps": 5,
6
+ "global_step": 320,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.67,
13
+ "grad_norm": 0.243179589509964,
14
+ "learning_rate": 9.857142857142858e-05,
15
+ "loss": 1.9956,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.67,
20
+ "eval_loss": 1.9701930284500122,
21
+ "eval_runtime": 17.115,
22
+ "eval_samples_per_second": 0.409,
23
+ "eval_steps_per_second": 0.058,
24
+ "step": 5
25
+ },
26
+ {
27
+ "epoch": 1.33,
28
+ "grad_norm": 0.34590908885002136,
29
+ "learning_rate": 9.714285714285715e-05,
30
+ "loss": 1.9758,
31
+ "step": 10
32
+ },
33
+ {
34
+ "epoch": 1.33,
35
+ "eval_loss": 1.8941271305084229,
36
+ "eval_runtime": 17.0912,
37
+ "eval_samples_per_second": 0.41,
38
+ "eval_steps_per_second": 0.059,
39
+ "step": 10
40
+ },
41
+ {
42
+ "epoch": 2.0,
43
+ "grad_norm": 0.31595832109451294,
44
+ "learning_rate": 9.571428571428573e-05,
45
+ "loss": 1.849,
46
+ "step": 15
47
+ },
48
+ {
49
+ "epoch": 2.0,
50
+ "eval_loss": 1.8046789169311523,
51
+ "eval_runtime": 17.098,
52
+ "eval_samples_per_second": 0.409,
53
+ "eval_steps_per_second": 0.058,
54
+ "step": 15
55
+ },
56
+ {
57
+ "epoch": 2.67,
58
+ "grad_norm": 0.3428090512752533,
59
+ "learning_rate": 9.428571428571429e-05,
60
+ "loss": 1.789,
61
+ "step": 20
62
+ },
63
+ {
64
+ "epoch": 2.67,
65
+ "eval_loss": 1.7658358812332153,
66
+ "eval_runtime": 17.0734,
67
+ "eval_samples_per_second": 0.41,
68
+ "eval_steps_per_second": 0.059,
69
+ "step": 20
70
+ },
71
+ {
72
+ "epoch": 3.33,
73
+ "grad_norm": 0.3102028965950012,
74
+ "learning_rate": 9.285714285714286e-05,
75
+ "loss": 1.7789,
76
+ "step": 25
77
+ },
78
+ {
79
+ "epoch": 3.33,
80
+ "eval_loss": 1.7225048542022705,
81
+ "eval_runtime": 17.0972,
82
+ "eval_samples_per_second": 0.409,
83
+ "eval_steps_per_second": 0.058,
84
+ "step": 25
85
+ },
86
+ {
87
+ "epoch": 4.0,
88
+ "grad_norm": 0.38602885603904724,
89
+ "learning_rate": 9.142857142857143e-05,
90
+ "loss": 1.7003,
91
+ "step": 30
92
+ },
93
+ {
94
+ "epoch": 4.0,
95
+ "eval_loss": 1.6749440431594849,
96
+ "eval_runtime": 17.1034,
97
+ "eval_samples_per_second": 0.409,
98
+ "eval_steps_per_second": 0.058,
99
+ "step": 30
100
+ },
101
+ {
102
+ "epoch": 4.67,
103
+ "grad_norm": 0.37120407819747925,
104
+ "learning_rate": 9e-05,
105
+ "loss": 1.6424,
106
+ "step": 35
107
+ },
108
+ {
109
+ "epoch": 4.67,
110
+ "eval_loss": 1.6231099367141724,
111
+ "eval_runtime": 17.1067,
112
+ "eval_samples_per_second": 0.409,
113
+ "eval_steps_per_second": 0.058,
114
+ "step": 35
115
+ },
116
+ {
117
+ "epoch": 5.33,
118
+ "grad_norm": 0.4633428454399109,
119
+ "learning_rate": 8.857142857142857e-05,
120
+ "loss": 1.6023,
121
+ "step": 40
122
+ },
123
+ {
124
+ "epoch": 5.33,
125
+ "eval_loss": 1.5727053880691528,
126
+ "eval_runtime": 17.1002,
127
+ "eval_samples_per_second": 0.409,
128
+ "eval_steps_per_second": 0.058,
129
+ "step": 40
130
+ },
131
+ {
132
+ "epoch": 6.0,
133
+ "grad_norm": 0.5034663081169128,
134
+ "learning_rate": 8.714285714285715e-05,
135
+ "loss": 1.5322,
136
+ "step": 45
137
+ },
138
+ {
139
+ "epoch": 6.0,
140
+ "eval_loss": 1.5312587022781372,
141
+ "eval_runtime": 17.1159,
142
+ "eval_samples_per_second": 0.409,
143
+ "eval_steps_per_second": 0.058,
144
+ "step": 45
145
+ },
146
+ {
147
+ "epoch": 6.67,
148
+ "grad_norm": 0.5549929141998291,
149
+ "learning_rate": 8.571428571428571e-05,
150
+ "loss": 1.4788,
151
+ "step": 50
152
+ },
153
+ {
154
+ "epoch": 6.67,
155
+ "eval_loss": 1.492464303970337,
156
+ "eval_runtime": 17.0823,
157
+ "eval_samples_per_second": 0.41,
158
+ "eval_steps_per_second": 0.059,
159
+ "step": 50
160
+ },
161
+ {
162
+ "epoch": 7.33,
163
+ "grad_norm": 0.49194690585136414,
164
+ "learning_rate": 8.428571428571429e-05,
165
+ "loss": 1.4632,
166
+ "step": 55
167
+ },
168
+ {
169
+ "epoch": 7.33,
170
+ "eval_loss": 1.4622489213943481,
171
+ "eval_runtime": 17.1022,
172
+ "eval_samples_per_second": 0.409,
173
+ "eval_steps_per_second": 0.058,
174
+ "step": 55
175
+ },
176
+ {
177
+ "epoch": 8.0,
178
+ "grad_norm": 0.5866131782531738,
179
+ "learning_rate": 8.285714285714287e-05,
180
+ "loss": 1.3951,
181
+ "step": 60
182
+ },
183
+ {
184
+ "epoch": 8.0,
185
+ "eval_loss": 1.435951828956604,
186
+ "eval_runtime": 17.1087,
187
+ "eval_samples_per_second": 0.409,
188
+ "eval_steps_per_second": 0.058,
189
+ "step": 60
190
+ },
191
+ {
192
+ "epoch": 8.67,
193
+ "grad_norm": 0.6252542734146118,
194
+ "learning_rate": 8.142857142857143e-05,
195
+ "loss": 1.3796,
196
+ "step": 65
197
+ },
198
+ {
199
+ "epoch": 8.67,
200
+ "eval_loss": 1.413227915763855,
201
+ "eval_runtime": 17.0914,
202
+ "eval_samples_per_second": 0.41,
203
+ "eval_steps_per_second": 0.059,
204
+ "step": 65
205
+ },
206
+ {
207
+ "epoch": 9.33,
208
+ "grad_norm": 0.6751863360404968,
209
+ "learning_rate": 8e-05,
210
+ "loss": 1.3257,
211
+ "step": 70
212
+ },
213
+ {
214
+ "epoch": 9.33,
215
+ "eval_loss": 1.395649790763855,
216
+ "eval_runtime": 17.0885,
217
+ "eval_samples_per_second": 0.41,
218
+ "eval_steps_per_second": 0.059,
219
+ "step": 70
220
+ },
221
+ {
222
+ "epoch": 10.0,
223
+ "grad_norm": 0.8878222703933716,
224
+ "learning_rate": 7.857142857142858e-05,
225
+ "loss": 1.2795,
226
+ "step": 75
227
+ },
228
+ {
229
+ "epoch": 10.0,
230
+ "eval_loss": 1.3699487447738647,
231
+ "eval_runtime": 17.1031,
232
+ "eval_samples_per_second": 0.409,
233
+ "eval_steps_per_second": 0.058,
234
+ "step": 75
235
+ },
236
+ {
237
+ "epoch": 10.67,
238
+ "grad_norm": 0.8470121026039124,
239
+ "learning_rate": 7.714285714285715e-05,
240
+ "loss": 1.2449,
241
+ "step": 80
242
+ },
243
+ {
244
+ "epoch": 10.67,
245
+ "eval_loss": 1.347831130027771,
246
+ "eval_runtime": 17.0985,
247
+ "eval_samples_per_second": 0.409,
248
+ "eval_steps_per_second": 0.058,
249
+ "step": 80
250
+ },
251
+ {
252
+ "epoch": 11.33,
253
+ "grad_norm": 1.0655425786972046,
254
+ "learning_rate": 7.571428571428571e-05,
255
+ "loss": 1.1983,
256
+ "step": 85
257
+ },
258
+ {
259
+ "epoch": 11.33,
260
+ "eval_loss": 1.3311971426010132,
261
+ "eval_runtime": 17.0784,
262
+ "eval_samples_per_second": 0.41,
263
+ "eval_steps_per_second": 0.059,
264
+ "step": 85
265
+ },
266
+ {
267
+ "epoch": 12.0,
268
+ "grad_norm": 1.2651888132095337,
269
+ "learning_rate": 7.428571428571429e-05,
270
+ "loss": 1.1467,
271
+ "step": 90
272
+ },
273
+ {
274
+ "epoch": 12.0,
275
+ "eval_loss": 1.3095277547836304,
276
+ "eval_runtime": 17.0903,
277
+ "eval_samples_per_second": 0.41,
278
+ "eval_steps_per_second": 0.059,
279
+ "step": 90
280
+ },
281
+ {
282
+ "epoch": 12.67,
283
+ "grad_norm": 1.248926043510437,
284
+ "learning_rate": 7.285714285714286e-05,
285
+ "loss": 1.0922,
286
+ "step": 95
287
+ },
288
+ {
289
+ "epoch": 12.67,
290
+ "eval_loss": 1.2942878007888794,
291
+ "eval_runtime": 17.0947,
292
+ "eval_samples_per_second": 0.409,
293
+ "eval_steps_per_second": 0.058,
294
+ "step": 95
295
+ },
296
+ {
297
+ "epoch": 13.33,
298
+ "grad_norm": 1.896952509880066,
299
+ "learning_rate": 7.142857142857143e-05,
300
+ "loss": 1.0403,
301
+ "step": 100
302
+ },
303
+ {
304
+ "epoch": 13.33,
305
+ "eval_loss": 1.2803159952163696,
306
+ "eval_runtime": 17.0819,
307
+ "eval_samples_per_second": 0.41,
308
+ "eval_steps_per_second": 0.059,
309
+ "step": 100
310
+ },
311
+ {
312
+ "epoch": 14.0,
313
+ "grad_norm": 1.862244725227356,
314
+ "learning_rate": 7e-05,
315
+ "loss": 1.0049,
316
+ "step": 105
317
+ },
318
+ {
319
+ "epoch": 14.0,
320
+ "eval_loss": 1.2643567323684692,
321
+ "eval_runtime": 17.0849,
322
+ "eval_samples_per_second": 0.41,
323
+ "eval_steps_per_second": 0.059,
324
+ "step": 105
325
+ },
326
+ {
327
+ "epoch": 14.67,
328
+ "grad_norm": 1.7487821578979492,
329
+ "learning_rate": 6.857142857142858e-05,
330
+ "loss": 0.9262,
331
+ "step": 110
332
+ },
333
+ {
334
+ "epoch": 14.67,
335
+ "eval_loss": 1.2471646070480347,
336
+ "eval_runtime": 17.1278,
337
+ "eval_samples_per_second": 0.409,
338
+ "eval_steps_per_second": 0.058,
339
+ "step": 110
340
+ },
341
+ {
342
+ "epoch": 15.33,
343
+ "grad_norm": 1.838605284690857,
344
+ "learning_rate": 6.714285714285714e-05,
345
+ "loss": 0.8965,
346
+ "step": 115
347
+ },
348
+ {
349
+ "epoch": 15.33,
350
+ "eval_loss": 1.2377034425735474,
351
+ "eval_runtime": 17.0731,
352
+ "eval_samples_per_second": 0.41,
353
+ "eval_steps_per_second": 0.059,
354
+ "step": 115
355
+ },
356
+ {
357
+ "epoch": 16.0,
358
+ "grad_norm": 3.117398977279663,
359
+ "learning_rate": 6.571428571428571e-05,
360
+ "loss": 0.8581,
361
+ "step": 120
362
+ },
363
+ {
364
+ "epoch": 16.0,
365
+ "eval_loss": 1.2083133459091187,
366
+ "eval_runtime": 17.1304,
367
+ "eval_samples_per_second": 0.409,
368
+ "eval_steps_per_second": 0.058,
369
+ "step": 120
370
+ },
371
+ {
372
+ "epoch": 16.67,
373
+ "grad_norm": 2.5655250549316406,
374
+ "learning_rate": 6.428571428571429e-05,
375
+ "loss": 0.7929,
376
+ "step": 125
377
+ },
378
+ {
379
+ "epoch": 16.67,
380
+ "eval_loss": 1.1945828199386597,
381
+ "eval_runtime": 17.104,
382
+ "eval_samples_per_second": 0.409,
383
+ "eval_steps_per_second": 0.058,
384
+ "step": 125
385
+ },
386
+ {
387
+ "epoch": 17.33,
388
+ "grad_norm": 2.168546199798584,
389
+ "learning_rate": 6.285714285714286e-05,
390
+ "loss": 0.7543,
391
+ "step": 130
392
+ },
393
+ {
394
+ "epoch": 17.33,
395
+ "eval_loss": 1.1876276731491089,
396
+ "eval_runtime": 17.1046,
397
+ "eval_samples_per_second": 0.409,
398
+ "eval_steps_per_second": 0.058,
399
+ "step": 130
400
+ },
401
+ {
402
+ "epoch": 18.0,
403
+ "grad_norm": 2.5984208583831787,
404
+ "learning_rate": 6.142857142857143e-05,
405
+ "loss": 0.716,
406
+ "step": 135
407
+ },
408
+ {
409
+ "epoch": 18.0,
410
+ "eval_loss": 1.1714750528335571,
411
+ "eval_runtime": 17.0807,
412
+ "eval_samples_per_second": 0.41,
413
+ "eval_steps_per_second": 0.059,
414
+ "step": 135
415
+ },
416
+ {
417
+ "epoch": 18.67,
418
+ "grad_norm": 3.479024887084961,
419
+ "learning_rate": 6e-05,
420
+ "loss": 0.6681,
421
+ "step": 140
422
+ },
423
+ {
424
+ "epoch": 18.67,
425
+ "eval_loss": 1.169895052909851,
426
+ "eval_runtime": 17.0681,
427
+ "eval_samples_per_second": 0.41,
428
+ "eval_steps_per_second": 0.059,
429
+ "step": 140
430
+ },
431
+ {
432
+ "epoch": 19.33,
433
+ "grad_norm": 2.563386917114258,
434
+ "learning_rate": 5.8571428571428575e-05,
435
+ "loss": 0.6306,
436
+ "step": 145
437
+ },
438
+ {
439
+ "epoch": 19.33,
440
+ "eval_loss": 1.1741083860397339,
441
+ "eval_runtime": 17.0568,
442
+ "eval_samples_per_second": 0.41,
443
+ "eval_steps_per_second": 0.059,
444
+ "step": 145
445
+ },
446
+ {
447
+ "epoch": 20.0,
448
+ "grad_norm": 2.96592116355896,
449
+ "learning_rate": 5.714285714285714e-05,
450
+ "loss": 0.6183,
451
+ "step": 150
452
+ },
453
+ {
454
+ "epoch": 20.0,
455
+ "eval_loss": 1.1455965042114258,
456
+ "eval_runtime": 17.073,
457
+ "eval_samples_per_second": 0.41,
458
+ "eval_steps_per_second": 0.059,
459
+ "step": 150
460
+ },
461
+ {
462
+ "epoch": 20.67,
463
+ "grad_norm": 2.6751275062561035,
464
+ "learning_rate": 5.571428571428572e-05,
465
+ "loss": 0.5464,
466
+ "step": 155
467
+ },
468
+ {
469
+ "epoch": 20.67,
470
+ "eval_loss": 1.131102204322815,
471
+ "eval_runtime": 17.0578,
472
+ "eval_samples_per_second": 0.41,
473
+ "eval_steps_per_second": 0.059,
474
+ "step": 155
475
+ },
476
+ {
477
+ "epoch": 21.33,
478
+ "grad_norm": 2.3700051307678223,
479
+ "learning_rate": 5.428571428571428e-05,
480
+ "loss": 0.551,
481
+ "step": 160
482
+ },
483
+ {
484
+ "epoch": 21.33,
485
+ "eval_loss": 1.127384066581726,
486
+ "eval_runtime": 17.0546,
487
+ "eval_samples_per_second": 0.41,
488
+ "eval_steps_per_second": 0.059,
489
+ "step": 160
490
+ },
491
+ {
492
+ "epoch": 22.0,
493
+ "grad_norm": 3.3827567100524902,
494
+ "learning_rate": 5.285714285714286e-05,
495
+ "loss": 0.5179,
496
+ "step": 165
497
+ },
498
+ {
499
+ "epoch": 22.0,
500
+ "eval_loss": 1.111584186553955,
501
+ "eval_runtime": 17.0812,
502
+ "eval_samples_per_second": 0.41,
503
+ "eval_steps_per_second": 0.059,
504
+ "step": 165
505
+ },
506
+ {
507
+ "epoch": 22.67,
508
+ "grad_norm": 3.55790114402771,
509
+ "learning_rate": 5.142857142857143e-05,
510
+ "loss": 0.4831,
511
+ "step": 170
512
+ },
513
+ {
514
+ "epoch": 22.67,
515
+ "eval_loss": 1.0948525667190552,
516
+ "eval_runtime": 17.0547,
517
+ "eval_samples_per_second": 0.41,
518
+ "eval_steps_per_second": 0.059,
519
+ "step": 170
520
+ },
521
+ {
522
+ "epoch": 23.33,
523
+ "grad_norm": 3.0782699584960938,
524
+ "learning_rate": 5e-05,
525
+ "loss": 0.4587,
526
+ "step": 175
527
+ },
528
+ {
529
+ "epoch": 23.33,
530
+ "eval_loss": 1.0906586647033691,
531
+ "eval_runtime": 17.0666,
532
+ "eval_samples_per_second": 0.41,
533
+ "eval_steps_per_second": 0.059,
534
+ "step": 175
535
+ },
536
+ {
537
+ "epoch": 24.0,
538
+ "grad_norm": 3.3993167877197266,
539
+ "learning_rate": 4.8571428571428576e-05,
540
+ "loss": 0.4203,
541
+ "step": 180
542
+ },
543
+ {
544
+ "epoch": 24.0,
545
+ "eval_loss": 1.0688152313232422,
546
+ "eval_runtime": 17.0721,
547
+ "eval_samples_per_second": 0.41,
548
+ "eval_steps_per_second": 0.059,
549
+ "step": 180
550
+ },
551
+ {
552
+ "epoch": 24.67,
553
+ "grad_norm": 3.319303035736084,
554
+ "learning_rate": 4.714285714285714e-05,
555
+ "loss": 0.3975,
556
+ "step": 185
557
+ },
558
+ {
559
+ "epoch": 24.67,
560
+ "eval_loss": 1.0746583938598633,
561
+ "eval_runtime": 17.0709,
562
+ "eval_samples_per_second": 0.41,
563
+ "eval_steps_per_second": 0.059,
564
+ "step": 185
565
+ },
566
+ {
567
+ "epoch": 25.33,
568
+ "grad_norm": 2.4532127380371094,
569
+ "learning_rate": 4.5714285714285716e-05,
570
+ "loss": 0.3832,
571
+ "step": 190
572
+ },
573
+ {
574
+ "epoch": 25.33,
575
+ "eval_loss": 1.0772522687911987,
576
+ "eval_runtime": 17.0619,
577
+ "eval_samples_per_second": 0.41,
578
+ "eval_steps_per_second": 0.059,
579
+ "step": 190
580
+ },
581
+ {
582
+ "epoch": 26.0,
583
+ "grad_norm": 3.956822156906128,
584
+ "learning_rate": 4.428571428571428e-05,
585
+ "loss": 0.3725,
586
+ "step": 195
587
+ },
588
+ {
589
+ "epoch": 26.0,
590
+ "eval_loss": 1.0638784170150757,
591
+ "eval_runtime": 17.0807,
592
+ "eval_samples_per_second": 0.41,
593
+ "eval_steps_per_second": 0.059,
594
+ "step": 195
595
+ },
596
+ {
597
+ "epoch": 26.67,
598
+ "grad_norm": 2.76033353805542,
599
+ "learning_rate": 4.2857142857142856e-05,
600
+ "loss": 0.3473,
601
+ "step": 200
602
+ },
603
+ {
604
+ "epoch": 26.67,
605
+ "eval_loss": 1.04669988155365,
606
+ "eval_runtime": 17.0774,
607
+ "eval_samples_per_second": 0.41,
608
+ "eval_steps_per_second": 0.059,
609
+ "step": 200
610
+ },
611
+ {
612
+ "epoch": 27.33,
613
+ "grad_norm": 3.8683507442474365,
614
+ "learning_rate": 4.1428571428571437e-05,
615
+ "loss": 0.3243,
616
+ "step": 205
617
+ },
618
+ {
619
+ "epoch": 27.33,
620
+ "eval_loss": 1.0470303297042847,
621
+ "eval_runtime": 17.0718,
622
+ "eval_samples_per_second": 0.41,
623
+ "eval_steps_per_second": 0.059,
624
+ "step": 205
625
+ },
626
+ {
627
+ "epoch": 28.0,
628
+ "grad_norm": 4.535538196563721,
629
+ "learning_rate": 4e-05,
630
+ "loss": 0.3202,
631
+ "step": 210
632
+ },
633
+ {
634
+ "epoch": 28.0,
635
+ "eval_loss": 1.025539517402649,
636
+ "eval_runtime": 17.0604,
637
+ "eval_samples_per_second": 0.41,
638
+ "eval_steps_per_second": 0.059,
639
+ "step": 210
640
+ },
641
+ {
642
+ "epoch": 28.67,
643
+ "grad_norm": 2.6224355697631836,
644
+ "learning_rate": 3.857142857142858e-05,
645
+ "loss": 0.2958,
646
+ "step": 215
647
+ },
648
+ {
649
+ "epoch": 28.67,
650
+ "eval_loss": 1.0192126035690308,
651
+ "eval_runtime": 17.0657,
652
+ "eval_samples_per_second": 0.41,
653
+ "eval_steps_per_second": 0.059,
654
+ "step": 215
655
+ },
656
+ {
657
+ "epoch": 29.33,
658
+ "grad_norm": 2.5870041847229004,
659
+ "learning_rate": 3.7142857142857143e-05,
660
+ "loss": 0.2783,
661
+ "step": 220
662
+ },
663
+ {
664
+ "epoch": 29.33,
665
+ "eval_loss": 1.0211580991744995,
666
+ "eval_runtime": 17.0857,
667
+ "eval_samples_per_second": 0.41,
668
+ "eval_steps_per_second": 0.059,
669
+ "step": 220
670
+ },
671
+ {
672
+ "epoch": 30.0,
673
+ "grad_norm": 3.4565751552581787,
674
+ "learning_rate": 3.571428571428572e-05,
675
+ "loss": 0.2773,
676
+ "step": 225
677
+ },
678
+ {
679
+ "epoch": 30.0,
680
+ "eval_loss": 1.006419062614441,
681
+ "eval_runtime": 17.0807,
682
+ "eval_samples_per_second": 0.41,
683
+ "eval_steps_per_second": 0.059,
684
+ "step": 225
685
+ },
686
+ {
687
+ "epoch": 30.67,
688
+ "grad_norm": 2.4756500720977783,
689
+ "learning_rate": 3.428571428571429e-05,
690
+ "loss": 0.2482,
691
+ "step": 230
692
+ },
693
+ {
694
+ "epoch": 30.67,
695
+ "eval_loss": 1.0081219673156738,
696
+ "eval_runtime": 17.0576,
697
+ "eval_samples_per_second": 0.41,
698
+ "eval_steps_per_second": 0.059,
699
+ "step": 230
700
+ },
701
+ {
702
+ "epoch": 31.33,
703
+ "grad_norm": 2.38002610206604,
704
+ "learning_rate": 3.285714285714286e-05,
705
+ "loss": 0.2464,
706
+ "step": 235
707
+ },
708
+ {
709
+ "epoch": 31.33,
710
+ "eval_loss": 1.0151804685592651,
711
+ "eval_runtime": 17.0587,
712
+ "eval_samples_per_second": 0.41,
713
+ "eval_steps_per_second": 0.059,
714
+ "step": 235
715
+ },
716
+ {
717
+ "epoch": 32.0,
718
+ "grad_norm": 3.7081105709075928,
719
+ "learning_rate": 3.142857142857143e-05,
720
+ "loss": 0.2442,
721
+ "step": 240
722
+ },
723
+ {
724
+ "epoch": 32.0,
725
+ "eval_loss": 1.0032445192337036,
726
+ "eval_runtime": 17.1613,
727
+ "eval_samples_per_second": 0.408,
728
+ "eval_steps_per_second": 0.058,
729
+ "step": 240
730
+ },
731
+ {
732
+ "epoch": 32.67,
733
+ "grad_norm": 2.55924391746521,
734
+ "learning_rate": 3e-05,
735
+ "loss": 0.2193,
736
+ "step": 245
737
+ },
738
+ {
739
+ "epoch": 32.67,
740
+ "eval_loss": 0.9989615082740784,
741
+ "eval_runtime": 17.0447,
742
+ "eval_samples_per_second": 0.411,
743
+ "eval_steps_per_second": 0.059,
744
+ "step": 245
745
+ },
746
+ {
747
+ "epoch": 33.33,
748
+ "grad_norm": 1.9451407194137573,
749
+ "learning_rate": 2.857142857142857e-05,
750
+ "loss": 0.2101,
751
+ "step": 250
752
+ },
753
+ {
754
+ "epoch": 33.33,
755
+ "eval_loss": 1.0029457807540894,
756
+ "eval_runtime": 17.0816,
757
+ "eval_samples_per_second": 0.41,
758
+ "eval_steps_per_second": 0.059,
759
+ "step": 250
760
+ },
761
+ {
762
+ "epoch": 34.0,
763
+ "grad_norm": 2.713731527328491,
764
+ "learning_rate": 2.714285714285714e-05,
765
+ "loss": 0.2194,
766
+ "step": 255
767
+ },
768
+ {
769
+ "epoch": 34.0,
770
+ "eval_loss": 0.9959421753883362,
771
+ "eval_runtime": 17.0747,
772
+ "eval_samples_per_second": 0.41,
773
+ "eval_steps_per_second": 0.059,
774
+ "step": 255
775
+ },
776
+ {
777
+ "epoch": 34.67,
778
+ "grad_norm": 2.1633846759796143,
779
+ "learning_rate": 2.5714285714285714e-05,
780
+ "loss": 0.1958,
781
+ "step": 260
782
+ },
783
+ {
784
+ "epoch": 34.67,
785
+ "eval_loss": 0.9989770650863647,
786
+ "eval_runtime": 17.0821,
787
+ "eval_samples_per_second": 0.41,
788
+ "eval_steps_per_second": 0.059,
789
+ "step": 260
790
+ },
791
+ {
792
+ "epoch": 35.33,
793
+ "grad_norm": 3.9233529567718506,
794
+ "learning_rate": 2.4285714285714288e-05,
795
+ "loss": 0.1831,
796
+ "step": 265
797
+ },
798
+ {
799
+ "epoch": 35.33,
800
+ "eval_loss": 1.0072578191757202,
801
+ "eval_runtime": 17.0564,
802
+ "eval_samples_per_second": 0.41,
803
+ "eval_steps_per_second": 0.059,
804
+ "step": 265
805
+ },
806
+ {
807
+ "epoch": 36.0,
808
+ "grad_norm": 2.4143056869506836,
809
+ "learning_rate": 2.2857142857142858e-05,
810
+ "loss": 0.1753,
811
+ "step": 270
812
+ },
813
+ {
814
+ "epoch": 36.0,
815
+ "eval_loss": 0.9938892722129822,
816
+ "eval_runtime": 17.0668,
817
+ "eval_samples_per_second": 0.41,
818
+ "eval_steps_per_second": 0.059,
819
+ "step": 270
820
+ },
821
+ {
822
+ "epoch": 36.67,
823
+ "grad_norm": 2.706679582595825,
824
+ "learning_rate": 2.1428571428571428e-05,
825
+ "loss": 0.1698,
826
+ "step": 275
827
+ },
828
+ {
829
+ "epoch": 36.67,
830
+ "eval_loss": 0.9969200491905212,
831
+ "eval_runtime": 17.0643,
832
+ "eval_samples_per_second": 0.41,
833
+ "eval_steps_per_second": 0.059,
834
+ "step": 275
835
+ },
836
+ {
837
+ "epoch": 37.33,
838
+ "grad_norm": 1.872753620147705,
839
+ "learning_rate": 2e-05,
840
+ "loss": 0.16,
841
+ "step": 280
842
+ },
843
+ {
844
+ "epoch": 37.33,
845
+ "eval_loss": 0.9940390586853027,
846
+ "eval_runtime": 17.0728,
847
+ "eval_samples_per_second": 0.41,
848
+ "eval_steps_per_second": 0.059,
849
+ "step": 280
850
+ },
851
+ {
852
+ "epoch": 38.0,
853
+ "grad_norm": 2.7510581016540527,
854
+ "learning_rate": 1.8571428571428572e-05,
855
+ "loss": 0.1614,
856
+ "step": 285
857
+ },
858
+ {
859
+ "epoch": 38.0,
860
+ "eval_loss": 1.0066231489181519,
861
+ "eval_runtime": 17.072,
862
+ "eval_samples_per_second": 0.41,
863
+ "eval_steps_per_second": 0.059,
864
+ "step": 285
865
+ },
866
+ {
867
+ "epoch": 38.67,
868
+ "grad_norm": 1.8461092710494995,
869
+ "learning_rate": 1.7142857142857145e-05,
870
+ "loss": 0.1506,
871
+ "step": 290
872
+ },
873
+ {
874
+ "epoch": 38.67,
875
+ "eval_loss": 0.9927281737327576,
876
+ "eval_runtime": 17.0481,
877
+ "eval_samples_per_second": 0.411,
878
+ "eval_steps_per_second": 0.059,
879
+ "step": 290
880
+ },
881
+ {
882
+ "epoch": 39.33,
883
+ "grad_norm": 1.8425017595291138,
884
+ "learning_rate": 1.5714285714285715e-05,
885
+ "loss": 0.1419,
886
+ "step": 295
887
+ },
888
+ {
889
+ "epoch": 39.33,
890
+ "eval_loss": 1.0133570432662964,
891
+ "eval_runtime": 17.0642,
892
+ "eval_samples_per_second": 0.41,
893
+ "eval_steps_per_second": 0.059,
894
+ "step": 295
895
+ },
896
+ {
897
+ "epoch": 40.0,
898
+ "grad_norm": 2.0457987785339355,
899
+ "learning_rate": 1.4285714285714285e-05,
900
+ "loss": 0.1459,
901
+ "step": 300
902
+ },
903
+ {
904
+ "epoch": 40.0,
905
+ "eval_loss": 1.0127934217453003,
906
+ "eval_runtime": 17.0581,
907
+ "eval_samples_per_second": 0.41,
908
+ "eval_steps_per_second": 0.059,
909
+ "step": 300
910
+ },
911
+ {
912
+ "epoch": 40.67,
913
+ "grad_norm": 1.5630775690078735,
914
+ "learning_rate": 1.2857142857142857e-05,
915
+ "loss": 0.1225,
916
+ "step": 305
917
+ },
918
+ {
919
+ "epoch": 40.67,
920
+ "eval_loss": 1.0092624425888062,
921
+ "eval_runtime": 17.0483,
922
+ "eval_samples_per_second": 0.411,
923
+ "eval_steps_per_second": 0.059,
924
+ "step": 305
925
+ },
926
+ {
927
+ "epoch": 41.33,
928
+ "grad_norm": 1.37598717212677,
929
+ "learning_rate": 1.1428571428571429e-05,
930
+ "loss": 0.146,
931
+ "step": 310
932
+ },
933
+ {
934
+ "epoch": 41.33,
935
+ "eval_loss": 1.0083317756652832,
936
+ "eval_runtime": 17.0804,
937
+ "eval_samples_per_second": 0.41,
938
+ "eval_steps_per_second": 0.059,
939
+ "step": 310
940
+ },
941
+ {
942
+ "epoch": 42.0,
943
+ "grad_norm": 1.8867217302322388,
944
+ "learning_rate": 1e-05,
945
+ "loss": 0.13,
946
+ "step": 315
947
+ },
948
+ {
949
+ "epoch": 42.0,
950
+ "eval_loss": 1.0165104866027832,
951
+ "eval_runtime": 17.0621,
952
+ "eval_samples_per_second": 0.41,
953
+ "eval_steps_per_second": 0.059,
954
+ "step": 315
955
+ },
956
+ {
957
+ "epoch": 42.67,
958
+ "grad_norm": 1.4643555879592896,
959
+ "learning_rate": 8.571428571428573e-06,
960
+ "loss": 0.131,
961
+ "step": 320
962
+ },
963
+ {
964
+ "epoch": 42.67,
965
+ "eval_loss": 1.0264887809753418,
966
+ "eval_runtime": 17.0554,
967
+ "eval_samples_per_second": 0.41,
968
+ "eval_steps_per_second": 0.059,
969
+ "step": 320
970
+ }
971
+ ],
972
+ "logging_steps": 5,
973
+ "max_steps": 350,
974
+ "num_input_tokens_seen": 0,
975
+ "num_train_epochs": 50,
976
+ "save_steps": 10,
977
+ "total_flos": 2.1266150580320993e+18,
978
+ "train_batch_size": 2,
979
+ "trial_name": null,
980
+ "trial_params": null
981
+ }
checkpoint-320/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ed0ba6fb9a88dad56f61d9306f17b4e66e8767d898772faa97871a1388e82cf
3
+ size 4920
checkpoint-330/README.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - quant_method: bitsandbytes
9
+ - _load_in_8bit: False
10
+ - _load_in_4bit: False
11
+ - llm_int8_threshold: 6.0
12
+ - llm_int8_skip_modules: None
13
+ - llm_int8_enable_fp32_cpu_offload: False
14
+ - llm_int8_has_fp16_weight: False
15
+ - bnb_4bit_quant_type: fp4
16
+ - bnb_4bit_use_double_quant: False
17
+ - bnb_4bit_compute_dtype: float32
18
+ - load_in_4bit: False
19
+ - load_in_8bit: False
20
+ ### Framework versions
21
+
22
+
23
+ - PEFT 0.5.0
checkpoint-330/adapter_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-70b-hf",
4
+ "bias": "none",
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 32,
11
+ "lora_dropout": 0.05,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 8,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "q_proj",
18
+ "v_proj"
19
+ ],
20
+ "task_type": "CAUSAL_LM"
21
+ }
checkpoint-330/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5505e257fa18af21b6af22ca968df21297c4fb92614c82743718e81cf858cf9c
3
+ size 65578776
checkpoint-330/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8098efa09a64b1fcc8326dd62e171a71ab1c0f95e20015654d914ad8fbfb5880
3
+ size 131345914
checkpoint-330/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42b24e02a21b68f4542793debee790ccdfacee8b8c9cad8a58a3c5770a53b720
3
+ size 14244
checkpoint-330/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa7302be973ca42b6ea8fae42686b6439278ad5d7647a75f6503a767b9e0f09c
3
+ size 1064
checkpoint-330/trainer_state.json ADDED
@@ -0,0 +1,1011 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.9927281737327576,
3
+ "best_model_checkpoint": "/scratch/kwamea/llama-output/checkpoint-290",
4
+ "epoch": 44.0,
5
+ "eval_steps": 5,
6
+ "global_step": 330,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.67,
13
+ "grad_norm": 0.243179589509964,
14
+ "learning_rate": 9.857142857142858e-05,
15
+ "loss": 1.9956,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.67,
20
+ "eval_loss": 1.9701930284500122,
21
+ "eval_runtime": 17.115,
22
+ "eval_samples_per_second": 0.409,
23
+ "eval_steps_per_second": 0.058,
24
+ "step": 5
25
+ },
26
+ {
27
+ "epoch": 1.33,
28
+ "grad_norm": 0.34590908885002136,
29
+ "learning_rate": 9.714285714285715e-05,
30
+ "loss": 1.9758,
31
+ "step": 10
32
+ },
33
+ {
34
+ "epoch": 1.33,
35
+ "eval_loss": 1.8941271305084229,
36
+ "eval_runtime": 17.0912,
37
+ "eval_samples_per_second": 0.41,
38
+ "eval_steps_per_second": 0.059,
39
+ "step": 10
40
+ },
41
+ {
42
+ "epoch": 2.0,
43
+ "grad_norm": 0.31595832109451294,
44
+ "learning_rate": 9.571428571428573e-05,
45
+ "loss": 1.849,
46
+ "step": 15
47
+ },
48
+ {
49
+ "epoch": 2.0,
50
+ "eval_loss": 1.8046789169311523,
51
+ "eval_runtime": 17.098,
52
+ "eval_samples_per_second": 0.409,
53
+ "eval_steps_per_second": 0.058,
54
+ "step": 15
55
+ },
56
+ {
57
+ "epoch": 2.67,
58
+ "grad_norm": 0.3428090512752533,
59
+ "learning_rate": 9.428571428571429e-05,
60
+ "loss": 1.789,
61
+ "step": 20
62
+ },
63
+ {
64
+ "epoch": 2.67,
65
+ "eval_loss": 1.7658358812332153,
66
+ "eval_runtime": 17.0734,
67
+ "eval_samples_per_second": 0.41,
68
+ "eval_steps_per_second": 0.059,
69
+ "step": 20
70
+ },
71
+ {
72
+ "epoch": 3.33,
73
+ "grad_norm": 0.3102028965950012,
74
+ "learning_rate": 9.285714285714286e-05,
75
+ "loss": 1.7789,
76
+ "step": 25
77
+ },
78
+ {
79
+ "epoch": 3.33,
80
+ "eval_loss": 1.7225048542022705,
81
+ "eval_runtime": 17.0972,
82
+ "eval_samples_per_second": 0.409,
83
+ "eval_steps_per_second": 0.058,
84
+ "step": 25
85
+ },
86
+ {
87
+ "epoch": 4.0,
88
+ "grad_norm": 0.38602885603904724,
89
+ "learning_rate": 9.142857142857143e-05,
90
+ "loss": 1.7003,
91
+ "step": 30
92
+ },
93
+ {
94
+ "epoch": 4.0,
95
+ "eval_loss": 1.6749440431594849,
96
+ "eval_runtime": 17.1034,
97
+ "eval_samples_per_second": 0.409,
98
+ "eval_steps_per_second": 0.058,
99
+ "step": 30
100
+ },
101
+ {
102
+ "epoch": 4.67,
103
+ "grad_norm": 0.37120407819747925,
104
+ "learning_rate": 9e-05,
105
+ "loss": 1.6424,
106
+ "step": 35
107
+ },
108
+ {
109
+ "epoch": 4.67,
110
+ "eval_loss": 1.6231099367141724,
111
+ "eval_runtime": 17.1067,
112
+ "eval_samples_per_second": 0.409,
113
+ "eval_steps_per_second": 0.058,
114
+ "step": 35
115
+ },
116
+ {
117
+ "epoch": 5.33,
118
+ "grad_norm": 0.4633428454399109,
119
+ "learning_rate": 8.857142857142857e-05,
120
+ "loss": 1.6023,
121
+ "step": 40
122
+ },
123
+ {
124
+ "epoch": 5.33,
125
+ "eval_loss": 1.5727053880691528,
126
+ "eval_runtime": 17.1002,
127
+ "eval_samples_per_second": 0.409,
128
+ "eval_steps_per_second": 0.058,
129
+ "step": 40
130
+ },
131
+ {
132
+ "epoch": 6.0,
133
+ "grad_norm": 0.5034663081169128,
134
+ "learning_rate": 8.714285714285715e-05,
135
+ "loss": 1.5322,
136
+ "step": 45
137
+ },
138
+ {
139
+ "epoch": 6.0,
140
+ "eval_loss": 1.5312587022781372,
141
+ "eval_runtime": 17.1159,
142
+ "eval_samples_per_second": 0.409,
143
+ "eval_steps_per_second": 0.058,
144
+ "step": 45
145
+ },
146
+ {
147
+ "epoch": 6.67,
148
+ "grad_norm": 0.5549929141998291,
149
+ "learning_rate": 8.571428571428571e-05,
150
+ "loss": 1.4788,
151
+ "step": 50
152
+ },
153
+ {
154
+ "epoch": 6.67,
155
+ "eval_loss": 1.492464303970337,
156
+ "eval_runtime": 17.0823,
157
+ "eval_samples_per_second": 0.41,
158
+ "eval_steps_per_second": 0.059,
159
+ "step": 50
160
+ },
161
+ {
162
+ "epoch": 7.33,
163
+ "grad_norm": 0.49194690585136414,
164
+ "learning_rate": 8.428571428571429e-05,
165
+ "loss": 1.4632,
166
+ "step": 55
167
+ },
168
+ {
169
+ "epoch": 7.33,
170
+ "eval_loss": 1.4622489213943481,
171
+ "eval_runtime": 17.1022,
172
+ "eval_samples_per_second": 0.409,
173
+ "eval_steps_per_second": 0.058,
174
+ "step": 55
175
+ },
176
+ {
177
+ "epoch": 8.0,
178
+ "grad_norm": 0.5866131782531738,
179
+ "learning_rate": 8.285714285714287e-05,
180
+ "loss": 1.3951,
181
+ "step": 60
182
+ },
183
+ {
184
+ "epoch": 8.0,
185
+ "eval_loss": 1.435951828956604,
186
+ "eval_runtime": 17.1087,
187
+ "eval_samples_per_second": 0.409,
188
+ "eval_steps_per_second": 0.058,
189
+ "step": 60
190
+ },
191
+ {
192
+ "epoch": 8.67,
193
+ "grad_norm": 0.6252542734146118,
194
+ "learning_rate": 8.142857142857143e-05,
195
+ "loss": 1.3796,
196
+ "step": 65
197
+ },
198
+ {
199
+ "epoch": 8.67,
200
+ "eval_loss": 1.413227915763855,
201
+ "eval_runtime": 17.0914,
202
+ "eval_samples_per_second": 0.41,
203
+ "eval_steps_per_second": 0.059,
204
+ "step": 65
205
+ },
206
+ {
207
+ "epoch": 9.33,
208
+ "grad_norm": 0.6751863360404968,
209
+ "learning_rate": 8e-05,
210
+ "loss": 1.3257,
211
+ "step": 70
212
+ },
213
+ {
214
+ "epoch": 9.33,
215
+ "eval_loss": 1.395649790763855,
216
+ "eval_runtime": 17.0885,
217
+ "eval_samples_per_second": 0.41,
218
+ "eval_steps_per_second": 0.059,
219
+ "step": 70
220
+ },
221
+ {
222
+ "epoch": 10.0,
223
+ "grad_norm": 0.8878222703933716,
224
+ "learning_rate": 7.857142857142858e-05,
225
+ "loss": 1.2795,
226
+ "step": 75
227
+ },
228
+ {
229
+ "epoch": 10.0,
230
+ "eval_loss": 1.3699487447738647,
231
+ "eval_runtime": 17.1031,
232
+ "eval_samples_per_second": 0.409,
233
+ "eval_steps_per_second": 0.058,
234
+ "step": 75
235
+ },
236
+ {
237
+ "epoch": 10.67,
238
+ "grad_norm": 0.8470121026039124,
239
+ "learning_rate": 7.714285714285715e-05,
240
+ "loss": 1.2449,
241
+ "step": 80
242
+ },
243
+ {
244
+ "epoch": 10.67,
245
+ "eval_loss": 1.347831130027771,
246
+ "eval_runtime": 17.0985,
247
+ "eval_samples_per_second": 0.409,
248
+ "eval_steps_per_second": 0.058,
249
+ "step": 80
250
+ },
251
+ {
252
+ "epoch": 11.33,
253
+ "grad_norm": 1.0655425786972046,
254
+ "learning_rate": 7.571428571428571e-05,
255
+ "loss": 1.1983,
256
+ "step": 85
257
+ },
258
+ {
259
+ "epoch": 11.33,
260
+ "eval_loss": 1.3311971426010132,
261
+ "eval_runtime": 17.0784,
262
+ "eval_samples_per_second": 0.41,
263
+ "eval_steps_per_second": 0.059,
264
+ "step": 85
265
+ },
266
+ {
267
+ "epoch": 12.0,
268
+ "grad_norm": 1.2651888132095337,
269
+ "learning_rate": 7.428571428571429e-05,
270
+ "loss": 1.1467,
271
+ "step": 90
272
+ },
273
+ {
274
+ "epoch": 12.0,
275
+ "eval_loss": 1.3095277547836304,
276
+ "eval_runtime": 17.0903,
277
+ "eval_samples_per_second": 0.41,
278
+ "eval_steps_per_second": 0.059,
279
+ "step": 90
280
+ },
281
+ {
282
+ "epoch": 12.67,
283
+ "grad_norm": 1.248926043510437,
284
+ "learning_rate": 7.285714285714286e-05,
285
+ "loss": 1.0922,
286
+ "step": 95
287
+ },
288
+ {
289
+ "epoch": 12.67,
290
+ "eval_loss": 1.2942878007888794,
291
+ "eval_runtime": 17.0947,
292
+ "eval_samples_per_second": 0.409,
293
+ "eval_steps_per_second": 0.058,
294
+ "step": 95
295
+ },
296
+ {
297
+ "epoch": 13.33,
298
+ "grad_norm": 1.896952509880066,
299
+ "learning_rate": 7.142857142857143e-05,
300
+ "loss": 1.0403,
301
+ "step": 100
302
+ },
303
+ {
304
+ "epoch": 13.33,
305
+ "eval_loss": 1.2803159952163696,
306
+ "eval_runtime": 17.0819,
307
+ "eval_samples_per_second": 0.41,
308
+ "eval_steps_per_second": 0.059,
309
+ "step": 100
310
+ },
311
+ {
312
+ "epoch": 14.0,
313
+ "grad_norm": 1.862244725227356,
314
+ "learning_rate": 7e-05,
315
+ "loss": 1.0049,
316
+ "step": 105
317
+ },
318
+ {
319
+ "epoch": 14.0,
320
+ "eval_loss": 1.2643567323684692,
321
+ "eval_runtime": 17.0849,
322
+ "eval_samples_per_second": 0.41,
323
+ "eval_steps_per_second": 0.059,
324
+ "step": 105
325
+ },
326
+ {
327
+ "epoch": 14.67,
328
+ "grad_norm": 1.7487821578979492,
329
+ "learning_rate": 6.857142857142858e-05,
330
+ "loss": 0.9262,
331
+ "step": 110
332
+ },
333
+ {
334
+ "epoch": 14.67,
335
+ "eval_loss": 1.2471646070480347,
336
+ "eval_runtime": 17.1278,
337
+ "eval_samples_per_second": 0.409,
338
+ "eval_steps_per_second": 0.058,
339
+ "step": 110
340
+ },
341
+ {
342
+ "epoch": 15.33,
343
+ "grad_norm": 1.838605284690857,
344
+ "learning_rate": 6.714285714285714e-05,
345
+ "loss": 0.8965,
346
+ "step": 115
347
+ },
348
+ {
349
+ "epoch": 15.33,
350
+ "eval_loss": 1.2377034425735474,
351
+ "eval_runtime": 17.0731,
352
+ "eval_samples_per_second": 0.41,
353
+ "eval_steps_per_second": 0.059,
354
+ "step": 115
355
+ },
356
+ {
357
+ "epoch": 16.0,
358
+ "grad_norm": 3.117398977279663,
359
+ "learning_rate": 6.571428571428571e-05,
360
+ "loss": 0.8581,
361
+ "step": 120
362
+ },
363
+ {
364
+ "epoch": 16.0,
365
+ "eval_loss": 1.2083133459091187,
366
+ "eval_runtime": 17.1304,
367
+ "eval_samples_per_second": 0.409,
368
+ "eval_steps_per_second": 0.058,
369
+ "step": 120
370
+ },
371
+ {
372
+ "epoch": 16.67,
373
+ "grad_norm": 2.5655250549316406,
374
+ "learning_rate": 6.428571428571429e-05,
375
+ "loss": 0.7929,
376
+ "step": 125
377
+ },
378
+ {
379
+ "epoch": 16.67,
380
+ "eval_loss": 1.1945828199386597,
381
+ "eval_runtime": 17.104,
382
+ "eval_samples_per_second": 0.409,
383
+ "eval_steps_per_second": 0.058,
384
+ "step": 125
385
+ },
386
+ {
387
+ "epoch": 17.33,
388
+ "grad_norm": 2.168546199798584,
389
+ "learning_rate": 6.285714285714286e-05,
390
+ "loss": 0.7543,
391
+ "step": 130
392
+ },
393
+ {
394
+ "epoch": 17.33,
395
+ "eval_loss": 1.1876276731491089,
396
+ "eval_runtime": 17.1046,
397
+ "eval_samples_per_second": 0.409,
398
+ "eval_steps_per_second": 0.058,
399
+ "step": 130
400
+ },
401
+ {
402
+ "epoch": 18.0,
403
+ "grad_norm": 2.5984208583831787,
404
+ "learning_rate": 6.142857142857143e-05,
405
+ "loss": 0.716,
406
+ "step": 135
407
+ },
408
+ {
409
+ "epoch": 18.0,
410
+ "eval_loss": 1.1714750528335571,
411
+ "eval_runtime": 17.0807,
412
+ "eval_samples_per_second": 0.41,
413
+ "eval_steps_per_second": 0.059,
414
+ "step": 135
415
+ },
416
+ {
417
+ "epoch": 18.67,
418
+ "grad_norm": 3.479024887084961,
419
+ "learning_rate": 6e-05,
420
+ "loss": 0.6681,
421
+ "step": 140
422
+ },
423
+ {
424
+ "epoch": 18.67,
425
+ "eval_loss": 1.169895052909851,
426
+ "eval_runtime": 17.0681,
427
+ "eval_samples_per_second": 0.41,
428
+ "eval_steps_per_second": 0.059,
429
+ "step": 140
430
+ },
431
+ {
432
+ "epoch": 19.33,
433
+ "grad_norm": 2.563386917114258,
434
+ "learning_rate": 5.8571428571428575e-05,
435
+ "loss": 0.6306,
436
+ "step": 145
437
+ },
438
+ {
439
+ "epoch": 19.33,
440
+ "eval_loss": 1.1741083860397339,
441
+ "eval_runtime": 17.0568,
442
+ "eval_samples_per_second": 0.41,
443
+ "eval_steps_per_second": 0.059,
444
+ "step": 145
445
+ },
446
+ {
447
+ "epoch": 20.0,
448
+ "grad_norm": 2.96592116355896,
449
+ "learning_rate": 5.714285714285714e-05,
450
+ "loss": 0.6183,
451
+ "step": 150
452
+ },
453
+ {
454
+ "epoch": 20.0,
455
+ "eval_loss": 1.1455965042114258,
456
+ "eval_runtime": 17.073,
457
+ "eval_samples_per_second": 0.41,
458
+ "eval_steps_per_second": 0.059,
459
+ "step": 150
460
+ },
461
+ {
462
+ "epoch": 20.67,
463
+ "grad_norm": 2.6751275062561035,
464
+ "learning_rate": 5.571428571428572e-05,
465
+ "loss": 0.5464,
466
+ "step": 155
467
+ },
468
+ {
469
+ "epoch": 20.67,
470
+ "eval_loss": 1.131102204322815,
471
+ "eval_runtime": 17.0578,
472
+ "eval_samples_per_second": 0.41,
473
+ "eval_steps_per_second": 0.059,
474
+ "step": 155
475
+ },
476
+ {
477
+ "epoch": 21.33,
478
+ "grad_norm": 2.3700051307678223,
479
+ "learning_rate": 5.428571428571428e-05,
480
+ "loss": 0.551,
481
+ "step": 160
482
+ },
483
+ {
484
+ "epoch": 21.33,
485
+ "eval_loss": 1.127384066581726,
486
+ "eval_runtime": 17.0546,
487
+ "eval_samples_per_second": 0.41,
488
+ "eval_steps_per_second": 0.059,
489
+ "step": 160
490
+ },
491
+ {
492
+ "epoch": 22.0,
493
+ "grad_norm": 3.3827567100524902,
494
+ "learning_rate": 5.285714285714286e-05,
495
+ "loss": 0.5179,
496
+ "step": 165
497
+ },
498
+ {
499
+ "epoch": 22.0,
500
+ "eval_loss": 1.111584186553955,
501
+ "eval_runtime": 17.0812,
502
+ "eval_samples_per_second": 0.41,
503
+ "eval_steps_per_second": 0.059,
504
+ "step": 165
505
+ },
506
+ {
507
+ "epoch": 22.67,
508
+ "grad_norm": 3.55790114402771,
509
+ "learning_rate": 5.142857142857143e-05,
510
+ "loss": 0.4831,
511
+ "step": 170
512
+ },
513
+ {
514
+ "epoch": 22.67,
515
+ "eval_loss": 1.0948525667190552,
516
+ "eval_runtime": 17.0547,
517
+ "eval_samples_per_second": 0.41,
518
+ "eval_steps_per_second": 0.059,
519
+ "step": 170
520
+ },
521
+ {
522
+ "epoch": 23.33,
523
+ "grad_norm": 3.0782699584960938,
524
+ "learning_rate": 5e-05,
525
+ "loss": 0.4587,
526
+ "step": 175
527
+ },
528
+ {
529
+ "epoch": 23.33,
530
+ "eval_loss": 1.0906586647033691,
531
+ "eval_runtime": 17.0666,
532
+ "eval_samples_per_second": 0.41,
533
+ "eval_steps_per_second": 0.059,
534
+ "step": 175
535
+ },
536
+ {
537
+ "epoch": 24.0,
538
+ "grad_norm": 3.3993167877197266,
539
+ "learning_rate": 4.8571428571428576e-05,
540
+ "loss": 0.4203,
541
+ "step": 180
542
+ },
543
+ {
544
+ "epoch": 24.0,
545
+ "eval_loss": 1.0688152313232422,
546
+ "eval_runtime": 17.0721,
547
+ "eval_samples_per_second": 0.41,
548
+ "eval_steps_per_second": 0.059,
549
+ "step": 180
550
+ },
551
+ {
552
+ "epoch": 24.67,
553
+ "grad_norm": 3.319303035736084,
554
+ "learning_rate": 4.714285714285714e-05,
555
+ "loss": 0.3975,
556
+ "step": 185
557
+ },
558
+ {
559
+ "epoch": 24.67,
560
+ "eval_loss": 1.0746583938598633,
561
+ "eval_runtime": 17.0709,
562
+ "eval_samples_per_second": 0.41,
563
+ "eval_steps_per_second": 0.059,
564
+ "step": 185
565
+ },
566
+ {
567
+ "epoch": 25.33,
568
+ "grad_norm": 2.4532127380371094,
569
+ "learning_rate": 4.5714285714285716e-05,
570
+ "loss": 0.3832,
571
+ "step": 190
572
+ },
573
+ {
574
+ "epoch": 25.33,
575
+ "eval_loss": 1.0772522687911987,
576
+ "eval_runtime": 17.0619,
577
+ "eval_samples_per_second": 0.41,
578
+ "eval_steps_per_second": 0.059,
579
+ "step": 190
580
+ },
581
+ {
582
+ "epoch": 26.0,
583
+ "grad_norm": 3.956822156906128,
584
+ "learning_rate": 4.428571428571428e-05,
585
+ "loss": 0.3725,
586
+ "step": 195
587
+ },
588
+ {
589
+ "epoch": 26.0,
590
+ "eval_loss": 1.0638784170150757,
591
+ "eval_runtime": 17.0807,
592
+ "eval_samples_per_second": 0.41,
593
+ "eval_steps_per_second": 0.059,
594
+ "step": 195
595
+ },
596
+ {
597
+ "epoch": 26.67,
598
+ "grad_norm": 2.76033353805542,
599
+ "learning_rate": 4.2857142857142856e-05,
600
+ "loss": 0.3473,
601
+ "step": 200
602
+ },
603
+ {
604
+ "epoch": 26.67,
605
+ "eval_loss": 1.04669988155365,
606
+ "eval_runtime": 17.0774,
607
+ "eval_samples_per_second": 0.41,
608
+ "eval_steps_per_second": 0.059,
609
+ "step": 200
610
+ },
611
+ {
612
+ "epoch": 27.33,
613
+ "grad_norm": 3.8683507442474365,
614
+ "learning_rate": 4.1428571428571437e-05,
615
+ "loss": 0.3243,
616
+ "step": 205
617
+ },
618
+ {
619
+ "epoch": 27.33,
620
+ "eval_loss": 1.0470303297042847,
621
+ "eval_runtime": 17.0718,
622
+ "eval_samples_per_second": 0.41,
623
+ "eval_steps_per_second": 0.059,
624
+ "step": 205
625
+ },
626
+ {
627
+ "epoch": 28.0,
628
+ "grad_norm": 4.535538196563721,
629
+ "learning_rate": 4e-05,
630
+ "loss": 0.3202,
631
+ "step": 210
632
+ },
633
+ {
634
+ "epoch": 28.0,
635
+ "eval_loss": 1.025539517402649,
636
+ "eval_runtime": 17.0604,
637
+ "eval_samples_per_second": 0.41,
638
+ "eval_steps_per_second": 0.059,
639
+ "step": 210
640
+ },
641
+ {
642
+ "epoch": 28.67,
643
+ "grad_norm": 2.6224355697631836,
644
+ "learning_rate": 3.857142857142858e-05,
645
+ "loss": 0.2958,
646
+ "step": 215
647
+ },
648
+ {
649
+ "epoch": 28.67,
650
+ "eval_loss": 1.0192126035690308,
651
+ "eval_runtime": 17.0657,
652
+ "eval_samples_per_second": 0.41,
653
+ "eval_steps_per_second": 0.059,
654
+ "step": 215
655
+ },
656
+ {
657
+ "epoch": 29.33,
658
+ "grad_norm": 2.5870041847229004,
659
+ "learning_rate": 3.7142857142857143e-05,
660
+ "loss": 0.2783,
661
+ "step": 220
662
+ },
663
+ {
664
+ "epoch": 29.33,
665
+ "eval_loss": 1.0211580991744995,
666
+ "eval_runtime": 17.0857,
667
+ "eval_samples_per_second": 0.41,
668
+ "eval_steps_per_second": 0.059,
669
+ "step": 220
670
+ },
671
+ {
672
+ "epoch": 30.0,
673
+ "grad_norm": 3.4565751552581787,
674
+ "learning_rate": 3.571428571428572e-05,
675
+ "loss": 0.2773,
676
+ "step": 225
677
+ },
678
+ {
679
+ "epoch": 30.0,
680
+ "eval_loss": 1.006419062614441,
681
+ "eval_runtime": 17.0807,
682
+ "eval_samples_per_second": 0.41,
683
+ "eval_steps_per_second": 0.059,
684
+ "step": 225
685
+ },
686
+ {
687
+ "epoch": 30.67,
688
+ "grad_norm": 2.4756500720977783,
689
+ "learning_rate": 3.428571428571429e-05,
690
+ "loss": 0.2482,
691
+ "step": 230
692
+ },
693
+ {
694
+ "epoch": 30.67,
695
+ "eval_loss": 1.0081219673156738,
696
+ "eval_runtime": 17.0576,
697
+ "eval_samples_per_second": 0.41,
698
+ "eval_steps_per_second": 0.059,
699
+ "step": 230
700
+ },
701
+ {
702
+ "epoch": 31.33,
703
+ "grad_norm": 2.38002610206604,
704
+ "learning_rate": 3.285714285714286e-05,
705
+ "loss": 0.2464,
706
+ "step": 235
707
+ },
708
+ {
709
+ "epoch": 31.33,
710
+ "eval_loss": 1.0151804685592651,
711
+ "eval_runtime": 17.0587,
712
+ "eval_samples_per_second": 0.41,
713
+ "eval_steps_per_second": 0.059,
714
+ "step": 235
715
+ },
716
+ {
717
+ "epoch": 32.0,
718
+ "grad_norm": 3.7081105709075928,
719
+ "learning_rate": 3.142857142857143e-05,
720
+ "loss": 0.2442,
721
+ "step": 240
722
+ },
723
+ {
724
+ "epoch": 32.0,
725
+ "eval_loss": 1.0032445192337036,
726
+ "eval_runtime": 17.1613,
727
+ "eval_samples_per_second": 0.408,
728
+ "eval_steps_per_second": 0.058,
729
+ "step": 240
730
+ },
731
+ {
732
+ "epoch": 32.67,
733
+ "grad_norm": 2.55924391746521,
734
+ "learning_rate": 3e-05,
735
+ "loss": 0.2193,
736
+ "step": 245
737
+ },
738
+ {
739
+ "epoch": 32.67,
740
+ "eval_loss": 0.9989615082740784,
741
+ "eval_runtime": 17.0447,
742
+ "eval_samples_per_second": 0.411,
743
+ "eval_steps_per_second": 0.059,
744
+ "step": 245
745
+ },
746
+ {
747
+ "epoch": 33.33,
748
+ "grad_norm": 1.9451407194137573,
749
+ "learning_rate": 2.857142857142857e-05,
750
+ "loss": 0.2101,
751
+ "step": 250
752
+ },
753
+ {
754
+ "epoch": 33.33,
755
+ "eval_loss": 1.0029457807540894,
756
+ "eval_runtime": 17.0816,
757
+ "eval_samples_per_second": 0.41,
758
+ "eval_steps_per_second": 0.059,
759
+ "step": 250
760
+ },
761
+ {
762
+ "epoch": 34.0,
763
+ "grad_norm": 2.713731527328491,
764
+ "learning_rate": 2.714285714285714e-05,
765
+ "loss": 0.2194,
766
+ "step": 255
767
+ },
768
+ {
769
+ "epoch": 34.0,
770
+ "eval_loss": 0.9959421753883362,
771
+ "eval_runtime": 17.0747,
772
+ "eval_samples_per_second": 0.41,
773
+ "eval_steps_per_second": 0.059,
774
+ "step": 255
775
+ },
776
+ {
777
+ "epoch": 34.67,
778
+ "grad_norm": 2.1633846759796143,
779
+ "learning_rate": 2.5714285714285714e-05,
780
+ "loss": 0.1958,
781
+ "step": 260
782
+ },
783
+ {
784
+ "epoch": 34.67,
785
+ "eval_loss": 0.9989770650863647,
786
+ "eval_runtime": 17.0821,
787
+ "eval_samples_per_second": 0.41,
788
+ "eval_steps_per_second": 0.059,
789
+ "step": 260
790
+ },
791
+ {
792
+ "epoch": 35.33,
793
+ "grad_norm": 3.9233529567718506,
794
+ "learning_rate": 2.4285714285714288e-05,
795
+ "loss": 0.1831,
796
+ "step": 265
797
+ },
798
+ {
799
+ "epoch": 35.33,
800
+ "eval_loss": 1.0072578191757202,
801
+ "eval_runtime": 17.0564,
802
+ "eval_samples_per_second": 0.41,
803
+ "eval_steps_per_second": 0.059,
804
+ "step": 265
805
+ },
806
+ {
807
+ "epoch": 36.0,
808
+ "grad_norm": 2.4143056869506836,
809
+ "learning_rate": 2.2857142857142858e-05,
810
+ "loss": 0.1753,
811
+ "step": 270
812
+ },
813
+ {
814
+ "epoch": 36.0,
815
+ "eval_loss": 0.9938892722129822,
816
+ "eval_runtime": 17.0668,
817
+ "eval_samples_per_second": 0.41,
818
+ "eval_steps_per_second": 0.059,
819
+ "step": 270
820
+ },
821
+ {
822
+ "epoch": 36.67,
823
+ "grad_norm": 2.706679582595825,
824
+ "learning_rate": 2.1428571428571428e-05,
825
+ "loss": 0.1698,
826
+ "step": 275
827
+ },
828
+ {
829
+ "epoch": 36.67,
830
+ "eval_loss": 0.9969200491905212,
831
+ "eval_runtime": 17.0643,
832
+ "eval_samples_per_second": 0.41,
833
+ "eval_steps_per_second": 0.059,
834
+ "step": 275
835
+ },
836
+ {
837
+ "epoch": 37.33,
838
+ "grad_norm": 1.872753620147705,
839
+ "learning_rate": 2e-05,
840
+ "loss": 0.16,
841
+ "step": 280
842
+ },
843
+ {
844
+ "epoch": 37.33,
845
+ "eval_loss": 0.9940390586853027,
846
+ "eval_runtime": 17.0728,
847
+ "eval_samples_per_second": 0.41,
848
+ "eval_steps_per_second": 0.059,
849
+ "step": 280
850
+ },
851
+ {
852
+ "epoch": 38.0,
853
+ "grad_norm": 2.7510581016540527,
854
+ "learning_rate": 1.8571428571428572e-05,
855
+ "loss": 0.1614,
856
+ "step": 285
857
+ },
858
+ {
859
+ "epoch": 38.0,
860
+ "eval_loss": 1.0066231489181519,
861
+ "eval_runtime": 17.072,
862
+ "eval_samples_per_second": 0.41,
863
+ "eval_steps_per_second": 0.059,
864
+ "step": 285
865
+ },
866
+ {
867
+ "epoch": 38.67,
868
+ "grad_norm": 1.8461092710494995,
869
+ "learning_rate": 1.7142857142857145e-05,
870
+ "loss": 0.1506,
871
+ "step": 290
872
+ },
873
+ {
874
+ "epoch": 38.67,
875
+ "eval_loss": 0.9927281737327576,
876
+ "eval_runtime": 17.0481,
877
+ "eval_samples_per_second": 0.411,
878
+ "eval_steps_per_second": 0.059,
879
+ "step": 290
880
+ },
881
+ {
882
+ "epoch": 39.33,
883
+ "grad_norm": 1.8425017595291138,
884
+ "learning_rate": 1.5714285714285715e-05,
885
+ "loss": 0.1419,
886
+ "step": 295
887
+ },
888
+ {
889
+ "epoch": 39.33,
890
+ "eval_loss": 1.0133570432662964,
891
+ "eval_runtime": 17.0642,
892
+ "eval_samples_per_second": 0.41,
893
+ "eval_steps_per_second": 0.059,
894
+ "step": 295
895
+ },
896
+ {
897
+ "epoch": 40.0,
898
+ "grad_norm": 2.0457987785339355,
899
+ "learning_rate": 1.4285714285714285e-05,
900
+ "loss": 0.1459,
901
+ "step": 300
902
+ },
903
+ {
904
+ "epoch": 40.0,
905
+ "eval_loss": 1.0127934217453003,
906
+ "eval_runtime": 17.0581,
907
+ "eval_samples_per_second": 0.41,
908
+ "eval_steps_per_second": 0.059,
909
+ "step": 300
910
+ },
911
+ {
912
+ "epoch": 40.67,
913
+ "grad_norm": 1.5630775690078735,
914
+ "learning_rate": 1.2857142857142857e-05,
915
+ "loss": 0.1225,
916
+ "step": 305
917
+ },
918
+ {
919
+ "epoch": 40.67,
920
+ "eval_loss": 1.0092624425888062,
921
+ "eval_runtime": 17.0483,
922
+ "eval_samples_per_second": 0.411,
923
+ "eval_steps_per_second": 0.059,
924
+ "step": 305
925
+ },
926
+ {
927
+ "epoch": 41.33,
928
+ "grad_norm": 1.37598717212677,
929
+ "learning_rate": 1.1428571428571429e-05,
930
+ "loss": 0.146,
931
+ "step": 310
932
+ },
933
+ {
934
+ "epoch": 41.33,
935
+ "eval_loss": 1.0083317756652832,
936
+ "eval_runtime": 17.0804,
937
+ "eval_samples_per_second": 0.41,
938
+ "eval_steps_per_second": 0.059,
939
+ "step": 310
940
+ },
941
+ {
942
+ "epoch": 42.0,
943
+ "grad_norm": 1.8867217302322388,
944
+ "learning_rate": 1e-05,
945
+ "loss": 0.13,
946
+ "step": 315
947
+ },
948
+ {
949
+ "epoch": 42.0,
950
+ "eval_loss": 1.0165104866027832,
951
+ "eval_runtime": 17.0621,
952
+ "eval_samples_per_second": 0.41,
953
+ "eval_steps_per_second": 0.059,
954
+ "step": 315
955
+ },
956
+ {
957
+ "epoch": 42.67,
958
+ "grad_norm": 1.4643555879592896,
959
+ "learning_rate": 8.571428571428573e-06,
960
+ "loss": 0.131,
961
+ "step": 320
962
+ },
963
+ {
964
+ "epoch": 42.67,
965
+ "eval_loss": 1.0264887809753418,
966
+ "eval_runtime": 17.0554,
967
+ "eval_samples_per_second": 0.41,
968
+ "eval_steps_per_second": 0.059,
969
+ "step": 320
970
+ },
971
+ {
972
+ "epoch": 43.33,
973
+ "grad_norm": 1.4163501262664795,
974
+ "learning_rate": 7.142857142857143e-06,
975
+ "loss": 0.1205,
976
+ "step": 325
977
+ },
978
+ {
979
+ "epoch": 43.33,
980
+ "eval_loss": 1.0175670385360718,
981
+ "eval_runtime": 17.0692,
982
+ "eval_samples_per_second": 0.41,
983
+ "eval_steps_per_second": 0.059,
984
+ "step": 325
985
+ },
986
+ {
987
+ "epoch": 44.0,
988
+ "grad_norm": 1.730556607246399,
989
+ "learning_rate": 5.7142857142857145e-06,
990
+ "loss": 0.1143,
991
+ "step": 330
992
+ },
993
+ {
994
+ "epoch": 44.0,
995
+ "eval_loss": 1.0272893905639648,
996
+ "eval_runtime": 17.0651,
997
+ "eval_samples_per_second": 0.41,
998
+ "eval_steps_per_second": 0.059,
999
+ "step": 330
1000
+ }
1001
+ ],
1002
+ "logging_steps": 5,
1003
+ "max_steps": 350,
1004
+ "num_input_tokens_seen": 0,
1005
+ "num_train_epochs": 50,
1006
+ "save_steps": 10,
1007
+ "total_flos": 2.1924911400521564e+18,
1008
+ "train_batch_size": 2,
1009
+ "trial_name": null,
1010
+ "trial_params": null
1011
+ }
checkpoint-330/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ed0ba6fb9a88dad56f61d9306f17b4e66e8767d898772faa97871a1388e82cf
3
+ size 4920
checkpoint-340/README.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - quant_method: bitsandbytes
9
+ - _load_in_8bit: False
10
+ - _load_in_4bit: False
11
+ - llm_int8_threshold: 6.0
12
+ - llm_int8_skip_modules: None
13
+ - llm_int8_enable_fp32_cpu_offload: False
14
+ - llm_int8_has_fp16_weight: False
15
+ - bnb_4bit_quant_type: fp4
16
+ - bnb_4bit_use_double_quant: False
17
+ - bnb_4bit_compute_dtype: float32
18
+ - load_in_4bit: False
19
+ - load_in_8bit: False
20
+ ### Framework versions
21
+
22
+
23
+ - PEFT 0.5.0
checkpoint-340/adapter_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-70b-hf",
4
+ "bias": "none",
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 32,
11
+ "lora_dropout": 0.05,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 8,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "q_proj",
18
+ "v_proj"
19
+ ],
20
+ "task_type": "CAUSAL_LM"
21
+ }
checkpoint-340/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c897d7f97d7cfce98fed482198deb963e7ba374c7a773a5a3c57015ba333f55c
3
+ size 65578776
checkpoint-340/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4b41aaa2ff189b2a834053ad096c135f5c88ccb616ee466390cb9b512c7a6cc
3
+ size 131345914
checkpoint-340/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b33e7788fe74a32e8025c78f1eaeeb839f4a42077f639cc8f0ccaaa0cc2a8f5e
3
+ size 14244
checkpoint-340/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ace13ef9060efed9fdf80fb5c7e28a065010d0a88cfc05a35d53c0b45da76b6
3
+ size 1064
checkpoint-340/trainer_state.json ADDED
@@ -0,0 +1,1041 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.9927281737327576,
3
+ "best_model_checkpoint": "/scratch/kwamea/llama-output/checkpoint-290",
4
+ "epoch": 45.333333333333336,
5
+ "eval_steps": 5,
6
+ "global_step": 340,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.67,
13
+ "grad_norm": 0.243179589509964,
14
+ "learning_rate": 9.857142857142858e-05,
15
+ "loss": 1.9956,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.67,
20
+ "eval_loss": 1.9701930284500122,
21
+ "eval_runtime": 17.115,
22
+ "eval_samples_per_second": 0.409,
23
+ "eval_steps_per_second": 0.058,
24
+ "step": 5
25
+ },
26
+ {
27
+ "epoch": 1.33,
28
+ "grad_norm": 0.34590908885002136,
29
+ "learning_rate": 9.714285714285715e-05,
30
+ "loss": 1.9758,
31
+ "step": 10
32
+ },
33
+ {
34
+ "epoch": 1.33,
35
+ "eval_loss": 1.8941271305084229,
36
+ "eval_runtime": 17.0912,
37
+ "eval_samples_per_second": 0.41,
38
+ "eval_steps_per_second": 0.059,
39
+ "step": 10
40
+ },
41
+ {
42
+ "epoch": 2.0,
43
+ "grad_norm": 0.31595832109451294,
44
+ "learning_rate": 9.571428571428573e-05,
45
+ "loss": 1.849,
46
+ "step": 15
47
+ },
48
+ {
49
+ "epoch": 2.0,
50
+ "eval_loss": 1.8046789169311523,
51
+ "eval_runtime": 17.098,
52
+ "eval_samples_per_second": 0.409,
53
+ "eval_steps_per_second": 0.058,
54
+ "step": 15
55
+ },
56
+ {
57
+ "epoch": 2.67,
58
+ "grad_norm": 0.3428090512752533,
59
+ "learning_rate": 9.428571428571429e-05,
60
+ "loss": 1.789,
61
+ "step": 20
62
+ },
63
+ {
64
+ "epoch": 2.67,
65
+ "eval_loss": 1.7658358812332153,
66
+ "eval_runtime": 17.0734,
67
+ "eval_samples_per_second": 0.41,
68
+ "eval_steps_per_second": 0.059,
69
+ "step": 20
70
+ },
71
+ {
72
+ "epoch": 3.33,
73
+ "grad_norm": 0.3102028965950012,
74
+ "learning_rate": 9.285714285714286e-05,
75
+ "loss": 1.7789,
76
+ "step": 25
77
+ },
78
+ {
79
+ "epoch": 3.33,
80
+ "eval_loss": 1.7225048542022705,
81
+ "eval_runtime": 17.0972,
82
+ "eval_samples_per_second": 0.409,
83
+ "eval_steps_per_second": 0.058,
84
+ "step": 25
85
+ },
86
+ {
87
+ "epoch": 4.0,
88
+ "grad_norm": 0.38602885603904724,
89
+ "learning_rate": 9.142857142857143e-05,
90
+ "loss": 1.7003,
91
+ "step": 30
92
+ },
93
+ {
94
+ "epoch": 4.0,
95
+ "eval_loss": 1.6749440431594849,
96
+ "eval_runtime": 17.1034,
97
+ "eval_samples_per_second": 0.409,
98
+ "eval_steps_per_second": 0.058,
99
+ "step": 30
100
+ },
101
+ {
102
+ "epoch": 4.67,
103
+ "grad_norm": 0.37120407819747925,
104
+ "learning_rate": 9e-05,
105
+ "loss": 1.6424,
106
+ "step": 35
107
+ },
108
+ {
109
+ "epoch": 4.67,
110
+ "eval_loss": 1.6231099367141724,
111
+ "eval_runtime": 17.1067,
112
+ "eval_samples_per_second": 0.409,
113
+ "eval_steps_per_second": 0.058,
114
+ "step": 35
115
+ },
116
+ {
117
+ "epoch": 5.33,
118
+ "grad_norm": 0.4633428454399109,
119
+ "learning_rate": 8.857142857142857e-05,
120
+ "loss": 1.6023,
121
+ "step": 40
122
+ },
123
+ {
124
+ "epoch": 5.33,
125
+ "eval_loss": 1.5727053880691528,
126
+ "eval_runtime": 17.1002,
127
+ "eval_samples_per_second": 0.409,
128
+ "eval_steps_per_second": 0.058,
129
+ "step": 40
130
+ },
131
+ {
132
+ "epoch": 6.0,
133
+ "grad_norm": 0.5034663081169128,
134
+ "learning_rate": 8.714285714285715e-05,
135
+ "loss": 1.5322,
136
+ "step": 45
137
+ },
138
+ {
139
+ "epoch": 6.0,
140
+ "eval_loss": 1.5312587022781372,
141
+ "eval_runtime": 17.1159,
142
+ "eval_samples_per_second": 0.409,
143
+ "eval_steps_per_second": 0.058,
144
+ "step": 45
145
+ },
146
+ {
147
+ "epoch": 6.67,
148
+ "grad_norm": 0.5549929141998291,
149
+ "learning_rate": 8.571428571428571e-05,
150
+ "loss": 1.4788,
151
+ "step": 50
152
+ },
153
+ {
154
+ "epoch": 6.67,
155
+ "eval_loss": 1.492464303970337,
156
+ "eval_runtime": 17.0823,
157
+ "eval_samples_per_second": 0.41,
158
+ "eval_steps_per_second": 0.059,
159
+ "step": 50
160
+ },
161
+ {
162
+ "epoch": 7.33,
163
+ "grad_norm": 0.49194690585136414,
164
+ "learning_rate": 8.428571428571429e-05,
165
+ "loss": 1.4632,
166
+ "step": 55
167
+ },
168
+ {
169
+ "epoch": 7.33,
170
+ "eval_loss": 1.4622489213943481,
171
+ "eval_runtime": 17.1022,
172
+ "eval_samples_per_second": 0.409,
173
+ "eval_steps_per_second": 0.058,
174
+ "step": 55
175
+ },
176
+ {
177
+ "epoch": 8.0,
178
+ "grad_norm": 0.5866131782531738,
179
+ "learning_rate": 8.285714285714287e-05,
180
+ "loss": 1.3951,
181
+ "step": 60
182
+ },
183
+ {
184
+ "epoch": 8.0,
185
+ "eval_loss": 1.435951828956604,
186
+ "eval_runtime": 17.1087,
187
+ "eval_samples_per_second": 0.409,
188
+ "eval_steps_per_second": 0.058,
189
+ "step": 60
190
+ },
191
+ {
192
+ "epoch": 8.67,
193
+ "grad_norm": 0.6252542734146118,
194
+ "learning_rate": 8.142857142857143e-05,
195
+ "loss": 1.3796,
196
+ "step": 65
197
+ },
198
+ {
199
+ "epoch": 8.67,
200
+ "eval_loss": 1.413227915763855,
201
+ "eval_runtime": 17.0914,
202
+ "eval_samples_per_second": 0.41,
203
+ "eval_steps_per_second": 0.059,
204
+ "step": 65
205
+ },
206
+ {
207
+ "epoch": 9.33,
208
+ "grad_norm": 0.6751863360404968,
209
+ "learning_rate": 8e-05,
210
+ "loss": 1.3257,
211
+ "step": 70
212
+ },
213
+ {
214
+ "epoch": 9.33,
215
+ "eval_loss": 1.395649790763855,
216
+ "eval_runtime": 17.0885,
217
+ "eval_samples_per_second": 0.41,
218
+ "eval_steps_per_second": 0.059,
219
+ "step": 70
220
+ },
221
+ {
222
+ "epoch": 10.0,
223
+ "grad_norm": 0.8878222703933716,
224
+ "learning_rate": 7.857142857142858e-05,
225
+ "loss": 1.2795,
226
+ "step": 75
227
+ },
228
+ {
229
+ "epoch": 10.0,
230
+ "eval_loss": 1.3699487447738647,
231
+ "eval_runtime": 17.1031,
232
+ "eval_samples_per_second": 0.409,
233
+ "eval_steps_per_second": 0.058,
234
+ "step": 75
235
+ },
236
+ {
237
+ "epoch": 10.67,
238
+ "grad_norm": 0.8470121026039124,
239
+ "learning_rate": 7.714285714285715e-05,
240
+ "loss": 1.2449,
241
+ "step": 80
242
+ },
243
+ {
244
+ "epoch": 10.67,
245
+ "eval_loss": 1.347831130027771,
246
+ "eval_runtime": 17.0985,
247
+ "eval_samples_per_second": 0.409,
248
+ "eval_steps_per_second": 0.058,
249
+ "step": 80
250
+ },
251
+ {
252
+ "epoch": 11.33,
253
+ "grad_norm": 1.0655425786972046,
254
+ "learning_rate": 7.571428571428571e-05,
255
+ "loss": 1.1983,
256
+ "step": 85
257
+ },
258
+ {
259
+ "epoch": 11.33,
260
+ "eval_loss": 1.3311971426010132,
261
+ "eval_runtime": 17.0784,
262
+ "eval_samples_per_second": 0.41,
263
+ "eval_steps_per_second": 0.059,
264
+ "step": 85
265
+ },
266
+ {
267
+ "epoch": 12.0,
268
+ "grad_norm": 1.2651888132095337,
269
+ "learning_rate": 7.428571428571429e-05,
270
+ "loss": 1.1467,
271
+ "step": 90
272
+ },
273
+ {
274
+ "epoch": 12.0,
275
+ "eval_loss": 1.3095277547836304,
276
+ "eval_runtime": 17.0903,
277
+ "eval_samples_per_second": 0.41,
278
+ "eval_steps_per_second": 0.059,
279
+ "step": 90
280
+ },
281
+ {
282
+ "epoch": 12.67,
283
+ "grad_norm": 1.248926043510437,
284
+ "learning_rate": 7.285714285714286e-05,
285
+ "loss": 1.0922,
286
+ "step": 95
287
+ },
288
+ {
289
+ "epoch": 12.67,
290
+ "eval_loss": 1.2942878007888794,
291
+ "eval_runtime": 17.0947,
292
+ "eval_samples_per_second": 0.409,
293
+ "eval_steps_per_second": 0.058,
294
+ "step": 95
295
+ },
296
+ {
297
+ "epoch": 13.33,
298
+ "grad_norm": 1.896952509880066,
299
+ "learning_rate": 7.142857142857143e-05,
300
+ "loss": 1.0403,
301
+ "step": 100
302
+ },
303
+ {
304
+ "epoch": 13.33,
305
+ "eval_loss": 1.2803159952163696,
306
+ "eval_runtime": 17.0819,
307
+ "eval_samples_per_second": 0.41,
308
+ "eval_steps_per_second": 0.059,
309
+ "step": 100
310
+ },
311
+ {
312
+ "epoch": 14.0,
313
+ "grad_norm": 1.862244725227356,
314
+ "learning_rate": 7e-05,
315
+ "loss": 1.0049,
316
+ "step": 105
317
+ },
318
+ {
319
+ "epoch": 14.0,
320
+ "eval_loss": 1.2643567323684692,
321
+ "eval_runtime": 17.0849,
322
+ "eval_samples_per_second": 0.41,
323
+ "eval_steps_per_second": 0.059,
324
+ "step": 105
325
+ },
326
+ {
327
+ "epoch": 14.67,
328
+ "grad_norm": 1.7487821578979492,
329
+ "learning_rate": 6.857142857142858e-05,
330
+ "loss": 0.9262,
331
+ "step": 110
332
+ },
333
+ {
334
+ "epoch": 14.67,
335
+ "eval_loss": 1.2471646070480347,
336
+ "eval_runtime": 17.1278,
337
+ "eval_samples_per_second": 0.409,
338
+ "eval_steps_per_second": 0.058,
339
+ "step": 110
340
+ },
341
+ {
342
+ "epoch": 15.33,
343
+ "grad_norm": 1.838605284690857,
344
+ "learning_rate": 6.714285714285714e-05,
345
+ "loss": 0.8965,
346
+ "step": 115
347
+ },
348
+ {
349
+ "epoch": 15.33,
350
+ "eval_loss": 1.2377034425735474,
351
+ "eval_runtime": 17.0731,
352
+ "eval_samples_per_second": 0.41,
353
+ "eval_steps_per_second": 0.059,
354
+ "step": 115
355
+ },
356
+ {
357
+ "epoch": 16.0,
358
+ "grad_norm": 3.117398977279663,
359
+ "learning_rate": 6.571428571428571e-05,
360
+ "loss": 0.8581,
361
+ "step": 120
362
+ },
363
+ {
364
+ "epoch": 16.0,
365
+ "eval_loss": 1.2083133459091187,
366
+ "eval_runtime": 17.1304,
367
+ "eval_samples_per_second": 0.409,
368
+ "eval_steps_per_second": 0.058,
369
+ "step": 120
370
+ },
371
+ {
372
+ "epoch": 16.67,
373
+ "grad_norm": 2.5655250549316406,
374
+ "learning_rate": 6.428571428571429e-05,
375
+ "loss": 0.7929,
376
+ "step": 125
377
+ },
378
+ {
379
+ "epoch": 16.67,
380
+ "eval_loss": 1.1945828199386597,
381
+ "eval_runtime": 17.104,
382
+ "eval_samples_per_second": 0.409,
383
+ "eval_steps_per_second": 0.058,
384
+ "step": 125
385
+ },
386
+ {
387
+ "epoch": 17.33,
388
+ "grad_norm": 2.168546199798584,
389
+ "learning_rate": 6.285714285714286e-05,
390
+ "loss": 0.7543,
391
+ "step": 130
392
+ },
393
+ {
394
+ "epoch": 17.33,
395
+ "eval_loss": 1.1876276731491089,
396
+ "eval_runtime": 17.1046,
397
+ "eval_samples_per_second": 0.409,
398
+ "eval_steps_per_second": 0.058,
399
+ "step": 130
400
+ },
401
+ {
402
+ "epoch": 18.0,
403
+ "grad_norm": 2.5984208583831787,
404
+ "learning_rate": 6.142857142857143e-05,
405
+ "loss": 0.716,
406
+ "step": 135
407
+ },
408
+ {
409
+ "epoch": 18.0,
410
+ "eval_loss": 1.1714750528335571,
411
+ "eval_runtime": 17.0807,
412
+ "eval_samples_per_second": 0.41,
413
+ "eval_steps_per_second": 0.059,
414
+ "step": 135
415
+ },
416
+ {
417
+ "epoch": 18.67,
418
+ "grad_norm": 3.479024887084961,
419
+ "learning_rate": 6e-05,
420
+ "loss": 0.6681,
421
+ "step": 140
422
+ },
423
+ {
424
+ "epoch": 18.67,
425
+ "eval_loss": 1.169895052909851,
426
+ "eval_runtime": 17.0681,
427
+ "eval_samples_per_second": 0.41,
428
+ "eval_steps_per_second": 0.059,
429
+ "step": 140
430
+ },
431
+ {
432
+ "epoch": 19.33,
433
+ "grad_norm": 2.563386917114258,
434
+ "learning_rate": 5.8571428571428575e-05,
435
+ "loss": 0.6306,
436
+ "step": 145
437
+ },
438
+ {
439
+ "epoch": 19.33,
440
+ "eval_loss": 1.1741083860397339,
441
+ "eval_runtime": 17.0568,
442
+ "eval_samples_per_second": 0.41,
443
+ "eval_steps_per_second": 0.059,
444
+ "step": 145
445
+ },
446
+ {
447
+ "epoch": 20.0,
448
+ "grad_norm": 2.96592116355896,
449
+ "learning_rate": 5.714285714285714e-05,
450
+ "loss": 0.6183,
451
+ "step": 150
452
+ },
453
+ {
454
+ "epoch": 20.0,
455
+ "eval_loss": 1.1455965042114258,
456
+ "eval_runtime": 17.073,
457
+ "eval_samples_per_second": 0.41,
458
+ "eval_steps_per_second": 0.059,
459
+ "step": 150
460
+ },
461
+ {
462
+ "epoch": 20.67,
463
+ "grad_norm": 2.6751275062561035,
464
+ "learning_rate": 5.571428571428572e-05,
465
+ "loss": 0.5464,
466
+ "step": 155
467
+ },
468
+ {
469
+ "epoch": 20.67,
470
+ "eval_loss": 1.131102204322815,
471
+ "eval_runtime": 17.0578,
472
+ "eval_samples_per_second": 0.41,
473
+ "eval_steps_per_second": 0.059,
474
+ "step": 155
475
+ },
476
+ {
477
+ "epoch": 21.33,
478
+ "grad_norm": 2.3700051307678223,
479
+ "learning_rate": 5.428571428571428e-05,
480
+ "loss": 0.551,
481
+ "step": 160
482
+ },
483
+ {
484
+ "epoch": 21.33,
485
+ "eval_loss": 1.127384066581726,
486
+ "eval_runtime": 17.0546,
487
+ "eval_samples_per_second": 0.41,
488
+ "eval_steps_per_second": 0.059,
489
+ "step": 160
490
+ },
491
+ {
492
+ "epoch": 22.0,
493
+ "grad_norm": 3.3827567100524902,
494
+ "learning_rate": 5.285714285714286e-05,
495
+ "loss": 0.5179,
496
+ "step": 165
497
+ },
498
+ {
499
+ "epoch": 22.0,
500
+ "eval_loss": 1.111584186553955,
501
+ "eval_runtime": 17.0812,
502
+ "eval_samples_per_second": 0.41,
503
+ "eval_steps_per_second": 0.059,
504
+ "step": 165
505
+ },
506
+ {
507
+ "epoch": 22.67,
508
+ "grad_norm": 3.55790114402771,
509
+ "learning_rate": 5.142857142857143e-05,
510
+ "loss": 0.4831,
511
+ "step": 170
512
+ },
513
+ {
514
+ "epoch": 22.67,
515
+ "eval_loss": 1.0948525667190552,
516
+ "eval_runtime": 17.0547,
517
+ "eval_samples_per_second": 0.41,
518
+ "eval_steps_per_second": 0.059,
519
+ "step": 170
520
+ },
521
+ {
522
+ "epoch": 23.33,
523
+ "grad_norm": 3.0782699584960938,
524
+ "learning_rate": 5e-05,
525
+ "loss": 0.4587,
526
+ "step": 175
527
+ },
528
+ {
529
+ "epoch": 23.33,
530
+ "eval_loss": 1.0906586647033691,
531
+ "eval_runtime": 17.0666,
532
+ "eval_samples_per_second": 0.41,
533
+ "eval_steps_per_second": 0.059,
534
+ "step": 175
535
+ },
536
+ {
537
+ "epoch": 24.0,
538
+ "grad_norm": 3.3993167877197266,
539
+ "learning_rate": 4.8571428571428576e-05,
540
+ "loss": 0.4203,
541
+ "step": 180
542
+ },
543
+ {
544
+ "epoch": 24.0,
545
+ "eval_loss": 1.0688152313232422,
546
+ "eval_runtime": 17.0721,
547
+ "eval_samples_per_second": 0.41,
548
+ "eval_steps_per_second": 0.059,
549
+ "step": 180
550
+ },
551
+ {
552
+ "epoch": 24.67,
553
+ "grad_norm": 3.319303035736084,
554
+ "learning_rate": 4.714285714285714e-05,
555
+ "loss": 0.3975,
556
+ "step": 185
557
+ },
558
+ {
559
+ "epoch": 24.67,
560
+ "eval_loss": 1.0746583938598633,
561
+ "eval_runtime": 17.0709,
562
+ "eval_samples_per_second": 0.41,
563
+ "eval_steps_per_second": 0.059,
564
+ "step": 185
565
+ },
566
+ {
567
+ "epoch": 25.33,
568
+ "grad_norm": 2.4532127380371094,
569
+ "learning_rate": 4.5714285714285716e-05,
570
+ "loss": 0.3832,
571
+ "step": 190
572
+ },
573
+ {
574
+ "epoch": 25.33,
575
+ "eval_loss": 1.0772522687911987,
576
+ "eval_runtime": 17.0619,
577
+ "eval_samples_per_second": 0.41,
578
+ "eval_steps_per_second": 0.059,
579
+ "step": 190
580
+ },
581
+ {
582
+ "epoch": 26.0,
583
+ "grad_norm": 3.956822156906128,
584
+ "learning_rate": 4.428571428571428e-05,
585
+ "loss": 0.3725,
586
+ "step": 195
587
+ },
588
+ {
589
+ "epoch": 26.0,
590
+ "eval_loss": 1.0638784170150757,
591
+ "eval_runtime": 17.0807,
592
+ "eval_samples_per_second": 0.41,
593
+ "eval_steps_per_second": 0.059,
594
+ "step": 195
595
+ },
596
+ {
597
+ "epoch": 26.67,
598
+ "grad_norm": 2.76033353805542,
599
+ "learning_rate": 4.2857142857142856e-05,
600
+ "loss": 0.3473,
601
+ "step": 200
602
+ },
603
+ {
604
+ "epoch": 26.67,
605
+ "eval_loss": 1.04669988155365,
606
+ "eval_runtime": 17.0774,
607
+ "eval_samples_per_second": 0.41,
608
+ "eval_steps_per_second": 0.059,
609
+ "step": 200
610
+ },
611
+ {
612
+ "epoch": 27.33,
613
+ "grad_norm": 3.8683507442474365,
614
+ "learning_rate": 4.1428571428571437e-05,
615
+ "loss": 0.3243,
616
+ "step": 205
617
+ },
618
+ {
619
+ "epoch": 27.33,
620
+ "eval_loss": 1.0470303297042847,
621
+ "eval_runtime": 17.0718,
622
+ "eval_samples_per_second": 0.41,
623
+ "eval_steps_per_second": 0.059,
624
+ "step": 205
625
+ },
626
+ {
627
+ "epoch": 28.0,
628
+ "grad_norm": 4.535538196563721,
629
+ "learning_rate": 4e-05,
630
+ "loss": 0.3202,
631
+ "step": 210
632
+ },
633
+ {
634
+ "epoch": 28.0,
635
+ "eval_loss": 1.025539517402649,
636
+ "eval_runtime": 17.0604,
637
+ "eval_samples_per_second": 0.41,
638
+ "eval_steps_per_second": 0.059,
639
+ "step": 210
640
+ },
641
+ {
642
+ "epoch": 28.67,
643
+ "grad_norm": 2.6224355697631836,
644
+ "learning_rate": 3.857142857142858e-05,
645
+ "loss": 0.2958,
646
+ "step": 215
647
+ },
648
+ {
649
+ "epoch": 28.67,
650
+ "eval_loss": 1.0192126035690308,
651
+ "eval_runtime": 17.0657,
652
+ "eval_samples_per_second": 0.41,
653
+ "eval_steps_per_second": 0.059,
654
+ "step": 215
655
+ },
656
+ {
657
+ "epoch": 29.33,
658
+ "grad_norm": 2.5870041847229004,
659
+ "learning_rate": 3.7142857142857143e-05,
660
+ "loss": 0.2783,
661
+ "step": 220
662
+ },
663
+ {
664
+ "epoch": 29.33,
665
+ "eval_loss": 1.0211580991744995,
666
+ "eval_runtime": 17.0857,
667
+ "eval_samples_per_second": 0.41,
668
+ "eval_steps_per_second": 0.059,
669
+ "step": 220
670
+ },
671
+ {
672
+ "epoch": 30.0,
673
+ "grad_norm": 3.4565751552581787,
674
+ "learning_rate": 3.571428571428572e-05,
675
+ "loss": 0.2773,
676
+ "step": 225
677
+ },
678
+ {
679
+ "epoch": 30.0,
680
+ "eval_loss": 1.006419062614441,
681
+ "eval_runtime": 17.0807,
682
+ "eval_samples_per_second": 0.41,
683
+ "eval_steps_per_second": 0.059,
684
+ "step": 225
685
+ },
686
+ {
687
+ "epoch": 30.67,
688
+ "grad_norm": 2.4756500720977783,
689
+ "learning_rate": 3.428571428571429e-05,
690
+ "loss": 0.2482,
691
+ "step": 230
692
+ },
693
+ {
694
+ "epoch": 30.67,
695
+ "eval_loss": 1.0081219673156738,
696
+ "eval_runtime": 17.0576,
697
+ "eval_samples_per_second": 0.41,
698
+ "eval_steps_per_second": 0.059,
699
+ "step": 230
700
+ },
701
+ {
702
+ "epoch": 31.33,
703
+ "grad_norm": 2.38002610206604,
704
+ "learning_rate": 3.285714285714286e-05,
705
+ "loss": 0.2464,
706
+ "step": 235
707
+ },
708
+ {
709
+ "epoch": 31.33,
710
+ "eval_loss": 1.0151804685592651,
711
+ "eval_runtime": 17.0587,
712
+ "eval_samples_per_second": 0.41,
713
+ "eval_steps_per_second": 0.059,
714
+ "step": 235
715
+ },
716
+ {
717
+ "epoch": 32.0,
718
+ "grad_norm": 3.7081105709075928,
719
+ "learning_rate": 3.142857142857143e-05,
720
+ "loss": 0.2442,
721
+ "step": 240
722
+ },
723
+ {
724
+ "epoch": 32.0,
725
+ "eval_loss": 1.0032445192337036,
726
+ "eval_runtime": 17.1613,
727
+ "eval_samples_per_second": 0.408,
728
+ "eval_steps_per_second": 0.058,
729
+ "step": 240
730
+ },
731
+ {
732
+ "epoch": 32.67,
733
+ "grad_norm": 2.55924391746521,
734
+ "learning_rate": 3e-05,
735
+ "loss": 0.2193,
736
+ "step": 245
737
+ },
738
+ {
739
+ "epoch": 32.67,
740
+ "eval_loss": 0.9989615082740784,
741
+ "eval_runtime": 17.0447,
742
+ "eval_samples_per_second": 0.411,
743
+ "eval_steps_per_second": 0.059,
744
+ "step": 245
745
+ },
746
+ {
747
+ "epoch": 33.33,
748
+ "grad_norm": 1.9451407194137573,
749
+ "learning_rate": 2.857142857142857e-05,
750
+ "loss": 0.2101,
751
+ "step": 250
752
+ },
753
+ {
754
+ "epoch": 33.33,
755
+ "eval_loss": 1.0029457807540894,
756
+ "eval_runtime": 17.0816,
757
+ "eval_samples_per_second": 0.41,
758
+ "eval_steps_per_second": 0.059,
759
+ "step": 250
760
+ },
761
+ {
762
+ "epoch": 34.0,
763
+ "grad_norm": 2.713731527328491,
764
+ "learning_rate": 2.714285714285714e-05,
765
+ "loss": 0.2194,
766
+ "step": 255
767
+ },
768
+ {
769
+ "epoch": 34.0,
770
+ "eval_loss": 0.9959421753883362,
771
+ "eval_runtime": 17.0747,
772
+ "eval_samples_per_second": 0.41,
773
+ "eval_steps_per_second": 0.059,
774
+ "step": 255
775
+ },
776
+ {
777
+ "epoch": 34.67,
778
+ "grad_norm": 2.1633846759796143,
779
+ "learning_rate": 2.5714285714285714e-05,
780
+ "loss": 0.1958,
781
+ "step": 260
782
+ },
783
+ {
784
+ "epoch": 34.67,
785
+ "eval_loss": 0.9989770650863647,
786
+ "eval_runtime": 17.0821,
787
+ "eval_samples_per_second": 0.41,
788
+ "eval_steps_per_second": 0.059,
789
+ "step": 260
790
+ },
791
+ {
792
+ "epoch": 35.33,
793
+ "grad_norm": 3.9233529567718506,
794
+ "learning_rate": 2.4285714285714288e-05,
795
+ "loss": 0.1831,
796
+ "step": 265
797
+ },
798
+ {
799
+ "epoch": 35.33,
800
+ "eval_loss": 1.0072578191757202,
801
+ "eval_runtime": 17.0564,
802
+ "eval_samples_per_second": 0.41,
803
+ "eval_steps_per_second": 0.059,
804
+ "step": 265
805
+ },
806
+ {
807
+ "epoch": 36.0,
808
+ "grad_norm": 2.4143056869506836,
809
+ "learning_rate": 2.2857142857142858e-05,
810
+ "loss": 0.1753,
811
+ "step": 270
812
+ },
813
+ {
814
+ "epoch": 36.0,
815
+ "eval_loss": 0.9938892722129822,
816
+ "eval_runtime": 17.0668,
817
+ "eval_samples_per_second": 0.41,
818
+ "eval_steps_per_second": 0.059,
819
+ "step": 270
820
+ },
821
+ {
822
+ "epoch": 36.67,
823
+ "grad_norm": 2.706679582595825,
824
+ "learning_rate": 2.1428571428571428e-05,
825
+ "loss": 0.1698,
826
+ "step": 275
827
+ },
828
+ {
829
+ "epoch": 36.67,
830
+ "eval_loss": 0.9969200491905212,
831
+ "eval_runtime": 17.0643,
832
+ "eval_samples_per_second": 0.41,
833
+ "eval_steps_per_second": 0.059,
834
+ "step": 275
835
+ },
836
+ {
837
+ "epoch": 37.33,
838
+ "grad_norm": 1.872753620147705,
839
+ "learning_rate": 2e-05,
840
+ "loss": 0.16,
841
+ "step": 280
842
+ },
843
+ {
844
+ "epoch": 37.33,
845
+ "eval_loss": 0.9940390586853027,
846
+ "eval_runtime": 17.0728,
847
+ "eval_samples_per_second": 0.41,
848
+ "eval_steps_per_second": 0.059,
849
+ "step": 280
850
+ },
851
+ {
852
+ "epoch": 38.0,
853
+ "grad_norm": 2.7510581016540527,
854
+ "learning_rate": 1.8571428571428572e-05,
855
+ "loss": 0.1614,
856
+ "step": 285
857
+ },
858
+ {
859
+ "epoch": 38.0,
860
+ "eval_loss": 1.0066231489181519,
861
+ "eval_runtime": 17.072,
862
+ "eval_samples_per_second": 0.41,
863
+ "eval_steps_per_second": 0.059,
864
+ "step": 285
865
+ },
866
+ {
867
+ "epoch": 38.67,
868
+ "grad_norm": 1.8461092710494995,
869
+ "learning_rate": 1.7142857142857145e-05,
870
+ "loss": 0.1506,
871
+ "step": 290
872
+ },
873
+ {
874
+ "epoch": 38.67,
875
+ "eval_loss": 0.9927281737327576,
876
+ "eval_runtime": 17.0481,
877
+ "eval_samples_per_second": 0.411,
878
+ "eval_steps_per_second": 0.059,
879
+ "step": 290
880
+ },
881
+ {
882
+ "epoch": 39.33,
883
+ "grad_norm": 1.8425017595291138,
884
+ "learning_rate": 1.5714285714285715e-05,
885
+ "loss": 0.1419,
886
+ "step": 295
887
+ },
888
+ {
889
+ "epoch": 39.33,
890
+ "eval_loss": 1.0133570432662964,
891
+ "eval_runtime": 17.0642,
892
+ "eval_samples_per_second": 0.41,
893
+ "eval_steps_per_second": 0.059,
894
+ "step": 295
895
+ },
896
+ {
897
+ "epoch": 40.0,
898
+ "grad_norm": 2.0457987785339355,
899
+ "learning_rate": 1.4285714285714285e-05,
900
+ "loss": 0.1459,
901
+ "step": 300
902
+ },
903
+ {
904
+ "epoch": 40.0,
905
+ "eval_loss": 1.0127934217453003,
906
+ "eval_runtime": 17.0581,
907
+ "eval_samples_per_second": 0.41,
908
+ "eval_steps_per_second": 0.059,
909
+ "step": 300
910
+ },
911
+ {
912
+ "epoch": 40.67,
913
+ "grad_norm": 1.5630775690078735,
914
+ "learning_rate": 1.2857142857142857e-05,
915
+ "loss": 0.1225,
916
+ "step": 305
917
+ },
918
+ {
919
+ "epoch": 40.67,
920
+ "eval_loss": 1.0092624425888062,
921
+ "eval_runtime": 17.0483,
922
+ "eval_samples_per_second": 0.411,
923
+ "eval_steps_per_second": 0.059,
924
+ "step": 305
925
+ },
926
+ {
927
+ "epoch": 41.33,
928
+ "grad_norm": 1.37598717212677,
929
+ "learning_rate": 1.1428571428571429e-05,
930
+ "loss": 0.146,
931
+ "step": 310
932
+ },
933
+ {
934
+ "epoch": 41.33,
935
+ "eval_loss": 1.0083317756652832,
936
+ "eval_runtime": 17.0804,
937
+ "eval_samples_per_second": 0.41,
938
+ "eval_steps_per_second": 0.059,
939
+ "step": 310
940
+ },
941
+ {
942
+ "epoch": 42.0,
943
+ "grad_norm": 1.8867217302322388,
944
+ "learning_rate": 1e-05,
945
+ "loss": 0.13,
946
+ "step": 315
947
+ },
948
+ {
949
+ "epoch": 42.0,
950
+ "eval_loss": 1.0165104866027832,
951
+ "eval_runtime": 17.0621,
952
+ "eval_samples_per_second": 0.41,
953
+ "eval_steps_per_second": 0.059,
954
+ "step": 315
955
+ },
956
+ {
957
+ "epoch": 42.67,
958
+ "grad_norm": 1.4643555879592896,
959
+ "learning_rate": 8.571428571428573e-06,
960
+ "loss": 0.131,
961
+ "step": 320
962
+ },
963
+ {
964
+ "epoch": 42.67,
965
+ "eval_loss": 1.0264887809753418,
966
+ "eval_runtime": 17.0554,
967
+ "eval_samples_per_second": 0.41,
968
+ "eval_steps_per_second": 0.059,
969
+ "step": 320
970
+ },
971
+ {
972
+ "epoch": 43.33,
973
+ "grad_norm": 1.4163501262664795,
974
+ "learning_rate": 7.142857142857143e-06,
975
+ "loss": 0.1205,
976
+ "step": 325
977
+ },
978
+ {
979
+ "epoch": 43.33,
980
+ "eval_loss": 1.0175670385360718,
981
+ "eval_runtime": 17.0692,
982
+ "eval_samples_per_second": 0.41,
983
+ "eval_steps_per_second": 0.059,
984
+ "step": 325
985
+ },
986
+ {
987
+ "epoch": 44.0,
988
+ "grad_norm": 1.730556607246399,
989
+ "learning_rate": 5.7142857142857145e-06,
990
+ "loss": 0.1143,
991
+ "step": 330
992
+ },
993
+ {
994
+ "epoch": 44.0,
995
+ "eval_loss": 1.0272893905639648,
996
+ "eval_runtime": 17.0651,
997
+ "eval_samples_per_second": 0.41,
998
+ "eval_steps_per_second": 0.059,
999
+ "step": 330
1000
+ },
1001
+ {
1002
+ "epoch": 44.67,
1003
+ "grad_norm": 1.451904058456421,
1004
+ "learning_rate": 4.285714285714286e-06,
1005
+ "loss": 0.1087,
1006
+ "step": 335
1007
+ },
1008
+ {
1009
+ "epoch": 44.67,
1010
+ "eval_loss": 1.0272446870803833,
1011
+ "eval_runtime": 17.0604,
1012
+ "eval_samples_per_second": 0.41,
1013
+ "eval_steps_per_second": 0.059,
1014
+ "step": 335
1015
+ },
1016
+ {
1017
+ "epoch": 45.33,
1018
+ "grad_norm": 1.1883801221847534,
1019
+ "learning_rate": 2.8571428571428573e-06,
1020
+ "loss": 0.1173,
1021
+ "step": 340
1022
+ },
1023
+ {
1024
+ "epoch": 45.33,
1025
+ "eval_loss": 1.0314446687698364,
1026
+ "eval_runtime": 17.0652,
1027
+ "eval_samples_per_second": 0.41,
1028
+ "eval_steps_per_second": 0.059,
1029
+ "step": 340
1030
+ }
1031
+ ],
1032
+ "logging_steps": 5,
1033
+ "max_steps": 350,
1034
+ "num_input_tokens_seen": 0,
1035
+ "num_train_epochs": 50,
1036
+ "save_steps": 10,
1037
+ "total_flos": 2.2592117872263168e+18,
1038
+ "train_batch_size": 2,
1039
+ "trial_name": null,
1040
+ "trial_params": null
1041
+ }
checkpoint-340/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ed0ba6fb9a88dad56f61d9306f17b4e66e8767d898772faa97871a1388e82cf
3
+ size 4920
checkpoint-350/README.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - quant_method: bitsandbytes
9
+ - _load_in_8bit: False
10
+ - _load_in_4bit: False
11
+ - llm_int8_threshold: 6.0
12
+ - llm_int8_skip_modules: None
13
+ - llm_int8_enable_fp32_cpu_offload: False
14
+ - llm_int8_has_fp16_weight: False
15
+ - bnb_4bit_quant_type: fp4
16
+ - bnb_4bit_use_double_quant: False
17
+ - bnb_4bit_compute_dtype: float32
18
+ - load_in_4bit: False
19
+ - load_in_8bit: False
20
+ ### Framework versions
21
+
22
+
23
+ - PEFT 0.5.0
checkpoint-350/adapter_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-70b-hf",
4
+ "bias": "none",
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 32,
11
+ "lora_dropout": 0.05,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 8,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "q_proj",
18
+ "v_proj"
19
+ ],
20
+ "task_type": "CAUSAL_LM"
21
+ }
checkpoint-350/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67eccf78fe003c69b69f04b1f9b9a6e6a1978840cc4a849f3af0d407627c47d6
3
+ size 65578776
checkpoint-350/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d76e77636c4237df069f3fc007ebc0115869843959146b38c8e52ae3e0864fa
3
+ size 131345914
checkpoint-350/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72b00dace87f5d813f1a45a5136e238b8c42ac9391260f621c6e60ffca2a1a3b
3
+ size 14244
checkpoint-350/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70732dec4f14211b78555ce1acfc091b59c12a05595924c42e5da10d3a62cd6d
3
+ size 1064
checkpoint-350/trainer_state.json ADDED
@@ -0,0 +1,1071 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.9927281737327576,
3
+ "best_model_checkpoint": "/scratch/kwamea/llama-output/checkpoint-290",
4
+ "epoch": 46.666666666666664,
5
+ "eval_steps": 5,
6
+ "global_step": 350,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.67,
13
+ "grad_norm": 0.243179589509964,
14
+ "learning_rate": 9.857142857142858e-05,
15
+ "loss": 1.9956,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.67,
20
+ "eval_loss": 1.9701930284500122,
21
+ "eval_runtime": 17.115,
22
+ "eval_samples_per_second": 0.409,
23
+ "eval_steps_per_second": 0.058,
24
+ "step": 5
25
+ },
26
+ {
27
+ "epoch": 1.33,
28
+ "grad_norm": 0.34590908885002136,
29
+ "learning_rate": 9.714285714285715e-05,
30
+ "loss": 1.9758,
31
+ "step": 10
32
+ },
33
+ {
34
+ "epoch": 1.33,
35
+ "eval_loss": 1.8941271305084229,
36
+ "eval_runtime": 17.0912,
37
+ "eval_samples_per_second": 0.41,
38
+ "eval_steps_per_second": 0.059,
39
+ "step": 10
40
+ },
41
+ {
42
+ "epoch": 2.0,
43
+ "grad_norm": 0.31595832109451294,
44
+ "learning_rate": 9.571428571428573e-05,
45
+ "loss": 1.849,
46
+ "step": 15
47
+ },
48
+ {
49
+ "epoch": 2.0,
50
+ "eval_loss": 1.8046789169311523,
51
+ "eval_runtime": 17.098,
52
+ "eval_samples_per_second": 0.409,
53
+ "eval_steps_per_second": 0.058,
54
+ "step": 15
55
+ },
56
+ {
57
+ "epoch": 2.67,
58
+ "grad_norm": 0.3428090512752533,
59
+ "learning_rate": 9.428571428571429e-05,
60
+ "loss": 1.789,
61
+ "step": 20
62
+ },
63
+ {
64
+ "epoch": 2.67,
65
+ "eval_loss": 1.7658358812332153,
66
+ "eval_runtime": 17.0734,
67
+ "eval_samples_per_second": 0.41,
68
+ "eval_steps_per_second": 0.059,
69
+ "step": 20
70
+ },
71
+ {
72
+ "epoch": 3.33,
73
+ "grad_norm": 0.3102028965950012,
74
+ "learning_rate": 9.285714285714286e-05,
75
+ "loss": 1.7789,
76
+ "step": 25
77
+ },
78
+ {
79
+ "epoch": 3.33,
80
+ "eval_loss": 1.7225048542022705,
81
+ "eval_runtime": 17.0972,
82
+ "eval_samples_per_second": 0.409,
83
+ "eval_steps_per_second": 0.058,
84
+ "step": 25
85
+ },
86
+ {
87
+ "epoch": 4.0,
88
+ "grad_norm": 0.38602885603904724,
89
+ "learning_rate": 9.142857142857143e-05,
90
+ "loss": 1.7003,
91
+ "step": 30
92
+ },
93
+ {
94
+ "epoch": 4.0,
95
+ "eval_loss": 1.6749440431594849,
96
+ "eval_runtime": 17.1034,
97
+ "eval_samples_per_second": 0.409,
98
+ "eval_steps_per_second": 0.058,
99
+ "step": 30
100
+ },
101
+ {
102
+ "epoch": 4.67,
103
+ "grad_norm": 0.37120407819747925,
104
+ "learning_rate": 9e-05,
105
+ "loss": 1.6424,
106
+ "step": 35
107
+ },
108
+ {
109
+ "epoch": 4.67,
110
+ "eval_loss": 1.6231099367141724,
111
+ "eval_runtime": 17.1067,
112
+ "eval_samples_per_second": 0.409,
113
+ "eval_steps_per_second": 0.058,
114
+ "step": 35
115
+ },
116
+ {
117
+ "epoch": 5.33,
118
+ "grad_norm": 0.4633428454399109,
119
+ "learning_rate": 8.857142857142857e-05,
120
+ "loss": 1.6023,
121
+ "step": 40
122
+ },
123
+ {
124
+ "epoch": 5.33,
125
+ "eval_loss": 1.5727053880691528,
126
+ "eval_runtime": 17.1002,
127
+ "eval_samples_per_second": 0.409,
128
+ "eval_steps_per_second": 0.058,
129
+ "step": 40
130
+ },
131
+ {
132
+ "epoch": 6.0,
133
+ "grad_norm": 0.5034663081169128,
134
+ "learning_rate": 8.714285714285715e-05,
135
+ "loss": 1.5322,
136
+ "step": 45
137
+ },
138
+ {
139
+ "epoch": 6.0,
140
+ "eval_loss": 1.5312587022781372,
141
+ "eval_runtime": 17.1159,
142
+ "eval_samples_per_second": 0.409,
143
+ "eval_steps_per_second": 0.058,
144
+ "step": 45
145
+ },
146
+ {
147
+ "epoch": 6.67,
148
+ "grad_norm": 0.5549929141998291,
149
+ "learning_rate": 8.571428571428571e-05,
150
+ "loss": 1.4788,
151
+ "step": 50
152
+ },
153
+ {
154
+ "epoch": 6.67,
155
+ "eval_loss": 1.492464303970337,
156
+ "eval_runtime": 17.0823,
157
+ "eval_samples_per_second": 0.41,
158
+ "eval_steps_per_second": 0.059,
159
+ "step": 50
160
+ },
161
+ {
162
+ "epoch": 7.33,
163
+ "grad_norm": 0.49194690585136414,
164
+ "learning_rate": 8.428571428571429e-05,
165
+ "loss": 1.4632,
166
+ "step": 55
167
+ },
168
+ {
169
+ "epoch": 7.33,
170
+ "eval_loss": 1.4622489213943481,
171
+ "eval_runtime": 17.1022,
172
+ "eval_samples_per_second": 0.409,
173
+ "eval_steps_per_second": 0.058,
174
+ "step": 55
175
+ },
176
+ {
177
+ "epoch": 8.0,
178
+ "grad_norm": 0.5866131782531738,
179
+ "learning_rate": 8.285714285714287e-05,
180
+ "loss": 1.3951,
181
+ "step": 60
182
+ },
183
+ {
184
+ "epoch": 8.0,
185
+ "eval_loss": 1.435951828956604,
186
+ "eval_runtime": 17.1087,
187
+ "eval_samples_per_second": 0.409,
188
+ "eval_steps_per_second": 0.058,
189
+ "step": 60
190
+ },
191
+ {
192
+ "epoch": 8.67,
193
+ "grad_norm": 0.6252542734146118,
194
+ "learning_rate": 8.142857142857143e-05,
195
+ "loss": 1.3796,
196
+ "step": 65
197
+ },
198
+ {
199
+ "epoch": 8.67,
200
+ "eval_loss": 1.413227915763855,
201
+ "eval_runtime": 17.0914,
202
+ "eval_samples_per_second": 0.41,
203
+ "eval_steps_per_second": 0.059,
204
+ "step": 65
205
+ },
206
+ {
207
+ "epoch": 9.33,
208
+ "grad_norm": 0.6751863360404968,
209
+ "learning_rate": 8e-05,
210
+ "loss": 1.3257,
211
+ "step": 70
212
+ },
213
+ {
214
+ "epoch": 9.33,
215
+ "eval_loss": 1.395649790763855,
216
+ "eval_runtime": 17.0885,
217
+ "eval_samples_per_second": 0.41,
218
+ "eval_steps_per_second": 0.059,
219
+ "step": 70
220
+ },
221
+ {
222
+ "epoch": 10.0,
223
+ "grad_norm": 0.8878222703933716,
224
+ "learning_rate": 7.857142857142858e-05,
225
+ "loss": 1.2795,
226
+ "step": 75
227
+ },
228
+ {
229
+ "epoch": 10.0,
230
+ "eval_loss": 1.3699487447738647,
231
+ "eval_runtime": 17.1031,
232
+ "eval_samples_per_second": 0.409,
233
+ "eval_steps_per_second": 0.058,
234
+ "step": 75
235
+ },
236
+ {
237
+ "epoch": 10.67,
238
+ "grad_norm": 0.8470121026039124,
239
+ "learning_rate": 7.714285714285715e-05,
240
+ "loss": 1.2449,
241
+ "step": 80
242
+ },
243
+ {
244
+ "epoch": 10.67,
245
+ "eval_loss": 1.347831130027771,
246
+ "eval_runtime": 17.0985,
247
+ "eval_samples_per_second": 0.409,
248
+ "eval_steps_per_second": 0.058,
249
+ "step": 80
250
+ },
251
+ {
252
+ "epoch": 11.33,
253
+ "grad_norm": 1.0655425786972046,
254
+ "learning_rate": 7.571428571428571e-05,
255
+ "loss": 1.1983,
256
+ "step": 85
257
+ },
258
+ {
259
+ "epoch": 11.33,
260
+ "eval_loss": 1.3311971426010132,
261
+ "eval_runtime": 17.0784,
262
+ "eval_samples_per_second": 0.41,
263
+ "eval_steps_per_second": 0.059,
264
+ "step": 85
265
+ },
266
+ {
267
+ "epoch": 12.0,
268
+ "grad_norm": 1.2651888132095337,
269
+ "learning_rate": 7.428571428571429e-05,
270
+ "loss": 1.1467,
271
+ "step": 90
272
+ },
273
+ {
274
+ "epoch": 12.0,
275
+ "eval_loss": 1.3095277547836304,
276
+ "eval_runtime": 17.0903,
277
+ "eval_samples_per_second": 0.41,
278
+ "eval_steps_per_second": 0.059,
279
+ "step": 90
280
+ },
281
+ {
282
+ "epoch": 12.67,
283
+ "grad_norm": 1.248926043510437,
284
+ "learning_rate": 7.285714285714286e-05,
285
+ "loss": 1.0922,
286
+ "step": 95
287
+ },
288
+ {
289
+ "epoch": 12.67,
290
+ "eval_loss": 1.2942878007888794,
291
+ "eval_runtime": 17.0947,
292
+ "eval_samples_per_second": 0.409,
293
+ "eval_steps_per_second": 0.058,
294
+ "step": 95
295
+ },
296
+ {
297
+ "epoch": 13.33,
298
+ "grad_norm": 1.896952509880066,
299
+ "learning_rate": 7.142857142857143e-05,
300
+ "loss": 1.0403,
301
+ "step": 100
302
+ },
303
+ {
304
+ "epoch": 13.33,
305
+ "eval_loss": 1.2803159952163696,
306
+ "eval_runtime": 17.0819,
307
+ "eval_samples_per_second": 0.41,
308
+ "eval_steps_per_second": 0.059,
309
+ "step": 100
310
+ },
311
+ {
312
+ "epoch": 14.0,
313
+ "grad_norm": 1.862244725227356,
314
+ "learning_rate": 7e-05,
315
+ "loss": 1.0049,
316
+ "step": 105
317
+ },
318
+ {
319
+ "epoch": 14.0,
320
+ "eval_loss": 1.2643567323684692,
321
+ "eval_runtime": 17.0849,
322
+ "eval_samples_per_second": 0.41,
323
+ "eval_steps_per_second": 0.059,
324
+ "step": 105
325
+ },
326
+ {
327
+ "epoch": 14.67,
328
+ "grad_norm": 1.7487821578979492,
329
+ "learning_rate": 6.857142857142858e-05,
330
+ "loss": 0.9262,
331
+ "step": 110
332
+ },
333
+ {
334
+ "epoch": 14.67,
335
+ "eval_loss": 1.2471646070480347,
336
+ "eval_runtime": 17.1278,
337
+ "eval_samples_per_second": 0.409,
338
+ "eval_steps_per_second": 0.058,
339
+ "step": 110
340
+ },
341
+ {
342
+ "epoch": 15.33,
343
+ "grad_norm": 1.838605284690857,
344
+ "learning_rate": 6.714285714285714e-05,
345
+ "loss": 0.8965,
346
+ "step": 115
347
+ },
348
+ {
349
+ "epoch": 15.33,
350
+ "eval_loss": 1.2377034425735474,
351
+ "eval_runtime": 17.0731,
352
+ "eval_samples_per_second": 0.41,
353
+ "eval_steps_per_second": 0.059,
354
+ "step": 115
355
+ },
356
+ {
357
+ "epoch": 16.0,
358
+ "grad_norm": 3.117398977279663,
359
+ "learning_rate": 6.571428571428571e-05,
360
+ "loss": 0.8581,
361
+ "step": 120
362
+ },
363
+ {
364
+ "epoch": 16.0,
365
+ "eval_loss": 1.2083133459091187,
366
+ "eval_runtime": 17.1304,
367
+ "eval_samples_per_second": 0.409,
368
+ "eval_steps_per_second": 0.058,
369
+ "step": 120
370
+ },
371
+ {
372
+ "epoch": 16.67,
373
+ "grad_norm": 2.5655250549316406,
374
+ "learning_rate": 6.428571428571429e-05,
375
+ "loss": 0.7929,
376
+ "step": 125
377
+ },
378
+ {
379
+ "epoch": 16.67,
380
+ "eval_loss": 1.1945828199386597,
381
+ "eval_runtime": 17.104,
382
+ "eval_samples_per_second": 0.409,
383
+ "eval_steps_per_second": 0.058,
384
+ "step": 125
385
+ },
386
+ {
387
+ "epoch": 17.33,
388
+ "grad_norm": 2.168546199798584,
389
+ "learning_rate": 6.285714285714286e-05,
390
+ "loss": 0.7543,
391
+ "step": 130
392
+ },
393
+ {
394
+ "epoch": 17.33,
395
+ "eval_loss": 1.1876276731491089,
396
+ "eval_runtime": 17.1046,
397
+ "eval_samples_per_second": 0.409,
398
+ "eval_steps_per_second": 0.058,
399
+ "step": 130
400
+ },
401
+ {
402
+ "epoch": 18.0,
403
+ "grad_norm": 2.5984208583831787,
404
+ "learning_rate": 6.142857142857143e-05,
405
+ "loss": 0.716,
406
+ "step": 135
407
+ },
408
+ {
409
+ "epoch": 18.0,
410
+ "eval_loss": 1.1714750528335571,
411
+ "eval_runtime": 17.0807,
412
+ "eval_samples_per_second": 0.41,
413
+ "eval_steps_per_second": 0.059,
414
+ "step": 135
415
+ },
416
+ {
417
+ "epoch": 18.67,
418
+ "grad_norm": 3.479024887084961,
419
+ "learning_rate": 6e-05,
420
+ "loss": 0.6681,
421
+ "step": 140
422
+ },
423
+ {
424
+ "epoch": 18.67,
425
+ "eval_loss": 1.169895052909851,
426
+ "eval_runtime": 17.0681,
427
+ "eval_samples_per_second": 0.41,
428
+ "eval_steps_per_second": 0.059,
429
+ "step": 140
430
+ },
431
+ {
432
+ "epoch": 19.33,
433
+ "grad_norm": 2.563386917114258,
434
+ "learning_rate": 5.8571428571428575e-05,
435
+ "loss": 0.6306,
436
+ "step": 145
437
+ },
438
+ {
439
+ "epoch": 19.33,
440
+ "eval_loss": 1.1741083860397339,
441
+ "eval_runtime": 17.0568,
442
+ "eval_samples_per_second": 0.41,
443
+ "eval_steps_per_second": 0.059,
444
+ "step": 145
445
+ },
446
+ {
447
+ "epoch": 20.0,
448
+ "grad_norm": 2.96592116355896,
449
+ "learning_rate": 5.714285714285714e-05,
450
+ "loss": 0.6183,
451
+ "step": 150
452
+ },
453
+ {
454
+ "epoch": 20.0,
455
+ "eval_loss": 1.1455965042114258,
456
+ "eval_runtime": 17.073,
457
+ "eval_samples_per_second": 0.41,
458
+ "eval_steps_per_second": 0.059,
459
+ "step": 150
460
+ },
461
+ {
462
+ "epoch": 20.67,
463
+ "grad_norm": 2.6751275062561035,
464
+ "learning_rate": 5.571428571428572e-05,
465
+ "loss": 0.5464,
466
+ "step": 155
467
+ },
468
+ {
469
+ "epoch": 20.67,
470
+ "eval_loss": 1.131102204322815,
471
+ "eval_runtime": 17.0578,
472
+ "eval_samples_per_second": 0.41,
473
+ "eval_steps_per_second": 0.059,
474
+ "step": 155
475
+ },
476
+ {
477
+ "epoch": 21.33,
478
+ "grad_norm": 2.3700051307678223,
479
+ "learning_rate": 5.428571428571428e-05,
480
+ "loss": 0.551,
481
+ "step": 160
482
+ },
483
+ {
484
+ "epoch": 21.33,
485
+ "eval_loss": 1.127384066581726,
486
+ "eval_runtime": 17.0546,
487
+ "eval_samples_per_second": 0.41,
488
+ "eval_steps_per_second": 0.059,
489
+ "step": 160
490
+ },
491
+ {
492
+ "epoch": 22.0,
493
+ "grad_norm": 3.3827567100524902,
494
+ "learning_rate": 5.285714285714286e-05,
495
+ "loss": 0.5179,
496
+ "step": 165
497
+ },
498
+ {
499
+ "epoch": 22.0,
500
+ "eval_loss": 1.111584186553955,
501
+ "eval_runtime": 17.0812,
502
+ "eval_samples_per_second": 0.41,
503
+ "eval_steps_per_second": 0.059,
504
+ "step": 165
505
+ },
506
+ {
507
+ "epoch": 22.67,
508
+ "grad_norm": 3.55790114402771,
509
+ "learning_rate": 5.142857142857143e-05,
510
+ "loss": 0.4831,
511
+ "step": 170
512
+ },
513
+ {
514
+ "epoch": 22.67,
515
+ "eval_loss": 1.0948525667190552,
516
+ "eval_runtime": 17.0547,
517
+ "eval_samples_per_second": 0.41,
518
+ "eval_steps_per_second": 0.059,
519
+ "step": 170
520
+ },
521
+ {
522
+ "epoch": 23.33,
523
+ "grad_norm": 3.0782699584960938,
524
+ "learning_rate": 5e-05,
525
+ "loss": 0.4587,
526
+ "step": 175
527
+ },
528
+ {
529
+ "epoch": 23.33,
530
+ "eval_loss": 1.0906586647033691,
531
+ "eval_runtime": 17.0666,
532
+ "eval_samples_per_second": 0.41,
533
+ "eval_steps_per_second": 0.059,
534
+ "step": 175
535
+ },
536
+ {
537
+ "epoch": 24.0,
538
+ "grad_norm": 3.3993167877197266,
539
+ "learning_rate": 4.8571428571428576e-05,
540
+ "loss": 0.4203,
541
+ "step": 180
542
+ },
543
+ {
544
+ "epoch": 24.0,
545
+ "eval_loss": 1.0688152313232422,
546
+ "eval_runtime": 17.0721,
547
+ "eval_samples_per_second": 0.41,
548
+ "eval_steps_per_second": 0.059,
549
+ "step": 180
550
+ },
551
+ {
552
+ "epoch": 24.67,
553
+ "grad_norm": 3.319303035736084,
554
+ "learning_rate": 4.714285714285714e-05,
555
+ "loss": 0.3975,
556
+ "step": 185
557
+ },
558
+ {
559
+ "epoch": 24.67,
560
+ "eval_loss": 1.0746583938598633,
561
+ "eval_runtime": 17.0709,
562
+ "eval_samples_per_second": 0.41,
563
+ "eval_steps_per_second": 0.059,
564
+ "step": 185
565
+ },
566
+ {
567
+ "epoch": 25.33,
568
+ "grad_norm": 2.4532127380371094,
569
+ "learning_rate": 4.5714285714285716e-05,
570
+ "loss": 0.3832,
571
+ "step": 190
572
+ },
573
+ {
574
+ "epoch": 25.33,
575
+ "eval_loss": 1.0772522687911987,
576
+ "eval_runtime": 17.0619,
577
+ "eval_samples_per_second": 0.41,
578
+ "eval_steps_per_second": 0.059,
579
+ "step": 190
580
+ },
581
+ {
582
+ "epoch": 26.0,
583
+ "grad_norm": 3.956822156906128,
584
+ "learning_rate": 4.428571428571428e-05,
585
+ "loss": 0.3725,
586
+ "step": 195
587
+ },
588
+ {
589
+ "epoch": 26.0,
590
+ "eval_loss": 1.0638784170150757,
591
+ "eval_runtime": 17.0807,
592
+ "eval_samples_per_second": 0.41,
593
+ "eval_steps_per_second": 0.059,
594
+ "step": 195
595
+ },
596
+ {
597
+ "epoch": 26.67,
598
+ "grad_norm": 2.76033353805542,
599
+ "learning_rate": 4.2857142857142856e-05,
600
+ "loss": 0.3473,
601
+ "step": 200
602
+ },
603
+ {
604
+ "epoch": 26.67,
605
+ "eval_loss": 1.04669988155365,
606
+ "eval_runtime": 17.0774,
607
+ "eval_samples_per_second": 0.41,
608
+ "eval_steps_per_second": 0.059,
609
+ "step": 200
610
+ },
611
+ {
612
+ "epoch": 27.33,
613
+ "grad_norm": 3.8683507442474365,
614
+ "learning_rate": 4.1428571428571437e-05,
615
+ "loss": 0.3243,
616
+ "step": 205
617
+ },
618
+ {
619
+ "epoch": 27.33,
620
+ "eval_loss": 1.0470303297042847,
621
+ "eval_runtime": 17.0718,
622
+ "eval_samples_per_second": 0.41,
623
+ "eval_steps_per_second": 0.059,
624
+ "step": 205
625
+ },
626
+ {
627
+ "epoch": 28.0,
628
+ "grad_norm": 4.535538196563721,
629
+ "learning_rate": 4e-05,
630
+ "loss": 0.3202,
631
+ "step": 210
632
+ },
633
+ {
634
+ "epoch": 28.0,
635
+ "eval_loss": 1.025539517402649,
636
+ "eval_runtime": 17.0604,
637
+ "eval_samples_per_second": 0.41,
638
+ "eval_steps_per_second": 0.059,
639
+ "step": 210
640
+ },
641
+ {
642
+ "epoch": 28.67,
643
+ "grad_norm": 2.6224355697631836,
644
+ "learning_rate": 3.857142857142858e-05,
645
+ "loss": 0.2958,
646
+ "step": 215
647
+ },
648
+ {
649
+ "epoch": 28.67,
650
+ "eval_loss": 1.0192126035690308,
651
+ "eval_runtime": 17.0657,
652
+ "eval_samples_per_second": 0.41,
653
+ "eval_steps_per_second": 0.059,
654
+ "step": 215
655
+ },
656
+ {
657
+ "epoch": 29.33,
658
+ "grad_norm": 2.5870041847229004,
659
+ "learning_rate": 3.7142857142857143e-05,
660
+ "loss": 0.2783,
661
+ "step": 220
662
+ },
663
+ {
664
+ "epoch": 29.33,
665
+ "eval_loss": 1.0211580991744995,
666
+ "eval_runtime": 17.0857,
667
+ "eval_samples_per_second": 0.41,
668
+ "eval_steps_per_second": 0.059,
669
+ "step": 220
670
+ },
671
+ {
672
+ "epoch": 30.0,
673
+ "grad_norm": 3.4565751552581787,
674
+ "learning_rate": 3.571428571428572e-05,
675
+ "loss": 0.2773,
676
+ "step": 225
677
+ },
678
+ {
679
+ "epoch": 30.0,
680
+ "eval_loss": 1.006419062614441,
681
+ "eval_runtime": 17.0807,
682
+ "eval_samples_per_second": 0.41,
683
+ "eval_steps_per_second": 0.059,
684
+ "step": 225
685
+ },
686
+ {
687
+ "epoch": 30.67,
688
+ "grad_norm": 2.4756500720977783,
689
+ "learning_rate": 3.428571428571429e-05,
690
+ "loss": 0.2482,
691
+ "step": 230
692
+ },
693
+ {
694
+ "epoch": 30.67,
695
+ "eval_loss": 1.0081219673156738,
696
+ "eval_runtime": 17.0576,
697
+ "eval_samples_per_second": 0.41,
698
+ "eval_steps_per_second": 0.059,
699
+ "step": 230
700
+ },
701
+ {
702
+ "epoch": 31.33,
703
+ "grad_norm": 2.38002610206604,
704
+ "learning_rate": 3.285714285714286e-05,
705
+ "loss": 0.2464,
706
+ "step": 235
707
+ },
708
+ {
709
+ "epoch": 31.33,
710
+ "eval_loss": 1.0151804685592651,
711
+ "eval_runtime": 17.0587,
712
+ "eval_samples_per_second": 0.41,
713
+ "eval_steps_per_second": 0.059,
714
+ "step": 235
715
+ },
716
+ {
717
+ "epoch": 32.0,
718
+ "grad_norm": 3.7081105709075928,
719
+ "learning_rate": 3.142857142857143e-05,
720
+ "loss": 0.2442,
721
+ "step": 240
722
+ },
723
+ {
724
+ "epoch": 32.0,
725
+ "eval_loss": 1.0032445192337036,
726
+ "eval_runtime": 17.1613,
727
+ "eval_samples_per_second": 0.408,
728
+ "eval_steps_per_second": 0.058,
729
+ "step": 240
730
+ },
731
+ {
732
+ "epoch": 32.67,
733
+ "grad_norm": 2.55924391746521,
734
+ "learning_rate": 3e-05,
735
+ "loss": 0.2193,
736
+ "step": 245
737
+ },
738
+ {
739
+ "epoch": 32.67,
740
+ "eval_loss": 0.9989615082740784,
741
+ "eval_runtime": 17.0447,
742
+ "eval_samples_per_second": 0.411,
743
+ "eval_steps_per_second": 0.059,
744
+ "step": 245
745
+ },
746
+ {
747
+ "epoch": 33.33,
748
+ "grad_norm": 1.9451407194137573,
749
+ "learning_rate": 2.857142857142857e-05,
750
+ "loss": 0.2101,
751
+ "step": 250
752
+ },
753
+ {
754
+ "epoch": 33.33,
755
+ "eval_loss": 1.0029457807540894,
756
+ "eval_runtime": 17.0816,
757
+ "eval_samples_per_second": 0.41,
758
+ "eval_steps_per_second": 0.059,
759
+ "step": 250
760
+ },
761
+ {
762
+ "epoch": 34.0,
763
+ "grad_norm": 2.713731527328491,
764
+ "learning_rate": 2.714285714285714e-05,
765
+ "loss": 0.2194,
766
+ "step": 255
767
+ },
768
+ {
769
+ "epoch": 34.0,
770
+ "eval_loss": 0.9959421753883362,
771
+ "eval_runtime": 17.0747,
772
+ "eval_samples_per_second": 0.41,
773
+ "eval_steps_per_second": 0.059,
774
+ "step": 255
775
+ },
776
+ {
777
+ "epoch": 34.67,
778
+ "grad_norm": 2.1633846759796143,
779
+ "learning_rate": 2.5714285714285714e-05,
780
+ "loss": 0.1958,
781
+ "step": 260
782
+ },
783
+ {
784
+ "epoch": 34.67,
785
+ "eval_loss": 0.9989770650863647,
786
+ "eval_runtime": 17.0821,
787
+ "eval_samples_per_second": 0.41,
788
+ "eval_steps_per_second": 0.059,
789
+ "step": 260
790
+ },
791
+ {
792
+ "epoch": 35.33,
793
+ "grad_norm": 3.9233529567718506,
794
+ "learning_rate": 2.4285714285714288e-05,
795
+ "loss": 0.1831,
796
+ "step": 265
797
+ },
798
+ {
799
+ "epoch": 35.33,
800
+ "eval_loss": 1.0072578191757202,
801
+ "eval_runtime": 17.0564,
802
+ "eval_samples_per_second": 0.41,
803
+ "eval_steps_per_second": 0.059,
804
+ "step": 265
805
+ },
806
+ {
807
+ "epoch": 36.0,
808
+ "grad_norm": 2.4143056869506836,
809
+ "learning_rate": 2.2857142857142858e-05,
810
+ "loss": 0.1753,
811
+ "step": 270
812
+ },
813
+ {
814
+ "epoch": 36.0,
815
+ "eval_loss": 0.9938892722129822,
816
+ "eval_runtime": 17.0668,
817
+ "eval_samples_per_second": 0.41,
818
+ "eval_steps_per_second": 0.059,
819
+ "step": 270
820
+ },
821
+ {
822
+ "epoch": 36.67,
823
+ "grad_norm": 2.706679582595825,
824
+ "learning_rate": 2.1428571428571428e-05,
825
+ "loss": 0.1698,
826
+ "step": 275
827
+ },
828
+ {
829
+ "epoch": 36.67,
830
+ "eval_loss": 0.9969200491905212,
831
+ "eval_runtime": 17.0643,
832
+ "eval_samples_per_second": 0.41,
833
+ "eval_steps_per_second": 0.059,
834
+ "step": 275
835
+ },
836
+ {
837
+ "epoch": 37.33,
838
+ "grad_norm": 1.872753620147705,
839
+ "learning_rate": 2e-05,
840
+ "loss": 0.16,
841
+ "step": 280
842
+ },
843
+ {
844
+ "epoch": 37.33,
845
+ "eval_loss": 0.9940390586853027,
846
+ "eval_runtime": 17.0728,
847
+ "eval_samples_per_second": 0.41,
848
+ "eval_steps_per_second": 0.059,
849
+ "step": 280
850
+ },
851
+ {
852
+ "epoch": 38.0,
853
+ "grad_norm": 2.7510581016540527,
854
+ "learning_rate": 1.8571428571428572e-05,
855
+ "loss": 0.1614,
856
+ "step": 285
857
+ },
858
+ {
859
+ "epoch": 38.0,
860
+ "eval_loss": 1.0066231489181519,
861
+ "eval_runtime": 17.072,
862
+ "eval_samples_per_second": 0.41,
863
+ "eval_steps_per_second": 0.059,
864
+ "step": 285
865
+ },
866
+ {
867
+ "epoch": 38.67,
868
+ "grad_norm": 1.8461092710494995,
869
+ "learning_rate": 1.7142857142857145e-05,
870
+ "loss": 0.1506,
871
+ "step": 290
872
+ },
873
+ {
874
+ "epoch": 38.67,
875
+ "eval_loss": 0.9927281737327576,
876
+ "eval_runtime": 17.0481,
877
+ "eval_samples_per_second": 0.411,
878
+ "eval_steps_per_second": 0.059,
879
+ "step": 290
880
+ },
881
+ {
882
+ "epoch": 39.33,
883
+ "grad_norm": 1.8425017595291138,
884
+ "learning_rate": 1.5714285714285715e-05,
885
+ "loss": 0.1419,
886
+ "step": 295
887
+ },
888
+ {
889
+ "epoch": 39.33,
890
+ "eval_loss": 1.0133570432662964,
891
+ "eval_runtime": 17.0642,
892
+ "eval_samples_per_second": 0.41,
893
+ "eval_steps_per_second": 0.059,
894
+ "step": 295
895
+ },
896
+ {
897
+ "epoch": 40.0,
898
+ "grad_norm": 2.0457987785339355,
899
+ "learning_rate": 1.4285714285714285e-05,
900
+ "loss": 0.1459,
901
+ "step": 300
902
+ },
903
+ {
904
+ "epoch": 40.0,
905
+ "eval_loss": 1.0127934217453003,
906
+ "eval_runtime": 17.0581,
907
+ "eval_samples_per_second": 0.41,
908
+ "eval_steps_per_second": 0.059,
909
+ "step": 300
910
+ },
911
+ {
912
+ "epoch": 40.67,
913
+ "grad_norm": 1.5630775690078735,
914
+ "learning_rate": 1.2857142857142857e-05,
915
+ "loss": 0.1225,
916
+ "step": 305
917
+ },
918
+ {
919
+ "epoch": 40.67,
920
+ "eval_loss": 1.0092624425888062,
921
+ "eval_runtime": 17.0483,
922
+ "eval_samples_per_second": 0.411,
923
+ "eval_steps_per_second": 0.059,
924
+ "step": 305
925
+ },
926
+ {
927
+ "epoch": 41.33,
928
+ "grad_norm": 1.37598717212677,
929
+ "learning_rate": 1.1428571428571429e-05,
930
+ "loss": 0.146,
931
+ "step": 310
932
+ },
933
+ {
934
+ "epoch": 41.33,
935
+ "eval_loss": 1.0083317756652832,
936
+ "eval_runtime": 17.0804,
937
+ "eval_samples_per_second": 0.41,
938
+ "eval_steps_per_second": 0.059,
939
+ "step": 310
940
+ },
941
+ {
942
+ "epoch": 42.0,
943
+ "grad_norm": 1.8867217302322388,
944
+ "learning_rate": 1e-05,
945
+ "loss": 0.13,
946
+ "step": 315
947
+ },
948
+ {
949
+ "epoch": 42.0,
950
+ "eval_loss": 1.0165104866027832,
951
+ "eval_runtime": 17.0621,
952
+ "eval_samples_per_second": 0.41,
953
+ "eval_steps_per_second": 0.059,
954
+ "step": 315
955
+ },
956
+ {
957
+ "epoch": 42.67,
958
+ "grad_norm": 1.4643555879592896,
959
+ "learning_rate": 8.571428571428573e-06,
960
+ "loss": 0.131,
961
+ "step": 320
962
+ },
963
+ {
964
+ "epoch": 42.67,
965
+ "eval_loss": 1.0264887809753418,
966
+ "eval_runtime": 17.0554,
967
+ "eval_samples_per_second": 0.41,
968
+ "eval_steps_per_second": 0.059,
969
+ "step": 320
970
+ },
971
+ {
972
+ "epoch": 43.33,
973
+ "grad_norm": 1.4163501262664795,
974
+ "learning_rate": 7.142857142857143e-06,
975
+ "loss": 0.1205,
976
+ "step": 325
977
+ },
978
+ {
979
+ "epoch": 43.33,
980
+ "eval_loss": 1.0175670385360718,
981
+ "eval_runtime": 17.0692,
982
+ "eval_samples_per_second": 0.41,
983
+ "eval_steps_per_second": 0.059,
984
+ "step": 325
985
+ },
986
+ {
987
+ "epoch": 44.0,
988
+ "grad_norm": 1.730556607246399,
989
+ "learning_rate": 5.7142857142857145e-06,
990
+ "loss": 0.1143,
991
+ "step": 330
992
+ },
993
+ {
994
+ "epoch": 44.0,
995
+ "eval_loss": 1.0272893905639648,
996
+ "eval_runtime": 17.0651,
997
+ "eval_samples_per_second": 0.41,
998
+ "eval_steps_per_second": 0.059,
999
+ "step": 330
1000
+ },
1001
+ {
1002
+ "epoch": 44.67,
1003
+ "grad_norm": 1.451904058456421,
1004
+ "learning_rate": 4.285714285714286e-06,
1005
+ "loss": 0.1087,
1006
+ "step": 335
1007
+ },
1008
+ {
1009
+ "epoch": 44.67,
1010
+ "eval_loss": 1.0272446870803833,
1011
+ "eval_runtime": 17.0604,
1012
+ "eval_samples_per_second": 0.41,
1013
+ "eval_steps_per_second": 0.059,
1014
+ "step": 335
1015
+ },
1016
+ {
1017
+ "epoch": 45.33,
1018
+ "grad_norm": 1.1883801221847534,
1019
+ "learning_rate": 2.8571428571428573e-06,
1020
+ "loss": 0.1173,
1021
+ "step": 340
1022
+ },
1023
+ {
1024
+ "epoch": 45.33,
1025
+ "eval_loss": 1.0314446687698364,
1026
+ "eval_runtime": 17.0652,
1027
+ "eval_samples_per_second": 0.41,
1028
+ "eval_steps_per_second": 0.059,
1029
+ "step": 340
1030
+ },
1031
+ {
1032
+ "epoch": 46.0,
1033
+ "grad_norm": 1.514420747756958,
1034
+ "learning_rate": 1.4285714285714286e-06,
1035
+ "loss": 0.1227,
1036
+ "step": 345
1037
+ },
1038
+ {
1039
+ "epoch": 46.0,
1040
+ "eval_loss": 1.0313900709152222,
1041
+ "eval_runtime": 17.0795,
1042
+ "eval_samples_per_second": 0.41,
1043
+ "eval_steps_per_second": 0.059,
1044
+ "step": 345
1045
+ },
1046
+ {
1047
+ "epoch": 46.67,
1048
+ "grad_norm": 1.1168299913406372,
1049
+ "learning_rate": 0.0,
1050
+ "loss": 0.1133,
1051
+ "step": 350
1052
+ },
1053
+ {
1054
+ "epoch": 46.67,
1055
+ "eval_loss": 1.032753586769104,
1056
+ "eval_runtime": 17.0596,
1057
+ "eval_samples_per_second": 0.41,
1058
+ "eval_steps_per_second": 0.059,
1059
+ "step": 350
1060
+ }
1061
+ ],
1062
+ "logging_steps": 5,
1063
+ "max_steps": 350,
1064
+ "num_input_tokens_seen": 0,
1065
+ "num_train_epochs": 50,
1066
+ "save_steps": 10,
1067
+ "total_flos": 2.325932434400477e+18,
1068
+ "train_batch_size": 2,
1069
+ "trial_name": null,
1070
+ "trial_params": null
1071
+ }
checkpoint-350/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ed0ba6fb9a88dad56f61d9306f17b4e66e8767d898772faa97871a1388e82cf
3
+ size 4920
config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "meta-llama/Llama-2-70b-hf",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 1,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 8192,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 28672,
14
+ "max_position_embeddings": 4096,
15
+ "model_type": "llama",
16
+ "num_attention_heads": 64,
17
+ "num_hidden_layers": 80,
18
+ "num_key_value_heads": 8,
19
+ "pretraining_tp": 1,
20
+ "quantization_config": {
21
+ "_load_in_4bit": false,
22
+ "_load_in_8bit": false,
23
+ "bnb_4bit_compute_dtype": "float32",
24
+ "bnb_4bit_quant_type": "fp4",
25
+ "bnb_4bit_use_double_quant": false,
26
+ "llm_int8_enable_fp32_cpu_offload": false,
27
+ "llm_int8_has_fp16_weight": false,
28
+ "llm_int8_skip_modules": null,
29
+ "llm_int8_threshold": 6.0,
30
+ "load_in_4bit": false,
31
+ "load_in_8bit": false,
32
+ "quant_method": "bitsandbytes"
33
+ },
34
+ "rms_norm_eps": 1e-05,
35
+ "rope_scaling": null,
36
+ "rope_theta": 10000.0,
37
+ "tie_word_embeddings": false,
38
+ "torch_dtype": "float16",
39
+ "transformers_version": "4.38.1",
40
+ "use_cache": true,
41
+ "vocab_size": 32000
42
+ }
logs/events.out.tfevents.1699989718.node0370.palmetto.clemson.edu.1119956.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8338f7ab492bb36430b90fde21a7822be3a5f94276aa93f72ad83aeaf872e40
3
+ size 4868
logs/events.out.tfevents.1699989813.node0370.palmetto.clemson.edu.1119956.4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4773b11c0e593bac1fe819e5c6324bb2a3542c475a7d375bb608e46408b2b009
3
+ size 4602
logs/events.out.tfevents.1699994287.node0370.palmetto.clemson.edu.1126403.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edff87033b1a80ab70d0a0af49d7943790a698e659a0875f929d78a3299c699d
3
+ size 7962
logs/events.out.tfevents.1699995040.node0370.palmetto.clemson.edu.1127435.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f054998bd37e1977597fdb66e495fb58c70a3b0042fa2ef4428756252ee49a4c
3
+ size 33937
logs/events.out.tfevents.1700064888.node0277.palmetto.clemson.edu.1971495.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:587a2145d6c92f9cbc7c114d9b2d452e13239b0a925f3bdcaa5631c48bdca19a
3
+ size 9908
logs/events.out.tfevents.1700066157.node0277.palmetto.clemson.edu.1973537.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5fcffea7a0e86207d5c20daaabb71040aa71cb529a4bbddc4a309e37d9666f12
3
+ size 14528