ikno committed on
Commit
b7b1651
1 Parent(s): f550df5

Model save

Browse files
Files changed (4) hide show
  1. README.md +69 -0
  2. all_results.json +9 -0
  3. train_results.json +9 -0
  4. trainer_state.json +413 -0
README.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ library_name: peft
4
+ tags:
5
+ - trl
6
+ - sft
7
+ - generated_from_trainer
8
+ base_model: meta-llama/Meta-Llama-3-8B-Instruct
9
+ model-index:
10
+ - name: rinko_300_labeling
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # rinko_300_labeling
18
+
19
+ This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) on an unknown dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 2.0068
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
+ More information needed
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 2e-06
41
+ - train_batch_size: 4
42
+ - eval_batch_size: 8
43
+ - seed: 42
44
+ - distributed_type: multi-GPU
45
+ - gradient_accumulation_steps: 2
46
+ - total_train_batch_size: 8
47
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
48
+ - lr_scheduler_type: cosine
49
+ - lr_scheduler_warmup_ratio: 0.1
50
+ - num_epochs: 5
51
+
52
+ ### Training results
53
+
54
+ | Training Loss | Epoch | Step | Validation Loss |
55
+ |:-------------:|:------:|:----:|:---------------:|
56
+ | 2.3912 | 0.9897 | 48 | 2.2464 |
57
+ | 2.2442 | 2.0 | 97 | 2.1167 |
58
+ | 2.1047 | 2.9897 | 145 | 2.0317 |
59
+ | 2.05 | 4.0 | 194 | 2.0067 |
60
+ | 2.0626 | 4.9485 | 240 | 2.0068 |
61
+
62
+
63
+ ### Framework versions
64
+
65
+ - PEFT 0.7.1
66
+ - Transformers 4.40.1
67
+ - Pytorch 2.2.1+cu121
68
+ - Datasets 2.19.0
69
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 4.948453608247423,
3
+ "total_flos": 1.0715672433026662e+17,
4
+ "train_loss": 2.205865615606308,
5
+ "train_runtime": 7479.3037,
6
+ "train_samples": 385,
7
+ "train_samples_per_second": 0.257,
8
+ "train_steps_per_second": 0.032
9
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 4.948453608247423,
3
+ "total_flos": 1.0715672433026662e+17,
4
+ "train_loss": 2.205865615606308,
5
+ "train_runtime": 7479.3037,
6
+ "train_samples": 385,
7
+ "train_samples_per_second": 0.257,
8
+ "train_steps_per_second": 0.032
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 4.948453608247423,
5
+ "eval_steps": 500,
6
+ "global_step": 240,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.020618556701030927,
13
+ "grad_norm": 1.546875,
14
+ "learning_rate": 8.333333333333333e-08,
15
+ "loss": 2.5196,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.10309278350515463,
20
+ "grad_norm": 1.5390625,
21
+ "learning_rate": 4.1666666666666667e-07,
22
+ "loss": 2.458,
23
+ "step": 5
24
+ },
25
+ {
26
+ "epoch": 0.20618556701030927,
27
+ "grad_norm": 1.4375,
28
+ "learning_rate": 8.333333333333333e-07,
29
+ "loss": 2.4466,
30
+ "step": 10
31
+ },
32
+ {
33
+ "epoch": 0.30927835051546393,
34
+ "grad_norm": 1.515625,
35
+ "learning_rate": 1.2499999999999999e-06,
36
+ "loss": 2.4248,
37
+ "step": 15
38
+ },
39
+ {
40
+ "epoch": 0.41237113402061853,
41
+ "grad_norm": 1.6171875,
42
+ "learning_rate": 1.6666666666666667e-06,
43
+ "loss": 2.4622,
44
+ "step": 20
45
+ },
46
+ {
47
+ "epoch": 0.5154639175257731,
48
+ "grad_norm": 1.4921875,
49
+ "learning_rate": 1.9998942319271077e-06,
50
+ "loss": 2.4147,
51
+ "step": 25
52
+ },
53
+ {
54
+ "epoch": 0.6185567010309279,
55
+ "grad_norm": 1.46875,
56
+ "learning_rate": 1.9961946980917456e-06,
57
+ "loss": 2.3747,
58
+ "step": 30
59
+ },
60
+ {
61
+ "epoch": 0.7216494845360825,
62
+ "grad_norm": 1.453125,
63
+ "learning_rate": 1.987229113117374e-06,
64
+ "loss": 2.4189,
65
+ "step": 35
66
+ },
67
+ {
68
+ "epoch": 0.8247422680412371,
69
+ "grad_norm": 1.578125,
70
+ "learning_rate": 1.9730448705798236e-06,
71
+ "loss": 2.4126,
72
+ "step": 40
73
+ },
74
+ {
75
+ "epoch": 0.9278350515463918,
76
+ "grad_norm": 1.4765625,
77
+ "learning_rate": 1.953716950748227e-06,
78
+ "loss": 2.3912,
79
+ "step": 45
80
+ },
81
+ {
82
+ "epoch": 0.9896907216494846,
83
+ "eval_loss": 2.246396064758301,
84
+ "eval_runtime": 1335.3788,
85
+ "eval_samples_per_second": 4.423,
86
+ "eval_steps_per_second": 0.553,
87
+ "step": 48
88
+ },
89
+ {
90
+ "epoch": 1.0309278350515463,
91
+ "grad_norm": 1.4453125,
92
+ "learning_rate": 1.929347524226822e-06,
93
+ "loss": 2.3653,
94
+ "step": 50
95
+ },
96
+ {
97
+ "epoch": 1.134020618556701,
98
+ "grad_norm": 1.546875,
99
+ "learning_rate": 1.900065411864121e-06,
100
+ "loss": 2.3567,
101
+ "step": 55
102
+ },
103
+ {
104
+ "epoch": 1.2371134020618557,
105
+ "grad_norm": 1.5,
106
+ "learning_rate": 1.8660254037844386e-06,
107
+ "loss": 2.3555,
108
+ "step": 60
109
+ },
110
+ {
111
+ "epoch": 1.3402061855670104,
112
+ "grad_norm": 1.6015625,
113
+ "learning_rate": 1.8274074411415103e-06,
114
+ "loss": 2.2988,
115
+ "step": 65
116
+ },
117
+ {
118
+ "epoch": 1.443298969072165,
119
+ "grad_norm": 1.53125,
120
+ "learning_rate": 1.7844156649195757e-06,
121
+ "loss": 2.288,
122
+ "step": 70
123
+ },
124
+ {
125
+ "epoch": 1.5463917525773194,
126
+ "grad_norm": 1.3671875,
127
+ "learning_rate": 1.737277336810124e-06,
128
+ "loss": 2.2871,
129
+ "step": 75
130
+ },
131
+ {
132
+ "epoch": 1.6494845360824741,
133
+ "grad_norm": 1.59375,
134
+ "learning_rate": 1.6862416378687337e-06,
135
+ "loss": 2.2903,
136
+ "step": 80
137
+ },
138
+ {
139
+ "epoch": 1.7525773195876289,
140
+ "grad_norm": 1.484375,
141
+ "learning_rate": 1.6315783513024974e-06,
142
+ "loss": 2.2465,
143
+ "step": 85
144
+ },
145
+ {
146
+ "epoch": 1.8556701030927836,
147
+ "grad_norm": 1.484375,
148
+ "learning_rate": 1.573576436351046e-06,
149
+ "loss": 2.2434,
150
+ "step": 90
151
+ },
152
+ {
153
+ "epoch": 1.9587628865979383,
154
+ "grad_norm": 1.421875,
155
+ "learning_rate": 1.5125425007998652e-06,
156
+ "loss": 2.2442,
157
+ "step": 95
158
+ },
159
+ {
160
+ "epoch": 2.0,
161
+ "eval_loss": 2.1167430877685547,
162
+ "eval_runtime": 1327.822,
163
+ "eval_samples_per_second": 4.449,
164
+ "eval_steps_per_second": 0.557,
165
+ "step": 97
166
+ },
167
+ {
168
+ "epoch": 2.0618556701030926,
169
+ "grad_norm": 1.4765625,
170
+ "learning_rate": 1.4487991802004622e-06,
171
+ "loss": 2.2931,
172
+ "step": 100
173
+ },
174
+ {
175
+ "epoch": 2.1649484536082473,
176
+ "grad_norm": 1.390625,
177
+ "learning_rate": 1.3826834323650898e-06,
178
+ "loss": 2.2062,
179
+ "step": 105
180
+ },
181
+ {
182
+ "epoch": 2.268041237113402,
183
+ "grad_norm": 1.4921875,
184
+ "learning_rate": 1.3145447561516136e-06,
185
+ "loss": 2.1501,
186
+ "step": 110
187
+ },
188
+ {
189
+ "epoch": 2.3711340206185567,
190
+ "grad_norm": 1.1875,
191
+ "learning_rate": 1.2447433439543238e-06,
192
+ "loss": 2.1248,
193
+ "step": 115
194
+ },
195
+ {
196
+ "epoch": 2.4742268041237114,
197
+ "grad_norm": 1.3984375,
198
+ "learning_rate": 1.1736481776669305e-06,
199
+ "loss": 2.1624,
200
+ "step": 120
201
+ },
202
+ {
203
+ "epoch": 2.5773195876288657,
204
+ "grad_norm": 1.2265625,
205
+ "learning_rate": 1.101635078182802e-06,
206
+ "loss": 2.1479,
207
+ "step": 125
208
+ },
209
+ {
210
+ "epoch": 2.680412371134021,
211
+ "grad_norm": 1.3359375,
212
+ "learning_rate": 1.0290847187431114e-06,
213
+ "loss": 2.1655,
214
+ "step": 130
215
+ },
216
+ {
217
+ "epoch": 2.783505154639175,
218
+ "grad_norm": 1.3203125,
219
+ "learning_rate": 9.56380612634664e-07,
220
+ "loss": 2.1369,
221
+ "step": 135
222
+ },
223
+ {
224
+ "epoch": 2.88659793814433,
225
+ "grad_norm": 1.28125,
226
+ "learning_rate": 8.839070858747696e-07,
227
+ "loss": 2.1003,
228
+ "step": 140
229
+ },
230
+ {
231
+ "epoch": 2.9896907216494846,
232
+ "grad_norm": 1.2109375,
233
+ "learning_rate": 8.120472455998881e-07,
234
+ "loss": 2.1047,
235
+ "step": 145
236
+ },
237
+ {
238
+ "epoch": 2.9896907216494846,
239
+ "eval_loss": 2.0316832065582275,
240
+ "eval_runtime": 1327.2975,
241
+ "eval_samples_per_second": 4.45,
242
+ "eval_steps_per_second": 0.557,
243
+ "step": 145
244
+ },
245
+ {
246
+ "epoch": 3.0927835051546393,
247
+ "grad_norm": 1.25,
248
+ "learning_rate": 7.411809548974791e-07,
249
+ "loss": 2.0675,
250
+ "step": 150
251
+ },
252
+ {
253
+ "epoch": 3.195876288659794,
254
+ "grad_norm": 1.3125,
255
+ "learning_rate": 6.71682824786439e-07,
256
+ "loss": 2.0934,
257
+ "step": 155
258
+ },
259
+ {
260
+ "epoch": 3.2989690721649483,
261
+ "grad_norm": 1.0234375,
262
+ "learning_rate": 6.039202339608431e-07,
263
+ "loss": 2.081,
264
+ "step": 160
265
+ },
266
+ {
267
+ "epoch": 3.402061855670103,
268
+ "grad_norm": 1.0625,
269
+ "learning_rate": 5.382513867649663e-07,
270
+ "loss": 2.0219,
271
+ "step": 165
272
+ },
273
+ {
274
+ "epoch": 3.5051546391752577,
275
+ "grad_norm": 1.3203125,
276
+ "learning_rate": 4.750234196654399e-07,
277
+ "loss": 2.0877,
278
+ "step": 170
279
+ },
280
+ {
281
+ "epoch": 3.6082474226804124,
282
+ "grad_norm": 1.2890625,
283
+ "learning_rate": 4.1457056623005947e-07,
284
+ "loss": 2.1019,
285
+ "step": 175
286
+ },
287
+ {
288
+ "epoch": 3.711340206185567,
289
+ "grad_norm": 1.1328125,
290
+ "learning_rate": 3.5721239031346063e-07,
291
+ "loss": 2.0828,
292
+ "step": 180
293
+ },
294
+ {
295
+ "epoch": 3.8144329896907214,
296
+ "grad_norm": 1.2421875,
297
+ "learning_rate": 3.032520967893453e-07,
298
+ "loss": 2.09,
299
+ "step": 185
300
+ },
301
+ {
302
+ "epoch": 3.917525773195876,
303
+ "grad_norm": 1.3125,
304
+ "learning_rate": 2.5297492875900415e-07,
305
+ "loss": 2.05,
306
+ "step": 190
307
+ },
308
+ {
309
+ "epoch": 4.0,
310
+ "eval_loss": 2.0067081451416016,
311
+ "eval_runtime": 1327.277,
312
+ "eval_samples_per_second": 4.45,
313
+ "eval_steps_per_second": 0.557,
314
+ "step": 194
315
+ },
316
+ {
317
+ "epoch": 4.020618556701031,
318
+ "grad_norm": 1.1796875,
319
+ "learning_rate": 2.0664665970876495e-07,
320
+ "loss": 2.1246,
321
+ "step": 195
322
+ },
323
+ {
324
+ "epoch": 4.123711340206185,
325
+ "grad_norm": 1.1796875,
326
+ "learning_rate": 1.6451218858706372e-07,
327
+ "loss": 2.0933,
328
+ "step": 200
329
+ },
330
+ {
331
+ "epoch": 4.22680412371134,
332
+ "grad_norm": 1.1328125,
333
+ "learning_rate": 1.2679424522780425e-07,
334
+ "loss": 2.0561,
335
+ "step": 205
336
+ },
337
+ {
338
+ "epoch": 4.329896907216495,
339
+ "grad_norm": 1.1640625,
340
+ "learning_rate": 9.369221296335006e-08,
341
+ "loss": 2.0946,
342
+ "step": 210
343
+ },
344
+ {
345
+ "epoch": 4.43298969072165,
346
+ "grad_norm": 1.125,
347
+ "learning_rate": 6.538107465101162e-08,
348
+ "loss": 2.0797,
349
+ "step": 215
350
+ },
351
+ {
352
+ "epoch": 4.536082474226804,
353
+ "grad_norm": 1.125,
354
+ "learning_rate": 4.20104876845111e-08,
355
+ "loss": 2.0907,
356
+ "step": 220
357
+ },
358
+ {
359
+ "epoch": 4.639175257731958,
360
+ "grad_norm": 1.171875,
361
+ "learning_rate": 2.3703992880066636e-08,
362
+ "loss": 2.1295,
363
+ "step": 225
364
+ },
365
+ {
366
+ "epoch": 4.742268041237113,
367
+ "grad_norm": 1.1640625,
368
+ "learning_rate": 1.0558361419055529e-08,
369
+ "loss": 2.0247,
370
+ "step": 230
371
+ },
372
+ {
373
+ "epoch": 4.845360824742268,
374
+ "grad_norm": 1.046875,
375
+ "learning_rate": 2.643083299427751e-09,
376
+ "loss": 2.0658,
377
+ "step": 235
378
+ },
379
+ {
380
+ "epoch": 4.948453608247423,
381
+ "grad_norm": 1.046875,
382
+ "learning_rate": 0.0,
383
+ "loss": 2.0626,
384
+ "step": 240
385
+ },
386
+ {
387
+ "epoch": 4.948453608247423,
388
+ "eval_loss": 2.0068044662475586,
389
+ "eval_runtime": 1327.4294,
390
+ "eval_samples_per_second": 4.45,
391
+ "eval_steps_per_second": 0.557,
392
+ "step": 240
393
+ },
394
+ {
395
+ "epoch": 4.948453608247423,
396
+ "step": 240,
397
+ "total_flos": 1.0715672433026662e+17,
398
+ "train_loss": 2.205865615606308,
399
+ "train_runtime": 7479.3037,
400
+ "train_samples_per_second": 0.257,
401
+ "train_steps_per_second": 0.032
402
+ }
403
+ ],
404
+ "logging_steps": 5,
405
+ "max_steps": 240,
406
+ "num_input_tokens_seen": 0,
407
+ "num_train_epochs": 5,
408
+ "save_steps": 1,
409
+ "total_flos": 1.0715672433026662e+17,
410
+ "train_batch_size": 4,
411
+ "trial_name": null,
412
+ "trial_params": null
413
+ }