pszemraj committed on
Commit
2b4ffac
1 Parent(s): 67a526d

End of training

Browse files
Files changed (5) hide show
  1. README.md +1 -1
  2. all_results.json +17 -0
  3. eval_results.json +11 -0
  4. train_results.json +10 -0
  5. trainer_state.json +297 -0
README.md CHANGED
@@ -16,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
16
  [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/pszemraj/long-generation-tests/runs/ethp25f9)
17
  # Qwen2-1.5B-stepbasin-books-vN
18
 
19
- This model is a fine-tuned version of [Qwen/Qwen2-1.5B](https://huggingface.co/Qwen/Qwen2-1.5B) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
  - Loss: 2.8110
22
  - Accuracy: 0.4298
 
16
  [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/pszemraj/long-generation-tests/runs/ethp25f9)
17
  # Qwen2-1.5B-stepbasin-books-vN
18
 
19
+ This model is a fine-tuned version of [Qwen/Qwen2-1.5B](https://huggingface.co/Qwen/Qwen2-1.5B) on the BEE-spoke-data/stepbasin-books dataset.
20
  It achieves the following results on the evaluation set:
21
  - Loss: 2.8110
22
  - Accuracy: 0.4298
all_results.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.9899888765294773,
3
+ "eval_accuracy": 0.4297979192055684,
4
+ "eval_loss": 2.810983180999756,
5
+ "eval_runtime": 19.3217,
6
+ "eval_samples": 29,
7
+ "eval_samples_per_second": 1.501,
8
+ "eval_steps_per_second": 1.501,
9
+ "num_input_tokens_seen": 44040192,
10
+ "perplexity": 16.62625680536472,
11
+ "total_flos": 3.462459117703004e+17,
12
+ "train_loss": 2.7097946518943425,
13
+ "train_runtime": 6795.1554,
14
+ "train_samples": 899,
15
+ "train_samples_per_second": 0.397,
16
+ "train_steps_per_second": 0.012
17
+ }
eval_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.9899888765294773,
3
+ "eval_accuracy": 0.4297979192055684,
4
+ "eval_loss": 2.810983180999756,
5
+ "eval_runtime": 19.3217,
6
+ "eval_samples": 29,
7
+ "eval_samples_per_second": 1.501,
8
+ "eval_steps_per_second": 1.501,
9
+ "num_input_tokens_seen": 44040192,
10
+ "perplexity": 16.62625680536472
11
+ }
train_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.9899888765294773,
3
+ "num_input_tokens_seen": 44040192,
4
+ "total_flos": 3.462459117703004e+17,
5
+ "train_loss": 2.7097946518943425,
6
+ "train_runtime": 6795.1554,
7
+ "train_samples": 899,
8
+ "train_samples_per_second": 0.397,
9
+ "train_steps_per_second": 0.012
10
+ }
trainer_state.json ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.9899888765294773,
5
+ "eval_steps": 300.0,
6
+ "global_step": 84,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.1067853170189099,
13
+ "grad_norm": 0.5859375,
14
+ "learning_rate": 1.8e-05,
15
+ "loss": 2.8121,
16
+ "num_input_tokens_seen": 1572864,
17
+ "step": 3
18
+ },
19
+ {
20
+ "epoch": 0.2135706340378198,
21
+ "grad_norm": 0.69921875,
22
+ "learning_rate": 2.9988140958409528e-05,
23
+ "loss": 2.8211,
24
+ "num_input_tokens_seen": 3145728,
25
+ "step": 6
26
+ },
27
+ {
28
+ "epoch": 0.3203559510567297,
29
+ "grad_norm": 0.796875,
30
+ "learning_rate": 2.9810630129045003e-05,
31
+ "loss": 2.7286,
32
+ "num_input_tokens_seen": 4718592,
33
+ "step": 9
34
+ },
35
+ {
36
+ "epoch": 0.4271412680756396,
37
+ "grad_norm": 0.71875,
38
+ "learning_rate": 2.9422573564911305e-05,
39
+ "loss": 2.7688,
40
+ "num_input_tokens_seen": 6291456,
41
+ "step": 12
42
+ },
43
+ {
44
+ "epoch": 0.5339265850945495,
45
+ "grad_norm": 0.58984375,
46
+ "learning_rate": 2.88294878209231e-05,
47
+ "loss": 2.756,
48
+ "num_input_tokens_seen": 7864320,
49
+ "step": 15
50
+ },
51
+ {
52
+ "epoch": 0.6407119021134594,
53
+ "grad_norm": 0.54296875,
54
+ "learning_rate": 2.8039804116593743e-05,
55
+ "loss": 2.7252,
56
+ "num_input_tokens_seen": 9437184,
57
+ "step": 18
58
+ },
59
+ {
60
+ "epoch": 0.7474972191323693,
61
+ "grad_norm": 0.54296875,
62
+ "learning_rate": 2.7064748479061476e-05,
63
+ "loss": 2.7188,
64
+ "num_input_tokens_seen": 11010048,
65
+ "step": 21
66
+ },
67
+ {
68
+ "epoch": 0.8542825361512792,
69
+ "grad_norm": 0.484375,
70
+ "learning_rate": 2.5918182155542415e-05,
71
+ "loss": 2.7574,
72
+ "num_input_tokens_seen": 12582912,
73
+ "step": 24
74
+ },
75
+ {
76
+ "epoch": 0.9610678531701891,
77
+ "grad_norm": 0.5,
78
+ "learning_rate": 2.4616404563883302e-05,
79
+ "loss": 2.7792,
80
+ "num_input_tokens_seen": 14155776,
81
+ "step": 27
82
+ },
83
+ {
84
+ "epoch": 0.996662958843159,
85
+ "eval_accuracy": 0.4287265815910942,
86
+ "eval_loss": 2.818293809890747,
87
+ "eval_runtime": 19.7558,
88
+ "eval_samples_per_second": 1.468,
89
+ "eval_steps_per_second": 1.468,
90
+ "num_input_tokens_seen": 14729216,
91
+ "step": 28
92
+ },
93
+ {
94
+ "epoch": 1.067853170189099,
95
+ "grad_norm": 0.50390625,
96
+ "learning_rate": 2.3177921582440015e-05,
97
+ "loss": 2.6953,
98
+ "num_input_tokens_seen": 15728640,
99
+ "step": 30
100
+ },
101
+ {
102
+ "epoch": 1.174638487208009,
103
+ "grad_norm": 0.462890625,
104
+ "learning_rate": 2.162318247323868e-05,
105
+ "loss": 2.7098,
106
+ "num_input_tokens_seen": 17301504,
107
+ "step": 33
108
+ },
109
+ {
110
+ "epoch": 1.2814238042269188,
111
+ "grad_norm": 0.451171875,
112
+ "learning_rate": 1.997428917828102e-05,
113
+ "loss": 2.6726,
114
+ "num_input_tokens_seen": 18874368,
115
+ "step": 36
116
+ },
117
+ {
118
+ "epoch": 1.3882091212458287,
119
+ "grad_norm": 0.462890625,
120
+ "learning_rate": 1.825468212159477e-05,
121
+ "loss": 2.6655,
122
+ "num_input_tokens_seen": 20447232,
123
+ "step": 39
124
+ },
125
+ {
126
+ "epoch": 1.4949944382647387,
127
+ "grad_norm": 0.455078125,
128
+ "learning_rate": 1.6488806983620927e-05,
129
+ "loss": 2.7292,
130
+ "num_input_tokens_seen": 22020096,
131
+ "step": 42
132
+ },
133
+ {
134
+ "epoch": 1.6017797552836486,
135
+ "grad_norm": 0.4453125,
136
+ "learning_rate": 1.4701767185023948e-05,
137
+ "loss": 2.6312,
138
+ "num_input_tokens_seen": 23592960,
139
+ "step": 45
140
+ },
141
+ {
142
+ "epoch": 1.7085650723025583,
143
+ "grad_norm": 0.43359375,
144
+ "learning_rate": 1.2918967020163978e-05,
145
+ "loss": 2.7176,
146
+ "num_input_tokens_seen": 25165824,
147
+ "step": 48
148
+ },
149
+ {
150
+ "epoch": 1.8153503893214684,
151
+ "grad_norm": 0.4453125,
152
+ "learning_rate": 1.116575051339288e-05,
153
+ "loss": 2.7234,
154
+ "num_input_tokens_seen": 26738688,
155
+ "step": 51
156
+ },
157
+ {
158
+ "epoch": 1.9221357063403781,
159
+ "grad_norm": 0.4375,
160
+ "learning_rate": 9.467041132139884e-06,
161
+ "loss": 2.6971,
162
+ "num_input_tokens_seen": 28311552,
163
+ "step": 54
164
+ },
165
+ {
166
+ "epoch": 1.993325917686318,
167
+ "eval_accuracy": 0.4297116228554831,
168
+ "eval_loss": 2.8111748695373535,
169
+ "eval_runtime": 19.5837,
170
+ "eval_samples_per_second": 1.481,
171
+ "eval_steps_per_second": 1.481,
172
+ "num_input_tokens_seen": 29458432,
173
+ "step": 56
174
+ },
175
+ {
176
+ "epoch": 2.0289210233592883,
177
+ "grad_norm": 0.41796875,
178
+ "learning_rate": 7.846987478572411e-06,
179
+ "loss": 2.6623,
180
+ "num_input_tokens_seen": 29884416,
181
+ "step": 57
182
+ },
183
+ {
184
+ "epoch": 2.135706340378198,
185
+ "grad_norm": 0.396484375,
186
+ "learning_rate": 6.328619996627272e-06,
187
+ "loss": 2.6211,
188
+ "num_input_tokens_seen": 31457280,
189
+ "step": 60
190
+ },
191
+ {
192
+ "epoch": 2.242491657397108,
193
+ "grad_norm": 0.416015625,
194
+ "learning_rate": 4.933523574614447e-06,
195
+ "loss": 2.6728,
196
+ "num_input_tokens_seen": 33030144,
197
+ "step": 63
198
+ },
199
+ {
200
+ "epoch": 2.349276974416018,
201
+ "grad_norm": 0.41015625,
202
+ "learning_rate": 3.6815306976265466e-06,
203
+ "loss": 2.6567,
204
+ "num_input_tokens_seen": 34603008,
205
+ "step": 66
206
+ },
207
+ {
208
+ "epoch": 2.456062291434928,
209
+ "grad_norm": 0.435546875,
210
+ "learning_rate": 2.590439511854144e-06,
211
+ "loss": 2.6862,
212
+ "num_input_tokens_seen": 36175872,
213
+ "step": 69
214
+ },
215
+ {
216
+ "epoch": 2.5628476084538376,
217
+ "grad_norm": 0.419921875,
218
+ "learning_rate": 1.6757608087630249e-06,
219
+ "loss": 2.6841,
220
+ "num_input_tokens_seen": 37748736,
221
+ "step": 72
222
+ },
223
+ {
224
+ "epoch": 2.6696329254727473,
225
+ "grad_norm": 0.40625,
226
+ "learning_rate": 9.504975259690835e-07,
227
+ "loss": 2.7272,
228
+ "num_input_tokens_seen": 39321600,
229
+ "step": 75
230
+ },
231
+ {
232
+ "epoch": 2.7764182424916575,
233
+ "grad_norm": 0.40625,
234
+ "learning_rate": 4.2495989939384916e-07,
235
+ "loss": 2.6643,
236
+ "num_input_tokens_seen": 40894464,
237
+ "step": 78
238
+ },
239
+ {
240
+ "epoch": 2.883203559510567,
241
+ "grad_norm": 0.408203125,
242
+ "learning_rate": 1.0661889447039886e-07,
243
+ "loss": 2.679,
244
+ "num_input_tokens_seen": 42467328,
245
+ "step": 81
246
+ },
247
+ {
248
+ "epoch": 2.9899888765294773,
249
+ "grad_norm": 0.412109375,
250
+ "learning_rate": 0.0,
251
+ "loss": 2.7116,
252
+ "num_input_tokens_seen": 44040192,
253
+ "step": 84
254
+ },
255
+ {
256
+ "epoch": 2.9899888765294773,
257
+ "eval_accuracy": 0.4297979192055684,
258
+ "eval_loss": 2.810983180999756,
259
+ "eval_runtime": 19.4794,
260
+ "eval_samples_per_second": 1.489,
261
+ "eval_steps_per_second": 1.489,
262
+ "num_input_tokens_seen": 44040192,
263
+ "step": 84
264
+ },
265
+ {
266
+ "epoch": 2.9899888765294773,
267
+ "num_input_tokens_seen": 44040192,
268
+ "step": 84,
269
+ "total_flos": 3.462459117703004e+17,
270
+ "train_loss": 2.7097946518943425,
271
+ "train_runtime": 6795.1554,
272
+ "train_samples_per_second": 0.397,
273
+ "train_steps_per_second": 0.012
274
+ }
275
+ ],
276
+ "logging_steps": 3,
277
+ "max_steps": 84,
278
+ "num_input_tokens_seen": 44040192,
279
+ "num_train_epochs": 3,
280
+ "save_steps": 50,
281
+ "stateful_callbacks": {
282
+ "TrainerControl": {
283
+ "args": {
284
+ "should_epoch_stop": false,
285
+ "should_evaluate": false,
286
+ "should_log": false,
287
+ "should_save": true,
288
+ "should_training_stop": true
289
+ },
290
+ "attributes": {}
291
+ }
292
+ },
293
+ "total_flos": 3.462459117703004e+17,
294
+ "train_batch_size": 1,
295
+ "trial_name": null,
296
+ "trial_params": null
297
+ }