Amr Keleg committed on
Commit
5a59f8d
1 Parent(s): 7cb5a13

Track the final version of the model

Browse files
Files changed (6) hide show
  1. optimizer.pt +1 -1
  2. pytorch_model.bin +1 -1
  3. rng_state.pth +2 -2
  4. scheduler.pt +1 -1
  5. trainer_state.json +262 -40
  6. training_args.bin +1 -1
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f2059e171b8ca906f726d479158ee4f07b65da57fdc9c29aaf26100d1d2ae122
3
  size 1298159621
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc0454bac094eb9bcfc5225a1906cb99c102336172ef9e7bafc8054a4c830871
3
  size 1298159621
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:91bf355a8399a307cd076db691b31d1a372fc086273e419228901f1c397f683a
3
  size 649093613
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ff85dba579bca3aa2ae6faff5086f9422af9dfca51f08c249c6ddf177bc7771
3
  size 649093613
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f113467ba153437377d3cca21247a1bf13d5ec4199ee15a4ce0dbae7ce2c1608
3
- size 17641
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e69904db53ae4ac2e537fe1ecca98d908ffe407b7bf84009f7869243dbb2d8b
3
+ size 14575
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:37a0fbb9478949421cddae7297acf773062e367b83de55aa1d3cf8ac35dd5ed9
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f7f223e541d8ca8282c09e6f4666d154f61e406961790c043b389c95e9eedcc
3
  size 627
trainer_state.json CHANGED
@@ -2,75 +2,297 @@
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 3.0,
5
- "global_step": 795,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
- "epoch": 0.38,
12
- "eval_loss": 0.2512435019016266,
13
- "eval_runtime": 8.34,
14
- "eval_samples_per_second": 133.813,
15
- "eval_steps_per_second": 4.197,
16
  "step": 100
17
  },
18
  {
19
- "epoch": 0.75,
20
- "eval_loss": 0.2268657237291336,
21
- "eval_runtime": 8.3466,
22
- "eval_samples_per_second": 133.707,
23
- "eval_steps_per_second": 4.193,
24
  "step": 200
25
  },
26
  {
27
- "epoch": 1.13,
28
- "eval_loss": 0.23051245510578156,
29
- "eval_runtime": 8.3417,
30
- "eval_samples_per_second": 133.786,
31
- "eval_steps_per_second": 4.196,
32
  "step": 300
33
  },
34
  {
35
- "epoch": 1.51,
36
- "eval_loss": 0.2245202660560608,
37
- "eval_runtime": 8.337,
38
- "eval_samples_per_second": 133.862,
39
- "eval_steps_per_second": 4.198,
40
  "step": 400
41
  },
42
  {
43
- "epoch": 1.89,
44
- "learning_rate": 1.8553459119496856e-05,
45
- "loss": 0.2119,
46
  "step": 500
47
  },
48
  {
49
- "epoch": 1.89,
50
- "eval_loss": 0.22478941082954407,
51
- "eval_runtime": 8.3391,
52
- "eval_samples_per_second": 133.828,
53
- "eval_steps_per_second": 4.197,
54
  "step": 500
55
  },
56
  {
57
- "epoch": 2.26,
58
- "eval_loss": 0.2384825348854065,
59
- "eval_runtime": 8.3396,
60
- "eval_samples_per_second": 133.819,
61
- "eval_steps_per_second": 4.197,
62
  "step": 600
63
  },
64
  {
65
- "epoch": 2.64,
66
- "eval_loss": 0.23830106854438782,
67
- "eval_runtime": 8.3367,
68
- "eval_samples_per_second": 133.866,
69
- "eval_steps_per_second": 4.198,
70
  "step": 700
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  }
72
  ],
73
- "max_steps": 795,
74
  "num_train_epochs": 3,
75
  "total_flos": 6635087188033536.0,
76
  "trial_name": null,
 
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 3.0,
5
+ "global_step": 3174,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
+ "epoch": 0.09,
12
+ "eval_loss": 0.291690856218338,
13
+ "eval_runtime": 11.8699,
14
+ "eval_samples_per_second": 94.019,
15
+ "eval_steps_per_second": 11.795,
16
  "step": 100
17
  },
18
  {
19
+ "epoch": 0.19,
20
+ "eval_loss": 0.2913290560245514,
21
+ "eval_runtime": 11.9204,
22
+ "eval_samples_per_second": 93.621,
23
+ "eval_steps_per_second": 11.745,
24
  "step": 200
25
  },
26
  {
27
+ "epoch": 0.28,
28
+ "eval_loss": 0.27973252534866333,
29
+ "eval_runtime": 11.9409,
30
+ "eval_samples_per_second": 93.46,
31
+ "eval_steps_per_second": 11.724,
32
  "step": 300
33
  },
34
  {
35
+ "epoch": 0.38,
36
+ "eval_loss": 0.25020790100097656,
37
+ "eval_runtime": 11.9448,
38
+ "eval_samples_per_second": 93.43,
39
+ "eval_steps_per_second": 11.721,
40
  "step": 400
41
  },
42
  {
43
+ "epoch": 0.47,
44
+ "learning_rate": 4.2123503465658476e-05,
45
+ "loss": 0.3052,
46
  "step": 500
47
  },
48
  {
49
+ "epoch": 0.47,
50
+ "eval_loss": 0.2535267770290375,
51
+ "eval_runtime": 11.9395,
52
+ "eval_samples_per_second": 93.471,
53
+ "eval_steps_per_second": 11.726,
54
  "step": 500
55
  },
56
  {
57
+ "epoch": 0.57,
58
+ "eval_loss": 0.29139193892478943,
59
+ "eval_runtime": 11.942,
60
+ "eval_samples_per_second": 93.452,
61
+ "eval_steps_per_second": 11.723,
62
  "step": 600
63
  },
64
  {
65
+ "epoch": 0.66,
66
+ "eval_loss": 0.2721957862377167,
67
+ "eval_runtime": 11.936,
68
+ "eval_samples_per_second": 93.498,
69
+ "eval_steps_per_second": 11.729,
70
  "step": 700
71
+ },
72
+ {
73
+ "epoch": 0.76,
74
+ "eval_loss": 0.24325571954250336,
75
+ "eval_runtime": 11.9084,
76
+ "eval_samples_per_second": 93.716,
77
+ "eval_steps_per_second": 11.756,
78
+ "step": 800
79
+ },
80
+ {
81
+ "epoch": 0.85,
82
+ "eval_loss": 0.2575836777687073,
83
+ "eval_runtime": 11.9447,
84
+ "eval_samples_per_second": 93.43,
85
+ "eval_steps_per_second": 11.721,
86
+ "step": 900
87
+ },
88
+ {
89
+ "epoch": 0.95,
90
+ "learning_rate": 3.424700693131695e-05,
91
+ "loss": 0.2236,
92
+ "step": 1000
93
+ },
94
+ {
95
+ "epoch": 0.95,
96
+ "eval_loss": 0.2607925832271576,
97
+ "eval_runtime": 11.9435,
98
+ "eval_samples_per_second": 93.44,
99
+ "eval_steps_per_second": 11.722,
100
+ "step": 1000
101
+ },
102
+ {
103
+ "epoch": 1.04,
104
+ "eval_loss": 0.2817261815071106,
105
+ "eval_runtime": 11.9296,
106
+ "eval_samples_per_second": 93.549,
107
+ "eval_steps_per_second": 11.735,
108
+ "step": 1100
109
+ },
110
+ {
111
+ "epoch": 1.13,
112
+ "eval_loss": 0.24339333176612854,
113
+ "eval_runtime": 11.9423,
114
+ "eval_samples_per_second": 93.449,
115
+ "eval_steps_per_second": 11.723,
116
+ "step": 1200
117
+ },
118
+ {
119
+ "epoch": 1.23,
120
+ "eval_loss": 0.24544629454612732,
121
+ "eval_runtime": 11.9445,
122
+ "eval_samples_per_second": 93.432,
123
+ "eval_steps_per_second": 11.721,
124
+ "step": 1300
125
+ },
126
+ {
127
+ "epoch": 1.32,
128
+ "eval_loss": 0.23857346177101135,
129
+ "eval_runtime": 11.9479,
130
+ "eval_samples_per_second": 93.406,
131
+ "eval_steps_per_second": 11.718,
132
+ "step": 1400
133
+ },
134
+ {
135
+ "epoch": 1.42,
136
+ "learning_rate": 2.637051039697543e-05,
137
+ "loss": 0.1449,
138
+ "step": 1500
139
+ },
140
+ {
141
+ "epoch": 1.42,
142
+ "eval_loss": 0.2612239718437195,
143
+ "eval_runtime": 11.947,
144
+ "eval_samples_per_second": 93.413,
145
+ "eval_steps_per_second": 11.718,
146
+ "step": 1500
147
+ },
148
+ {
149
+ "epoch": 1.51,
150
+ "eval_loss": 0.264009565114975,
151
+ "eval_runtime": 11.9446,
152
+ "eval_samples_per_second": 93.431,
153
+ "eval_steps_per_second": 11.721,
154
+ "step": 1600
155
+ },
156
+ {
157
+ "epoch": 1.61,
158
+ "eval_loss": 0.23957186937332153,
159
+ "eval_runtime": 11.9459,
160
+ "eval_samples_per_second": 93.421,
161
+ "eval_steps_per_second": 11.72,
162
+ "step": 1700
163
+ },
164
+ {
165
+ "epoch": 1.7,
166
+ "eval_loss": 0.23903459310531616,
167
+ "eval_runtime": 11.9432,
168
+ "eval_samples_per_second": 93.443,
169
+ "eval_steps_per_second": 11.722,
170
+ "step": 1800
171
+ },
172
+ {
173
+ "epoch": 1.8,
174
+ "eval_loss": 0.22416594624519348,
175
+ "eval_runtime": 11.944,
176
+ "eval_samples_per_second": 93.436,
177
+ "eval_steps_per_second": 11.721,
178
+ "step": 1900
179
+ },
180
+ {
181
+ "epoch": 1.89,
182
+ "learning_rate": 1.8494013862633903e-05,
183
+ "loss": 0.1408,
184
+ "step": 2000
185
+ },
186
+ {
187
+ "epoch": 1.89,
188
+ "eval_loss": 0.2341969758272171,
189
+ "eval_runtime": 11.9399,
190
+ "eval_samples_per_second": 93.468,
191
+ "eval_steps_per_second": 11.725,
192
+ "step": 2000
193
+ },
194
+ {
195
+ "epoch": 1.98,
196
+ "eval_loss": 0.23950397968292236,
197
+ "eval_runtime": 11.9383,
198
+ "eval_samples_per_second": 93.481,
199
+ "eval_steps_per_second": 11.727,
200
+ "step": 2100
201
+ },
202
+ {
203
+ "epoch": 2.08,
204
+ "eval_loss": 0.285567045211792,
205
+ "eval_runtime": 11.9279,
206
+ "eval_samples_per_second": 93.562,
207
+ "eval_steps_per_second": 11.737,
208
+ "step": 2200
209
+ },
210
+ {
211
+ "epoch": 2.17,
212
+ "eval_loss": 0.2656622529029846,
213
+ "eval_runtime": 11.9385,
214
+ "eval_samples_per_second": 93.479,
215
+ "eval_steps_per_second": 11.727,
216
+ "step": 2300
217
+ },
218
+ {
219
+ "epoch": 2.27,
220
+ "eval_loss": 0.25764182209968567,
221
+ "eval_runtime": 11.9434,
222
+ "eval_samples_per_second": 93.44,
223
+ "eval_steps_per_second": 11.722,
224
+ "step": 2400
225
+ },
226
+ {
227
+ "epoch": 2.36,
228
+ "learning_rate": 1.0617517328292375e-05,
229
+ "loss": 0.0893,
230
+ "step": 2500
231
+ },
232
+ {
233
+ "epoch": 2.36,
234
+ "eval_loss": 0.26884153485298157,
235
+ "eval_runtime": 11.9443,
236
+ "eval_samples_per_second": 93.434,
237
+ "eval_steps_per_second": 11.721,
238
+ "step": 2500
239
+ },
240
+ {
241
+ "epoch": 2.46,
242
+ "eval_loss": 0.2657739520072937,
243
+ "eval_runtime": 11.9455,
244
+ "eval_samples_per_second": 93.425,
245
+ "eval_steps_per_second": 11.72,
246
+ "step": 2600
247
+ },
248
+ {
249
+ "epoch": 2.55,
250
+ "eval_loss": 0.2614665925502777,
251
+ "eval_runtime": 11.9422,
252
+ "eval_samples_per_second": 93.45,
253
+ "eval_steps_per_second": 11.723,
254
+ "step": 2700
255
+ },
256
+ {
257
+ "epoch": 2.65,
258
+ "eval_loss": 0.27043506503105164,
259
+ "eval_runtime": 11.9429,
260
+ "eval_samples_per_second": 93.445,
261
+ "eval_steps_per_second": 11.722,
262
+ "step": 2800
263
+ },
264
+ {
265
+ "epoch": 2.74,
266
+ "eval_loss": 0.2741823196411133,
267
+ "eval_runtime": 11.9422,
268
+ "eval_samples_per_second": 93.45,
269
+ "eval_steps_per_second": 11.723,
270
+ "step": 2900
271
+ },
272
+ {
273
+ "epoch": 2.84,
274
+ "learning_rate": 2.741020793950851e-06,
275
+ "loss": 0.0668,
276
+ "step": 3000
277
+ },
278
+ {
279
+ "epoch": 2.84,
280
+ "eval_loss": 0.2574303448200226,
281
+ "eval_runtime": 11.9428,
282
+ "eval_samples_per_second": 93.445,
283
+ "eval_steps_per_second": 11.723,
284
+ "step": 3000
285
+ },
286
+ {
287
+ "epoch": 2.93,
288
+ "eval_loss": 0.2549898028373718,
289
+ "eval_runtime": 11.9436,
290
+ "eval_samples_per_second": 93.439,
291
+ "eval_steps_per_second": 11.722,
292
+ "step": 3100
293
  }
294
  ],
295
+ "max_steps": 3174,
296
  "num_train_epochs": 3,
297
  "total_flos": 6635087188033536.0,
298
  "trial_name": null,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:016e178b54b4499d7401b710ae361f0d803cad1f54a717ede8cfa3bf9199ee13
3
  size 3515
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bb3243e713170949466998e8a474a592be4c2bf750567ed9f5e8fd61388dc92
3
  size 3515