mgh6 commited on
Commit
3b5a099
1 Parent(s): d7c8a06

Training in progress, step 2560, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3dbedb6813ab23efa8584bec7923fe721f4b44150baafc9f003c1e9ccbc8959f
3
  size 4725595416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a79993ca3bb2e40d715e49b6365049f27102e49dd8b3a9ce020c6ea5a9f9fe9
3
  size 4725595416
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dd523d00ffb4a32186d8de998c8ceb1d14a4584b486840374344962c5a4e3c1b
3
  size 9179193343
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bf2691edb5f20acb6de9eb1f6120c2449bed48ca00eecc968c5be167084b7bb
3
  size 9179193343
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:576c6ec28ef06a8796c16ea2ad20bd582e9c62a6072012fb887d3d0c157f117d
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c349c1691bbeda5a6b16abd459bd4b17c698c1ae8b87b93b48229ee14acd38e
3
  size 14503
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:69cbe7499b854b9136e048ff63938c0702f50fa9e8e898bdab2b2964aefa4363
3
  size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28472ecbb49d175fddb5467d2d36c375ce76e352a7c4d1642d73ecb32735946a
3
  size 623
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 12849.525390625,
3
- "best_model_checkpoint": "mgh6/TCS_Pairing_VAE/checkpoint-12800",
4
- "epoch": 0.9457225449541454,
5
  "eval_steps": 512,
6
- "global_step": 12800,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -11,502 +11,102 @@
11
  {
12
  "epoch": 0.02,
13
  "learning_rate": 9.962169351263485e-05,
14
- "loss": 113604.8047,
15
  "step": 256
16
  },
17
  {
18
  "epoch": 0.04,
19
  "learning_rate": 9.92433870252697e-05,
20
- "loss": 117025.2031,
21
  "step": 512
22
  },
23
  {
24
  "epoch": 0.04,
25
- "eval_loss": 94748.9375,
26
- "eval_runtime": 59.5739,
27
- "eval_samples_per_second": 57.038,
28
- "eval_steps_per_second": 57.038,
29
  "step": 512
30
  },
31
  {
32
  "epoch": 0.06,
33
  "learning_rate": 9.886508053790455e-05,
34
- "loss": 107788.5469,
35
  "step": 768
36
  },
37
  {
38
  "epoch": 0.08,
39
  "learning_rate": 9.848677405053938e-05,
40
- "loss": 95214.5781,
41
  "step": 1024
42
  },
43
  {
44
  "epoch": 0.08,
45
- "eval_loss": 72482.0,
46
- "eval_runtime": 59.6084,
47
- "eval_samples_per_second": 57.005,
48
- "eval_steps_per_second": 57.005,
49
  "step": 1024
50
  },
51
  {
52
  "epoch": 0.09,
53
  "learning_rate": 9.810846756317423e-05,
54
- "loss": 76292.5156,
55
  "step": 1280
56
  },
57
  {
58
  "epoch": 0.11,
59
  "learning_rate": 9.773016107580908e-05,
60
- "loss": 61333.8164,
61
  "step": 1536
62
  },
63
  {
64
  "epoch": 0.11,
65
- "eval_loss": 46481.89453125,
66
- "eval_runtime": 59.7574,
67
- "eval_samples_per_second": 56.863,
68
- "eval_steps_per_second": 56.863,
69
  "step": 1536
70
  },
71
  {
72
  "epoch": 0.13,
73
  "learning_rate": 9.735185458844393e-05,
74
- "loss": 47487.7461,
75
  "step": 1792
76
  },
77
  {
78
  "epoch": 0.15,
79
  "learning_rate": 9.697354810107877e-05,
80
- "loss": 35592.2891,
81
  "step": 2048
82
  },
83
  {
84
  "epoch": 0.15,
85
- "eval_loss": 26872.33203125,
86
- "eval_runtime": 59.5627,
87
- "eval_samples_per_second": 57.049,
88
- "eval_steps_per_second": 57.049,
89
  "step": 2048
90
  },
91
  {
92
  "epoch": 0.17,
93
  "learning_rate": 9.659524161371362e-05,
94
- "loss": 27868.3906,
95
  "step": 2304
96
  },
97
  {
98
  "epoch": 0.19,
99
  "learning_rate": 9.621693512634847e-05,
100
- "loss": 22961.3906,
101
  "step": 2560
102
  },
103
  {
104
  "epoch": 0.19,
105
- "eval_loss": 18575.595703125,
106
- "eval_runtime": 86.8108,
107
- "eval_samples_per_second": 39.143,
108
- "eval_steps_per_second": 39.143,
109
  "step": 2560
110
- },
111
- {
112
- "epoch": 0.21,
113
- "learning_rate": 9.58386286389833e-05,
114
- "loss": 20474.4043,
115
- "step": 2816
116
- },
117
- {
118
- "epoch": 0.23,
119
- "learning_rate": 9.546032215161815e-05,
120
- "loss": 18948.6875,
121
- "step": 3072
122
- },
123
- {
124
- "epoch": 0.23,
125
- "eval_loss": 16196.150390625,
126
- "eval_runtime": 62.7371,
127
- "eval_samples_per_second": 54.163,
128
- "eval_steps_per_second": 54.163,
129
- "step": 3072
130
- },
131
- {
132
- "epoch": 0.25,
133
- "learning_rate": 9.5082015664253e-05,
134
- "loss": 17958.9785,
135
- "step": 3328
136
- },
137
- {
138
- "epoch": 0.26,
139
- "learning_rate": 9.470370917688785e-05,
140
- "loss": 17018.3223,
141
- "step": 3584
142
- },
143
- {
144
- "epoch": 0.26,
145
- "eval_loss": 15006.568359375,
146
- "eval_runtime": 59.6957,
147
- "eval_samples_per_second": 56.922,
148
- "eval_steps_per_second": 56.922,
149
- "step": 3584
150
- },
151
- {
152
- "epoch": 0.28,
153
- "learning_rate": 9.43254026895227e-05,
154
- "loss": 16859.2793,
155
- "step": 3840
156
- },
157
- {
158
- "epoch": 0.3,
159
- "learning_rate": 9.394709620215754e-05,
160
- "loss": 16723.5449,
161
- "step": 4096
162
- },
163
- {
164
- "epoch": 0.3,
165
- "eval_loss": 14789.91796875,
166
- "eval_runtime": 59.5641,
167
- "eval_samples_per_second": 57.048,
168
- "eval_steps_per_second": 57.048,
169
- "step": 4096
170
- },
171
- {
172
- "epoch": 0.32,
173
- "learning_rate": 9.356878971479238e-05,
174
- "loss": 16488.6934,
175
- "step": 4352
176
- },
177
- {
178
- "epoch": 0.34,
179
- "learning_rate": 9.319048322742722e-05,
180
- "loss": 16159.4502,
181
- "step": 4608
182
- },
183
- {
184
- "epoch": 0.34,
185
- "eval_loss": 14223.150390625,
186
- "eval_runtime": 60.4382,
187
- "eval_samples_per_second": 56.223,
188
- "eval_steps_per_second": 56.223,
189
- "step": 4608
190
- },
191
- {
192
- "epoch": 0.36,
193
- "learning_rate": 9.281217674006207e-05,
194
- "loss": 16155.2217,
195
- "step": 4864
196
- },
197
- {
198
- "epoch": 0.38,
199
- "learning_rate": 9.243387025269692e-05,
200
- "loss": 16132.4766,
201
- "step": 5120
202
- },
203
- {
204
- "epoch": 0.38,
205
- "eval_loss": 14493.603515625,
206
- "eval_runtime": 61.3546,
207
- "eval_samples_per_second": 55.383,
208
- "eval_steps_per_second": 55.383,
209
- "step": 5120
210
- },
211
- {
212
- "epoch": 0.4,
213
- "learning_rate": 9.205556376533177e-05,
214
- "loss": 16020.0381,
215
- "step": 5376
216
- },
217
- {
218
- "epoch": 0.42,
219
- "learning_rate": 9.167725727796661e-05,
220
- "loss": 15904.9912,
221
- "step": 5632
222
- },
223
- {
224
- "epoch": 0.42,
225
- "eval_loss": 14683.6728515625,
226
- "eval_runtime": 62.3009,
227
- "eval_samples_per_second": 54.542,
228
- "eval_steps_per_second": 54.542,
229
- "step": 5632
230
- },
231
- {
232
- "epoch": 0.44,
233
- "learning_rate": 9.129895079060146e-05,
234
- "loss": 15820.0801,
235
- "step": 5888
236
- },
237
- {
238
- "epoch": 0.45,
239
- "learning_rate": 9.09206443032363e-05,
240
- "loss": 15531.0293,
241
- "step": 6144
242
- },
243
- {
244
- "epoch": 0.45,
245
- "eval_loss": 14102.1162109375,
246
- "eval_runtime": 59.6921,
247
- "eval_samples_per_second": 56.925,
248
- "eval_steps_per_second": 56.925,
249
- "step": 6144
250
- },
251
- {
252
- "epoch": 0.47,
253
- "learning_rate": 9.054233781587114e-05,
254
- "loss": 15656.3779,
255
- "step": 6400
256
- },
257
- {
258
- "epoch": 0.49,
259
- "learning_rate": 9.016403132850599e-05,
260
- "loss": 15575.4033,
261
- "step": 6656
262
- },
263
- {
264
- "epoch": 0.49,
265
- "eval_loss": 13737.095703125,
266
- "eval_runtime": 49.6762,
267
- "eval_samples_per_second": 68.403,
268
- "eval_steps_per_second": 68.403,
269
- "step": 6656
270
- },
271
- {
272
- "epoch": 0.51,
273
- "learning_rate": 8.978572484114084e-05,
274
- "loss": 15500.876,
275
- "step": 6912
276
- },
277
- {
278
- "epoch": 0.53,
279
- "learning_rate": 8.940741835377569e-05,
280
- "loss": 15452.5596,
281
- "step": 7168
282
- },
283
- {
284
- "epoch": 0.53,
285
- "eval_loss": 14052.9873046875,
286
- "eval_runtime": 49.0398,
287
- "eval_samples_per_second": 69.291,
288
- "eval_steps_per_second": 69.291,
289
- "step": 7168
290
- },
291
- {
292
- "epoch": 0.55,
293
- "learning_rate": 8.902911186641053e-05,
294
- "loss": 15443.3691,
295
- "step": 7424
296
- },
297
- {
298
- "epoch": 0.57,
299
- "learning_rate": 8.865080537904538e-05,
300
- "loss": 15420.5,
301
- "step": 7680
302
- },
303
- {
304
- "epoch": 0.57,
305
- "eval_loss": 13470.451171875,
306
- "eval_runtime": 48.9981,
307
- "eval_samples_per_second": 69.35,
308
- "eval_steps_per_second": 69.35,
309
- "step": 7680
310
- },
311
- {
312
- "epoch": 0.59,
313
- "learning_rate": 8.827249889168022e-05,
314
- "loss": 15402.9678,
315
- "step": 7936
316
- },
317
- {
318
- "epoch": 0.61,
319
- "learning_rate": 8.789419240431506e-05,
320
- "loss": 15077.2871,
321
- "step": 8192
322
- },
323
- {
324
- "epoch": 0.61,
325
- "eval_loss": 13587.75,
326
- "eval_runtime": 50.6149,
327
- "eval_samples_per_second": 67.134,
328
- "eval_steps_per_second": 67.134,
329
- "step": 8192
330
- },
331
- {
332
- "epoch": 0.62,
333
- "learning_rate": 8.751588591694991e-05,
334
- "loss": 14985.3994,
335
- "step": 8448
336
- },
337
- {
338
- "epoch": 0.64,
339
- "learning_rate": 8.713757942958476e-05,
340
- "loss": 15089.6094,
341
- "step": 8704
342
- },
343
- {
344
- "epoch": 0.64,
345
- "eval_loss": 13345.4736328125,
346
- "eval_runtime": 49.1305,
347
- "eval_samples_per_second": 69.163,
348
- "eval_steps_per_second": 69.163,
349
- "step": 8704
350
- },
351
- {
352
- "epoch": 0.66,
353
- "learning_rate": 8.67592729422196e-05,
354
- "loss": 15185.1504,
355
- "step": 8960
356
- },
357
- {
358
- "epoch": 0.68,
359
- "learning_rate": 8.638096645485444e-05,
360
- "loss": 15208.8115,
361
- "step": 9216
362
- },
363
- {
364
- "epoch": 0.68,
365
- "eval_loss": 13608.46875,
366
- "eval_runtime": 49.1786,
367
- "eval_samples_per_second": 69.095,
368
- "eval_steps_per_second": 69.095,
369
- "step": 9216
370
- },
371
- {
372
- "epoch": 0.7,
373
- "learning_rate": 8.600265996748929e-05,
374
- "loss": 15028.958,
375
- "step": 9472
376
- },
377
- {
378
- "epoch": 0.72,
379
- "learning_rate": 8.562435348012414e-05,
380
- "loss": 14966.6143,
381
- "step": 9728
382
- },
383
- {
384
- "epoch": 0.72,
385
- "eval_loss": 12910.7265625,
386
- "eval_runtime": 57.0213,
387
- "eval_samples_per_second": 59.592,
388
- "eval_steps_per_second": 59.592,
389
- "step": 9728
390
- },
391
- {
392
- "epoch": 0.74,
393
- "learning_rate": 8.524604699275897e-05,
394
- "loss": 14777.6895,
395
- "step": 9984
396
- },
397
- {
398
- "epoch": 0.76,
399
- "learning_rate": 8.486774050539382e-05,
400
- "loss": 14924.749,
401
- "step": 10240
402
- },
403
- {
404
- "epoch": 0.76,
405
- "eval_loss": 13132.9765625,
406
- "eval_runtime": 49.0092,
407
- "eval_samples_per_second": 69.334,
408
- "eval_steps_per_second": 69.334,
409
- "step": 10240
410
- },
411
- {
412
- "epoch": 0.78,
413
- "learning_rate": 8.448943401802867e-05,
414
- "loss": 14694.0498,
415
- "step": 10496
416
- },
417
- {
418
- "epoch": 0.79,
419
- "learning_rate": 8.411112753066351e-05,
420
- "loss": 14639.1904,
421
- "step": 10752
422
- },
423
- {
424
- "epoch": 0.79,
425
- "eval_loss": 13389.201171875,
426
- "eval_runtime": 53.5263,
427
- "eval_samples_per_second": 63.483,
428
- "eval_steps_per_second": 63.483,
429
- "step": 10752
430
- },
431
- {
432
- "epoch": 0.81,
433
- "learning_rate": 8.373282104329836e-05,
434
- "loss": 14814.9424,
435
- "step": 11008
436
- },
437
- {
438
- "epoch": 0.83,
439
- "learning_rate": 8.335451455593321e-05,
440
- "loss": 14682.3984,
441
- "step": 11264
442
- },
443
- {
444
- "epoch": 0.83,
445
- "eval_loss": 13232.8876953125,
446
- "eval_runtime": 48.9422,
447
- "eval_samples_per_second": 69.429,
448
- "eval_steps_per_second": 69.429,
449
- "step": 11264
450
- },
451
- {
452
- "epoch": 0.85,
453
- "learning_rate": 8.297620806856804e-05,
454
- "loss": 14631.1221,
455
- "step": 11520
456
- },
457
- {
458
- "epoch": 0.87,
459
- "learning_rate": 8.259790158120289e-05,
460
- "loss": 14661.7021,
461
- "step": 11776
462
- },
463
- {
464
- "epoch": 0.87,
465
- "eval_loss": 13184.365234375,
466
- "eval_runtime": 49.4282,
467
- "eval_samples_per_second": 68.746,
468
- "eval_steps_per_second": 68.746,
469
- "step": 11776
470
- },
471
- {
472
- "epoch": 0.89,
473
- "learning_rate": 8.221959509383774e-05,
474
- "loss": 14565.9648,
475
- "step": 12032
476
- },
477
- {
478
- "epoch": 0.91,
479
- "learning_rate": 8.184128860647259e-05,
480
- "loss": 14608.0898,
481
- "step": 12288
482
- },
483
- {
484
- "epoch": 0.91,
485
- "eval_loss": 13070.6923828125,
486
- "eval_runtime": 49.197,
487
- "eval_samples_per_second": 69.069,
488
- "eval_steps_per_second": 69.069,
489
- "step": 12288
490
- },
491
- {
492
- "epoch": 0.93,
493
- "learning_rate": 8.146298211910744e-05,
494
- "loss": 14447.6699,
495
- "step": 12544
496
- },
497
- {
498
- "epoch": 0.95,
499
- "learning_rate": 8.108467563174228e-05,
500
- "loss": 14423.6045,
501
- "step": 12800
502
- },
503
- {
504
- "epoch": 0.95,
505
- "eval_loss": 12849.525390625,
506
- "eval_runtime": 48.9783,
507
- "eval_samples_per_second": 69.378,
508
- "eval_steps_per_second": 69.378,
509
- "step": 12800
510
  }
511
  ],
512
  "logging_steps": 256,
 
1
  {
2
+ "best_metric": 19020.044921875,
3
+ "best_model_checkpoint": "mgh6/TCS_Pairing_VAE/checkpoint-2560",
4
+ "epoch": 0.18914450899082907,
5
  "eval_steps": 512,
6
+ "global_step": 2560,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
11
  {
12
  "epoch": 0.02,
13
  "learning_rate": 9.962169351263485e-05,
14
+ "loss": 122076.6875,
15
  "step": 256
16
  },
17
  {
18
  "epoch": 0.04,
19
  "learning_rate": 9.92433870252697e-05,
20
+ "loss": 75620.8047,
21
  "step": 512
22
  },
23
  {
24
  "epoch": 0.04,
25
+ "eval_loss": 67218.640625,
26
+ "eval_runtime": 49.08,
27
+ "eval_samples_per_second": 69.234,
28
+ "eval_steps_per_second": 69.234,
29
  "step": 512
30
  },
31
  {
32
  "epoch": 0.06,
33
  "learning_rate": 9.886508053790455e-05,
34
+ "loss": 71143.4766,
35
  "step": 768
36
  },
37
  {
38
  "epoch": 0.08,
39
  "learning_rate": 9.848677405053938e-05,
40
+ "loss": 65495.2617,
41
  "step": 1024
42
  },
43
  {
44
  "epoch": 0.08,
45
+ "eval_loss": 55127.15234375,
46
+ "eval_runtime": 57.5577,
47
+ "eval_samples_per_second": 59.036,
48
+ "eval_steps_per_second": 59.036,
49
  "step": 1024
50
  },
51
  {
52
  "epoch": 0.09,
53
  "learning_rate": 9.810846756317423e-05,
54
+ "loss": 57857.6445,
55
  "step": 1280
56
  },
57
  {
58
  "epoch": 0.11,
59
  "learning_rate": 9.773016107580908e-05,
60
+ "loss": 49931.2188,
61
  "step": 1536
62
  },
63
  {
64
  "epoch": 0.11,
65
+ "eval_loss": 44795.0859375,
66
+ "eval_runtime": 62.2532,
67
+ "eval_samples_per_second": 54.584,
68
+ "eval_steps_per_second": 54.584,
69
  "step": 1536
70
  },
71
  {
72
  "epoch": 0.13,
73
  "learning_rate": 9.735185458844393e-05,
74
+ "loss": 42072.0,
75
  "step": 1792
76
  },
77
  {
78
  "epoch": 0.15,
79
  "learning_rate": 9.697354810107877e-05,
80
+ "loss": 35028.5938,
81
  "step": 2048
82
  },
83
  {
84
  "epoch": 0.15,
85
+ "eval_loss": 29700.298828125,
86
+ "eval_runtime": 72.4329,
87
+ "eval_samples_per_second": 46.912,
88
+ "eval_steps_per_second": 46.912,
89
  "step": 2048
90
  },
91
  {
92
  "epoch": 0.17,
93
  "learning_rate": 9.659524161371362e-05,
94
+ "loss": 27458.9082,
95
  "step": 2304
96
  },
97
  {
98
  "epoch": 0.19,
99
  "learning_rate": 9.621693512634847e-05,
100
+ "loss": 21147.1016,
101
  "step": 2560
102
  },
103
  {
104
  "epoch": 0.19,
105
+ "eval_loss": 19020.044921875,
106
+ "eval_runtime": 49.0881,
107
+ "eval_samples_per_second": 69.222,
108
+ "eval_steps_per_second": 69.222,
109
  "step": 2560
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  }
111
  ],
112
  "logging_steps": 256,
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:47b5973a9577dbe4f48cf65528543a6ba412189a705c054d41e38b4fabea494a
3
  size 4271
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb40828a1d4ffeac31865af000ab0d03e5851c44943c5a9b31dbbf1ac5027a97
3
  size 4271