Bingsu committed
Commit 35deea0
1 Parent(s): d9db88e

Training in progress, step 20000

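The files changed below (optimizer.pt, scheduler.pt, rng_state.pth, trainer_state.json and pytorch_model.bin under last-checkpoint/, plus the root pytorch_model.bin) are what the transformers Trainer writes when it checkpoints mid-run and pushes to the Hub with hub_strategy="checkpoint". The training script itself is not part of this commit; the sketch below is only a guess at the kind of setup that produces these "Training in progress, step N" commits, with hyperparameters copied from trainer_state.json (max_steps 500000, num_train_epochs 3, logging every 200 steps, saving every 10000) and placeholder model, dataset and repo names.

```python
# Hypothetical sketch only: the real training script is not in this repository.
# `model`, `train_dataset` and hub_model_id are placeholders/assumptions.
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="output",
    max_steps=500_000,             # matches "max_steps" in trainer_state.json
    num_train_epochs=3,            # matches "num_train_epochs"
    logging_steps=200,             # log_history gains an entry every 200 steps
    save_steps=10_000,             # a checkpoint (and a commit like this) every 10k steps
    push_to_hub=True,
    hub_strategy="checkpoint",     # pushes the latest checkpoint as last-checkpoint/
    hub_model_id="Bingsu/<repo>",  # placeholder: actual repo id not shown on this page
)

trainer = Trainer(model=model, args=args, train_dataset=train_dataset)
trainer.train()
```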
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:5622ffe46bf38388fa00b1a2200850f93d8dbe09275c8fca23027e8c8ecc914d
+ oid sha256:efef12e6736ac05b05123978b5a7ba02086375a879e1e08c05db35ff70c647a0
  size 100170757
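Each of these binaries is tracked with Git LFS, so the diff only touches the three-line pointer file: the spec version, the blob's sha256 ("oid"), and its size in bytes. The oid changes while the size stays identical, as expected when a fixed-shape checkpoint is overwritten with new values. As a rough illustration (the repo id below is a placeholder, not taken from this page), a downloaded file can be checked against the recorded oid like so:

```python
# Sketch: hash a downloaded LFS object and compare it with the pointer's oid.
# repo_id is a placeholder; only the filename and revision come from this page.
import hashlib

from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="Bingsu/<repo>",                  # placeholder
    filename="last-checkpoint/optimizer.pt",
    revision="35deea0",                       # this commit
)

digest = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
        digest.update(chunk)

print(digest.hexdigest())  # should match the new "+ oid sha256:..." value above
```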
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:64b56da0bd1eae8d31f23f66326261f592bc15670d3bf8cb6c7469bc8473bee5
+ oid sha256:669ff7fd28968817843d8d3e735a9f1604e6f86bd0620d14ba500c796ee6cb84
  size 146774203
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3dd39757fc934e875b29f85fec3b03c133505f0c929dbd7a3fa7ae13e24256c2
+ oid sha256:17ed50c4af1d37fb0d41b85169a6f1f89705f404faa32f3817c74e84cd5180c1
  size 14439
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1f9c578680b62451c8dae8ced51654d36c9069db64cb38be21beebde9b574592
+ oid sha256:60920ec13686e98f9f0d129e472adaac1417d4bc756e4485725a87068a11e2f1
  size 246897640
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
  {
    "best_metric": null,
    "best_model_checkpoint": null,
-   "epoch": 0.042973785990545764,
-   "global_step": 10000,
+   "epoch": 0.08594757198109153,
+   "global_step": 20000,
    "is_hyper_param_search": false,
    "is_local_process_zero": true,
    "is_world_process_zero": true,
@@ -306,11 +306,311 @@
        "learning_rate": 0.0005048929099291249,
        "loss": 5.0106,
        "step": 10000
+     },
+     {
+       "epoch": 0.04,
+       "learning_rate": 0.0005091144561900837,
+       "loss": 5.0155,
+       "step": 10200
+     },
+     {
+       "epoch": 0.04,
+       "learning_rate": 0.0005134183095340927,
+       "loss": 4.9817,
+       "step": 10400
+     },
+     {
+       "epoch": 0.05,
+       "learning_rate": 0.0005178043944449977,
+       "loss": 4.9742,
+       "step": 10600
+     },
+     {
+       "epoch": 0.05,
+       "learning_rate": 0.0005222726339638023,
+       "loss": 4.9299,
+       "step": 10800
+     },
+     {
+       "epoch": 0.05,
+       "learning_rate": 0.0005268229496900086,
+       "loss": 4.9208,
+       "step": 11000
+     },
+     {
+       "epoch": 0.05,
+       "learning_rate": 0.0005314552617829947,
+       "loss": 4.8617,
+       "step": 11200
+     },
+     {
+       "epoch": 0.05,
+       "learning_rate": 0.0005361694889634196,
+       "loss": 4.7952,
+       "step": 11400
+     },
+     {
+       "epoch": 0.05,
+       "learning_rate": 0.0005409655485146408,
+       "loss": 4.7641,
+       "step": 11600
+     },
+     {
+       "epoch": 0.05,
+       "learning_rate": 0.0005458433562841782,
+       "loss": 4.7361,
+       "step": 11800
+     },
+     {
+       "epoch": 0.05,
+       "learning_rate": 0.0005508028266851747,
+       "loss": 4.7023,
+       "step": 12000
+     },
+     {
+       "epoch": 0.05,
+       "learning_rate": 0.000555843872697916,
+       "loss": 4.6561,
+       "step": 12200
+     },
+     {
+       "epoch": 0.05,
+       "learning_rate": 0.0005609664058713396,
+       "loss": 4.63,
+       "step": 12400
+     },
+     {
+       "epoch": 0.05,
+       "learning_rate": 0.0005661703363245996,
+       "loss": 4.6307,
+       "step": 12600
+     },
+     {
+       "epoch": 0.06,
+       "learning_rate": 0.0005714555727486404,
+       "loss": 4.5881,
+       "step": 12800
+     },
+     {
+       "epoch": 0.06,
+       "learning_rate": 0.0005768220224077955,
+       "loss": 4.5489,
+       "step": 13000
+     },
+     {
+       "epoch": 0.06,
+       "learning_rate": 0.0005822695911414169,
+       "loss": 4.5521,
+       "step": 13200
+     },
+     {
+       "epoch": 0.06,
+       "learning_rate": 0.0005877981833655298,
+       "loss": 4.5165,
+       "step": 13400
+     },
+     {
+       "epoch": 0.06,
+       "learning_rate": 0.0005934077020745051,
+       "loss": 4.505,
+       "step": 13600
+     },
+     {
+       "epoch": 0.06,
+       "learning_rate": 0.0005990980488427659,
+       "loss": 4.4863,
+       "step": 13800
+     },
+     {
+       "epoch": 0.06,
+       "learning_rate": 0.000604869123826509,
+       "loss": 4.5071,
+       "step": 14000
+     },
+     {
+       "epoch": 0.06,
+       "learning_rate": 0.0006107208257654633,
+       "loss": 4.4501,
+       "step": 14200
+     },
+     {
+       "epoch": 0.06,
+       "learning_rate": 0.0006166530519846631,
+       "loss": 4.4623,
+       "step": 14400
+     },
+     {
+       "epoch": 0.06,
+       "learning_rate": 0.0006226656983962468,
+       "loss": 4.4336,
+       "step": 14600
+     },
+     {
+       "epoch": 0.06,
+       "learning_rate": 0.0006287586595012887,
+       "loss": 4.4335,
+       "step": 14800
+     },
+     {
+       "epoch": 0.06,
+       "learning_rate": 0.000634931828391647,
+       "loss": 4.4142,
+       "step": 15000
+     },
+     {
+       "epoch": 0.07,
+       "learning_rate": 0.0006411850967518416,
+       "loss": 4.4145,
+       "step": 15200
+     },
+     {
+       "epoch": 0.07,
+       "learning_rate": 0.0006475183548609511,
+       "loss": 4.3842,
+       "step": 15400
+     },
+     {
+       "epoch": 0.07,
+       "learning_rate": 0.0006539314915945428,
+       "loss": 4.3748,
+       "step": 15600
+     },
+     {
+       "epoch": 0.07,
+       "learning_rate": 0.0006604243944266178,
+       "loss": 4.3815,
+       "step": 15800
+     },
+     {
+       "epoch": 0.07,
+       "learning_rate": 0.0006669969494315867,
+       "loss": 4.352,
+       "step": 16000
+     },
+     {
+       "epoch": 0.07,
+       "learning_rate": 0.0006736490412862749,
+       "loss": 4.3575,
+       "step": 16200
+     },
+     {
+       "epoch": 0.07,
+       "learning_rate": 0.000680380553271933,
+       "loss": 4.3416,
+       "step": 16400
+     },
+     {
+       "epoch": 0.07,
+       "learning_rate": 0.0006871913672762998,
+       "loss": 4.341,
+       "step": 16600
+     },
+     {
+       "epoch": 0.07,
+       "learning_rate": 0.0006940813637956594,
+       "loss": 4.3183,
+       "step": 16800
+     },
+     {
+       "epoch": 0.07,
+       "learning_rate": 0.0007010504219369541,
+       "loss": 4.3145,
+       "step": 17000
+     },
+     {
+       "epoch": 0.07,
+       "learning_rate": 0.0007080984194198885,
+       "loss": 4.3065,
+       "step": 17200
+     },
+     {
+       "epoch": 0.07,
+       "learning_rate": 0.0007152252325790948,
+       "loss": 4.2805,
+       "step": 17400
+     },
+     {
+       "epoch": 0.08,
+       "learning_rate": 0.0007224307363662818,
+       "loss": 4.2804,
+       "step": 17600
+     },
+     {
+       "epoch": 0.08,
+       "learning_rate": 0.0007297148043524434,
+       "loss": 4.2996,
+       "step": 17800
+     },
+     {
+       "epoch": 0.08,
+       "learning_rate": 0.0007370773087300737,
+       "loss": 4.2743,
+       "step": 18000
+     },
+     {
+       "epoch": 0.08,
+       "learning_rate": 0.0007445181203154048,
+       "loss": 4.2621,
+       "step": 18200
+     },
+     {
+       "epoch": 0.08,
+       "learning_rate": 0.0007520371085506811,
+       "loss": 4.2548,
+       "step": 18400
+     },
+     {
+       "epoch": 0.08,
+       "learning_rate": 0.0007596341415064441,
+       "loss": 4.2643,
+       "step": 18600
+     },
+     {
+       "epoch": 0.08,
+       "learning_rate": 0.0007673090858838494,
+       "loss": 4.266,
+       "step": 18800
+     },
+     {
+       "epoch": 0.08,
+       "learning_rate": 0.0007750618070170041,
+       "loss": 4.2503,
+       "step": 19000
+     },
+     {
+       "epoch": 0.08,
+       "learning_rate": 0.0007828921688753324,
+       "loss": 4.2093,
+       "step": 19200
+     },
+     {
+       "epoch": 0.08,
+       "learning_rate": 0.0007908000340659631,
+       "loss": 4.2449,
+       "step": 19400
+     },
+     {
+       "epoch": 0.08,
+       "learning_rate": 0.0007987852638361333,
+       "loss": 4.2158,
+       "step": 19600
+     },
+     {
+       "epoch": 0.09,
+       "learning_rate": 0.0008068477180756314,
+       "loss": 4.202,
+       "step": 19800
+     },
+     {
+       "epoch": 0.09,
+       "learning_rate": 0.0008149872553192515,
+       "loss": 4.2065,
+       "step": 20000
      }
    ],
    "max_steps": 500000,
    "num_train_epochs": 3,
-   "total_flos": 1.593829982208e+16,
+   "total_flos": 3.187659964416e+16,
    "trial_name": null,
    "trial_params": null
  }
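trainer_state.json accumulates the full log_history, so the roughly 300 added lines above are just the loss and learning-rate entries logged every 200 steps between 10200 and 20000 (the learning rate is still climbing, which looks like a warmup phase, though the exact scheduler isn't visible here). A small sketch, assuming the checkpoint folder has been downloaded locally, of pulling those curves back out:

```python
# Sketch: read the logged loss / learning-rate curve from a local copy of
# last-checkpoint/trainer_state.json (local path assumed).
import json

with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

print(state["global_step"], state["epoch"])  # 20000, ~0.0859

for entry in state["log_history"]:
    # entries logged during training carry epoch, learning_rate, loss and step
    if "loss" in entry:
        print(entry["step"], entry["learning_rate"], entry["loss"])
```

Resuming is the point of the last-checkpoint/ copy: with the same TrainingArguments, trainer.train(resume_from_checkpoint="last-checkpoint") reloads the optimizer, scheduler and RNG state files updated in this commit.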
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:64b56da0bd1eae8d31f23f66326261f592bc15670d3bf8cb6c7469bc8473bee5
+ oid sha256:669ff7fd28968817843d8d3e735a9f1604e6f86bd0620d14ba500c796ee6cb84
  size 146774203
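The root-level pytorch_model.bin receives the same new weights as the checkpoint copy, so the step-20000 model can be loaded straight from this revision. A minimal sketch, assuming the repository also holds a matching config.json that AutoModel can resolve; the repo id is again a placeholder:

```python
# Sketch: load the weights exactly as of this commit (revision 35deea0).
from transformers import AutoModel

model = AutoModel.from_pretrained("Bingsu/<repo>", revision="35deea0")  # placeholder repo id
print(sum(p.numel() for p in model.parameters()))  # quick sanity check on parameter count
```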