Joemgu commited on
Commit
081d7ab
1 Parent(s): 0e6cbec

Training in progress, step 600

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ebd7f9d6ab26cb2185ed05f18ace8d5014457a65bafae8910ab26fc80031697d
3
  size 4736616809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edc9882338455f81dd1e85ace29ccf7af7da229453c5afb0d6c4bc90350a8b18
3
  size 4736616809
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:04346cfbee02636e929bbdd06ddd0aa6ecb2b8f64b73fa9d623a5793dd51c467
3
  size 2368281769
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1ac718a5450b8f5bc3e788c6d41748c97c82f6bf2933bbce997b0073b1c8202
3
  size 2368281769
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e39be00b4df7469aee65d451426dc5d779473844ca0a295cda433a67a93c87c0
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:425eded187a10b59dd6706ea8bd8dddc527ddb901485b28745924b7945f3098d
3
  size 14575
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8deb6207c940c5aabc87cab77725a0645b55bccfd9f3b35177ac6784668070e4
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb5f790e340963b8140823821a1411a83e471c6359b6fd89f0bb6a4aa0276e15
3
  size 627
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "best_metric": 2.0627405643463135,
3
- "best_model_checkpoint": "output/checkpoint-400",
4
- "epoch": 0.27347359537658705,
5
- "global_step": 400,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -2422,11 +2422,1219 @@
2422
  "eval_samples_per_second": 5.75,
2423
  "eval_steps_per_second": 5.75,
2424
  "step": 400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2425
  }
2426
  ],
2427
  "max_steps": 4386,
2428
  "num_train_epochs": 3,
2429
- "total_flos": 4.8975138397771776e+17,
2430
  "trial_name": null,
2431
  "trial_params": null
2432
  }
 
1
  {
2
+ "best_metric": 2.047600030899048,
3
+ "best_model_checkpoint": "output/checkpoint-600",
4
+ "epoch": 0.41021039306488055,
5
+ "global_step": 600,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
2422
  "eval_samples_per_second": 5.75,
2423
  "eval_steps_per_second": 5.75,
2424
  "step": 400
2425
+ },
2426
+ {
2427
+ "epoch": 0.27,
2428
+ "learning_rate": 0.000571189679885332,
2429
+ "loss": 2.1854,
2430
+ "step": 401
2431
+ },
2432
+ {
2433
+ "epoch": 0.27,
2434
+ "learning_rate": 0.0005710463449593884,
2435
+ "loss": 2.1751,
2436
+ "step": 402
2437
+ },
2438
+ {
2439
+ "epoch": 0.28,
2440
+ "learning_rate": 0.0005709030100334448,
2441
+ "loss": 2.2177,
2442
+ "step": 403
2443
+ },
2444
+ {
2445
+ "epoch": 0.28,
2446
+ "learning_rate": 0.0005707596751075012,
2447
+ "loss": 2.1151,
2448
+ "step": 404
2449
+ },
2450
+ {
2451
+ "epoch": 0.28,
2452
+ "learning_rate": 0.0005706163401815575,
2453
+ "loss": 2.1397,
2454
+ "step": 405
2455
+ },
2456
+ {
2457
+ "epoch": 0.28,
2458
+ "learning_rate": 0.0005704730052556139,
2459
+ "loss": 2.1307,
2460
+ "step": 406
2461
+ },
2462
+ {
2463
+ "epoch": 0.28,
2464
+ "learning_rate": 0.0005703296703296703,
2465
+ "loss": 2.1163,
2466
+ "step": 407
2467
+ },
2468
+ {
2469
+ "epoch": 0.28,
2470
+ "learning_rate": 0.0005701863354037267,
2471
+ "loss": 2.2228,
2472
+ "step": 408
2473
+ },
2474
+ {
2475
+ "epoch": 0.28,
2476
+ "learning_rate": 0.000570043000477783,
2477
+ "loss": 2.0841,
2478
+ "step": 409
2479
+ },
2480
+ {
2481
+ "epoch": 0.28,
2482
+ "learning_rate": 0.0005698996655518395,
2483
+ "loss": 2.2314,
2484
+ "step": 410
2485
+ },
2486
+ {
2487
+ "epoch": 0.28,
2488
+ "learning_rate": 0.0005697563306258957,
2489
+ "loss": 2.2701,
2490
+ "step": 411
2491
+ },
2492
+ {
2493
+ "epoch": 0.28,
2494
+ "learning_rate": 0.0005696129956999522,
2495
+ "loss": 2.1293,
2496
+ "step": 412
2497
+ },
2498
+ {
2499
+ "epoch": 0.28,
2500
+ "learning_rate": 0.0005694696607740085,
2501
+ "loss": 2.1884,
2502
+ "step": 413
2503
+ },
2504
+ {
2505
+ "epoch": 0.28,
2506
+ "learning_rate": 0.0005693263258480649,
2507
+ "loss": 2.1422,
2508
+ "step": 414
2509
+ },
2510
+ {
2511
+ "epoch": 0.28,
2512
+ "learning_rate": 0.0005691829909221213,
2513
+ "loss": 2.1939,
2514
+ "step": 415
2515
+ },
2516
+ {
2517
+ "epoch": 0.28,
2518
+ "learning_rate": 0.0005690396559961777,
2519
+ "loss": 2.1862,
2520
+ "step": 416
2521
+ },
2522
+ {
2523
+ "epoch": 0.29,
2524
+ "learning_rate": 0.000568896321070234,
2525
+ "loss": 2.2965,
2526
+ "step": 417
2527
+ },
2528
+ {
2529
+ "epoch": 0.29,
2530
+ "learning_rate": 0.0005687529861442904,
2531
+ "loss": 2.2042,
2532
+ "step": 418
2533
+ },
2534
+ {
2535
+ "epoch": 0.29,
2536
+ "learning_rate": 0.0005686096512183468,
2537
+ "loss": 2.1242,
2538
+ "step": 419
2539
+ },
2540
+ {
2541
+ "epoch": 0.29,
2542
+ "learning_rate": 0.0005684663162924032,
2543
+ "loss": 2.1837,
2544
+ "step": 420
2545
+ },
2546
+ {
2547
+ "epoch": 0.29,
2548
+ "learning_rate": 0.0005683229813664596,
2549
+ "loss": 2.1068,
2550
+ "step": 421
2551
+ },
2552
+ {
2553
+ "epoch": 0.29,
2554
+ "learning_rate": 0.000568179646440516,
2555
+ "loss": 2.2236,
2556
+ "step": 422
2557
+ },
2558
+ {
2559
+ "epoch": 0.29,
2560
+ "learning_rate": 0.0005680363115145723,
2561
+ "loss": 2.1684,
2562
+ "step": 423
2563
+ },
2564
+ {
2565
+ "epoch": 0.29,
2566
+ "learning_rate": 0.0005678929765886287,
2567
+ "loss": 2.1441,
2568
+ "step": 424
2569
+ },
2570
+ {
2571
+ "epoch": 0.29,
2572
+ "learning_rate": 0.0005677496416626851,
2573
+ "loss": 2.1091,
2574
+ "step": 425
2575
+ },
2576
+ {
2577
+ "epoch": 0.29,
2578
+ "learning_rate": 0.0005676063067367415,
2579
+ "loss": 2.2047,
2580
+ "step": 426
2581
+ },
2582
+ {
2583
+ "epoch": 0.29,
2584
+ "learning_rate": 0.0005674629718107979,
2585
+ "loss": 2.1858,
2586
+ "step": 427
2587
+ },
2588
+ {
2589
+ "epoch": 0.29,
2590
+ "learning_rate": 0.0005673196368848541,
2591
+ "loss": 2.141,
2592
+ "step": 428
2593
+ },
2594
+ {
2595
+ "epoch": 0.29,
2596
+ "learning_rate": 0.0005671763019589106,
2597
+ "loss": 2.1447,
2598
+ "step": 429
2599
+ },
2600
+ {
2601
+ "epoch": 0.29,
2602
+ "learning_rate": 0.0005670329670329669,
2603
+ "loss": 2.1216,
2604
+ "step": 430
2605
+ },
2606
+ {
2607
+ "epoch": 0.29,
2608
+ "learning_rate": 0.0005668896321070234,
2609
+ "loss": 2.1843,
2610
+ "step": 431
2611
+ },
2612
+ {
2613
+ "epoch": 0.3,
2614
+ "learning_rate": 0.0005667462971810797,
2615
+ "loss": 2.3159,
2616
+ "step": 432
2617
+ },
2618
+ {
2619
+ "epoch": 0.3,
2620
+ "learning_rate": 0.0005666029622551362,
2621
+ "loss": 2.2143,
2622
+ "step": 433
2623
+ },
2624
+ {
2625
+ "epoch": 0.3,
2626
+ "learning_rate": 0.0005664596273291925,
2627
+ "loss": 2.1456,
2628
+ "step": 434
2629
+ },
2630
+ {
2631
+ "epoch": 0.3,
2632
+ "learning_rate": 0.0005663162924032488,
2633
+ "loss": 2.1061,
2634
+ "step": 435
2635
+ },
2636
+ {
2637
+ "epoch": 0.3,
2638
+ "learning_rate": 0.0005661729574773052,
2639
+ "loss": 2.2449,
2640
+ "step": 436
2641
+ },
2642
+ {
2643
+ "epoch": 0.3,
2644
+ "learning_rate": 0.0005660296225513616,
2645
+ "loss": 2.1625,
2646
+ "step": 437
2647
+ },
2648
+ {
2649
+ "epoch": 0.3,
2650
+ "learning_rate": 0.000565886287625418,
2651
+ "loss": 2.1141,
2652
+ "step": 438
2653
+ },
2654
+ {
2655
+ "epoch": 0.3,
2656
+ "learning_rate": 0.0005657429526994744,
2657
+ "loss": 2.1314,
2658
+ "step": 439
2659
+ },
2660
+ {
2661
+ "epoch": 0.3,
2662
+ "learning_rate": 0.0005655996177735308,
2663
+ "loss": 2.1726,
2664
+ "step": 440
2665
+ },
2666
+ {
2667
+ "epoch": 0.3,
2668
+ "learning_rate": 0.0005654562828475871,
2669
+ "loss": 2.1889,
2670
+ "step": 441
2671
+ },
2672
+ {
2673
+ "epoch": 0.3,
2674
+ "learning_rate": 0.0005653129479216435,
2675
+ "loss": 2.2172,
2676
+ "step": 442
2677
+ },
2678
+ {
2679
+ "epoch": 0.3,
2680
+ "learning_rate": 0.0005651696129956999,
2681
+ "loss": 2.1766,
2682
+ "step": 443
2683
+ },
2684
+ {
2685
+ "epoch": 0.3,
2686
+ "learning_rate": 0.0005650262780697563,
2687
+ "loss": 2.0534,
2688
+ "step": 444
2689
+ },
2690
+ {
2691
+ "epoch": 0.3,
2692
+ "learning_rate": 0.0005648829431438127,
2693
+ "loss": 2.1665,
2694
+ "step": 445
2695
+ },
2696
+ {
2697
+ "epoch": 0.3,
2698
+ "learning_rate": 0.0005647396082178691,
2699
+ "loss": 2.1383,
2700
+ "step": 446
2701
+ },
2702
+ {
2703
+ "epoch": 0.31,
2704
+ "learning_rate": 0.0005645962732919254,
2705
+ "loss": 2.1737,
2706
+ "step": 447
2707
+ },
2708
+ {
2709
+ "epoch": 0.31,
2710
+ "learning_rate": 0.0005644529383659818,
2711
+ "loss": 2.1792,
2712
+ "step": 448
2713
+ },
2714
+ {
2715
+ "epoch": 0.31,
2716
+ "learning_rate": 0.0005643096034400381,
2717
+ "loss": 2.2596,
2718
+ "step": 449
2719
+ },
2720
+ {
2721
+ "epoch": 0.31,
2722
+ "learning_rate": 0.0005641662685140946,
2723
+ "loss": 2.1804,
2724
+ "step": 450
2725
+ },
2726
+ {
2727
+ "epoch": 0.31,
2728
+ "learning_rate": 0.0005640229335881509,
2729
+ "loss": 2.0674,
2730
+ "step": 451
2731
+ },
2732
+ {
2733
+ "epoch": 0.31,
2734
+ "learning_rate": 0.0005638795986622074,
2735
+ "loss": 2.0845,
2736
+ "step": 452
2737
+ },
2738
+ {
2739
+ "epoch": 0.31,
2740
+ "learning_rate": 0.0005637362637362636,
2741
+ "loss": 2.0754,
2742
+ "step": 453
2743
+ },
2744
+ {
2745
+ "epoch": 0.31,
2746
+ "learning_rate": 0.00056359292881032,
2747
+ "loss": 2.1973,
2748
+ "step": 454
2749
+ },
2750
+ {
2751
+ "epoch": 0.31,
2752
+ "learning_rate": 0.0005634495938843764,
2753
+ "loss": 2.1183,
2754
+ "step": 455
2755
+ },
2756
+ {
2757
+ "epoch": 0.31,
2758
+ "learning_rate": 0.0005633062589584328,
2759
+ "loss": 2.1111,
2760
+ "step": 456
2761
+ },
2762
+ {
2763
+ "epoch": 0.31,
2764
+ "learning_rate": 0.0005631629240324892,
2765
+ "loss": 2.1094,
2766
+ "step": 457
2767
+ },
2768
+ {
2769
+ "epoch": 0.31,
2770
+ "learning_rate": 0.0005630195891065456,
2771
+ "loss": 2.0889,
2772
+ "step": 458
2773
+ },
2774
+ {
2775
+ "epoch": 0.31,
2776
+ "learning_rate": 0.0005628762541806019,
2777
+ "loss": 2.2136,
2778
+ "step": 459
2779
+ },
2780
+ {
2781
+ "epoch": 0.31,
2782
+ "learning_rate": 0.0005627329192546583,
2783
+ "loss": 2.1521,
2784
+ "step": 460
2785
+ },
2786
+ {
2787
+ "epoch": 0.32,
2788
+ "learning_rate": 0.0005625895843287147,
2789
+ "loss": 2.1277,
2790
+ "step": 461
2791
+ },
2792
+ {
2793
+ "epoch": 0.32,
2794
+ "learning_rate": 0.0005624462494027711,
2795
+ "loss": 2.2086,
2796
+ "step": 462
2797
+ },
2798
+ {
2799
+ "epoch": 0.32,
2800
+ "learning_rate": 0.0005623029144768275,
2801
+ "loss": 2.139,
2802
+ "step": 463
2803
+ },
2804
+ {
2805
+ "epoch": 0.32,
2806
+ "learning_rate": 0.0005621595795508839,
2807
+ "loss": 2.0627,
2808
+ "step": 464
2809
+ },
2810
+ {
2811
+ "epoch": 0.32,
2812
+ "learning_rate": 0.0005620162446249402,
2813
+ "loss": 2.2356,
2814
+ "step": 465
2815
+ },
2816
+ {
2817
+ "epoch": 0.32,
2818
+ "learning_rate": 0.0005618729096989966,
2819
+ "loss": 2.2947,
2820
+ "step": 466
2821
+ },
2822
+ {
2823
+ "epoch": 0.32,
2824
+ "learning_rate": 0.000561729574773053,
2825
+ "loss": 2.1619,
2826
+ "step": 467
2827
+ },
2828
+ {
2829
+ "epoch": 0.32,
2830
+ "learning_rate": 0.0005615862398471093,
2831
+ "loss": 2.2515,
2832
+ "step": 468
2833
+ },
2834
+ {
2835
+ "epoch": 0.32,
2836
+ "learning_rate": 0.0005614429049211658,
2837
+ "loss": 2.1746,
2838
+ "step": 469
2839
+ },
2840
+ {
2841
+ "epoch": 0.32,
2842
+ "learning_rate": 0.000561299569995222,
2843
+ "loss": 2.259,
2844
+ "step": 470
2845
+ },
2846
+ {
2847
+ "epoch": 0.32,
2848
+ "learning_rate": 0.0005611562350692785,
2849
+ "loss": 2.2093,
2850
+ "step": 471
2851
+ },
2852
+ {
2853
+ "epoch": 0.32,
2854
+ "learning_rate": 0.0005610129001433348,
2855
+ "loss": 2.2077,
2856
+ "step": 472
2857
+ },
2858
+ {
2859
+ "epoch": 0.32,
2860
+ "learning_rate": 0.0005608695652173913,
2861
+ "loss": 2.2464,
2862
+ "step": 473
2863
+ },
2864
+ {
2865
+ "epoch": 0.32,
2866
+ "learning_rate": 0.0005607262302914476,
2867
+ "loss": 2.0556,
2868
+ "step": 474
2869
+ },
2870
+ {
2871
+ "epoch": 0.32,
2872
+ "learning_rate": 0.000560582895365504,
2873
+ "loss": 2.135,
2874
+ "step": 475
2875
+ },
2876
+ {
2877
+ "epoch": 0.33,
2878
+ "learning_rate": 0.0005604395604395604,
2879
+ "loss": 2.1812,
2880
+ "step": 476
2881
+ },
2882
+ {
2883
+ "epoch": 0.33,
2884
+ "learning_rate": 0.0005602962255136167,
2885
+ "loss": 2.1082,
2886
+ "step": 477
2887
+ },
2888
+ {
2889
+ "epoch": 0.33,
2890
+ "learning_rate": 0.0005601528905876731,
2891
+ "loss": 2.0997,
2892
+ "step": 478
2893
+ },
2894
+ {
2895
+ "epoch": 0.33,
2896
+ "learning_rate": 0.0005600095556617295,
2897
+ "loss": 2.1532,
2898
+ "step": 479
2899
+ },
2900
+ {
2901
+ "epoch": 0.33,
2902
+ "learning_rate": 0.0005598662207357859,
2903
+ "loss": 2.1389,
2904
+ "step": 480
2905
+ },
2906
+ {
2907
+ "epoch": 0.33,
2908
+ "learning_rate": 0.0005597228858098423,
2909
+ "loss": 2.1122,
2910
+ "step": 481
2911
+ },
2912
+ {
2913
+ "epoch": 0.33,
2914
+ "learning_rate": 0.0005595795508838987,
2915
+ "loss": 2.1091,
2916
+ "step": 482
2917
+ },
2918
+ {
2919
+ "epoch": 0.33,
2920
+ "learning_rate": 0.000559436215957955,
2921
+ "loss": 2.1699,
2922
+ "step": 483
2923
+ },
2924
+ {
2925
+ "epoch": 0.33,
2926
+ "learning_rate": 0.0005592928810320114,
2927
+ "loss": 2.1043,
2928
+ "step": 484
2929
+ },
2930
+ {
2931
+ "epoch": 0.33,
2932
+ "learning_rate": 0.0005591495461060678,
2933
+ "loss": 2.143,
2934
+ "step": 485
2935
+ },
2936
+ {
2937
+ "epoch": 0.33,
2938
+ "learning_rate": 0.0005590062111801242,
2939
+ "loss": 2.1184,
2940
+ "step": 486
2941
+ },
2942
+ {
2943
+ "epoch": 0.33,
2944
+ "learning_rate": 0.0005588628762541806,
2945
+ "loss": 2.0185,
2946
+ "step": 487
2947
+ },
2948
+ {
2949
+ "epoch": 0.33,
2950
+ "learning_rate": 0.000558719541328237,
2951
+ "loss": 2.2015,
2952
+ "step": 488
2953
+ },
2954
+ {
2955
+ "epoch": 0.33,
2956
+ "learning_rate": 0.0005585762064022932,
2957
+ "loss": 2.2501,
2958
+ "step": 489
2959
+ },
2960
+ {
2961
+ "epoch": 0.34,
2962
+ "learning_rate": 0.0005584328714763497,
2963
+ "loss": 2.1088,
2964
+ "step": 490
2965
+ },
2966
+ {
2967
+ "epoch": 0.34,
2968
+ "learning_rate": 0.000558289536550406,
2969
+ "loss": 2.0932,
2970
+ "step": 491
2971
+ },
2972
+ {
2973
+ "epoch": 0.34,
2974
+ "learning_rate": 0.0005581462016244625,
2975
+ "loss": 2.1717,
2976
+ "step": 492
2977
+ },
2978
+ {
2979
+ "epoch": 0.34,
2980
+ "learning_rate": 0.0005580028666985188,
2981
+ "loss": 2.1509,
2982
+ "step": 493
2983
+ },
2984
+ {
2985
+ "epoch": 0.34,
2986
+ "learning_rate": 0.0005578595317725753,
2987
+ "loss": 2.1354,
2988
+ "step": 494
2989
+ },
2990
+ {
2991
+ "epoch": 0.34,
2992
+ "learning_rate": 0.0005577161968466315,
2993
+ "loss": 2.2007,
2994
+ "step": 495
2995
+ },
2996
+ {
2997
+ "epoch": 0.34,
2998
+ "learning_rate": 0.0005575728619206879,
2999
+ "loss": 2.0769,
3000
+ "step": 496
3001
+ },
3002
+ {
3003
+ "epoch": 0.34,
3004
+ "learning_rate": 0.0005574295269947443,
3005
+ "loss": 2.1375,
3006
+ "step": 497
3007
+ },
3008
+ {
3009
+ "epoch": 0.34,
3010
+ "learning_rate": 0.0005572861920688007,
3011
+ "loss": 2.1743,
3012
+ "step": 498
3013
+ },
3014
+ {
3015
+ "epoch": 0.34,
3016
+ "learning_rate": 0.0005571428571428571,
3017
+ "loss": 2.1998,
3018
+ "step": 499
3019
+ },
3020
+ {
3021
+ "epoch": 0.34,
3022
+ "learning_rate": 0.0005569995222169135,
3023
+ "loss": 2.2041,
3024
+ "step": 500
3025
+ },
3026
+ {
3027
+ "epoch": 0.34,
3028
+ "learning_rate": 0.0005568561872909698,
3029
+ "loss": 2.1778,
3030
+ "step": 501
3031
+ },
3032
+ {
3033
+ "epoch": 0.34,
3034
+ "learning_rate": 0.0005567128523650262,
3035
+ "loss": 2.2309,
3036
+ "step": 502
3037
+ },
3038
+ {
3039
+ "epoch": 0.34,
3040
+ "learning_rate": 0.0005565695174390826,
3041
+ "loss": 2.1322,
3042
+ "step": 503
3043
+ },
3044
+ {
3045
+ "epoch": 0.34,
3046
+ "learning_rate": 0.000556426182513139,
3047
+ "loss": 2.1951,
3048
+ "step": 504
3049
+ },
3050
+ {
3051
+ "epoch": 0.35,
3052
+ "learning_rate": 0.0005562828475871954,
3053
+ "loss": 2.2756,
3054
+ "step": 505
3055
+ },
3056
+ {
3057
+ "epoch": 0.35,
3058
+ "learning_rate": 0.0005561395126612518,
3059
+ "loss": 2.0972,
3060
+ "step": 506
3061
+ },
3062
+ {
3063
+ "epoch": 0.35,
3064
+ "learning_rate": 0.0005559961777353081,
3065
+ "loss": 2.138,
3066
+ "step": 507
3067
+ },
3068
+ {
3069
+ "epoch": 0.35,
3070
+ "learning_rate": 0.0005558528428093645,
3071
+ "loss": 2.1627,
3072
+ "step": 508
3073
+ },
3074
+ {
3075
+ "epoch": 0.35,
3076
+ "learning_rate": 0.0005557095078834209,
3077
+ "loss": 2.1876,
3078
+ "step": 509
3079
+ },
3080
+ {
3081
+ "epoch": 0.35,
3082
+ "learning_rate": 0.0005555661729574772,
3083
+ "loss": 2.1359,
3084
+ "step": 510
3085
+ },
3086
+ {
3087
+ "epoch": 0.35,
3088
+ "learning_rate": 0.0005554228380315337,
3089
+ "loss": 2.2495,
3090
+ "step": 511
3091
+ },
3092
+ {
3093
+ "epoch": 0.35,
3094
+ "learning_rate": 0.00055527950310559,
3095
+ "loss": 2.1951,
3096
+ "step": 512
3097
+ },
3098
+ {
3099
+ "epoch": 0.35,
3100
+ "learning_rate": 0.0005551361681796464,
3101
+ "loss": 2.1939,
3102
+ "step": 513
3103
+ },
3104
+ {
3105
+ "epoch": 0.35,
3106
+ "learning_rate": 0.0005549928332537027,
3107
+ "loss": 2.12,
3108
+ "step": 514
3109
+ },
3110
+ {
3111
+ "epoch": 0.35,
3112
+ "learning_rate": 0.0005548494983277592,
3113
+ "loss": 2.1258,
3114
+ "step": 515
3115
+ },
3116
+ {
3117
+ "epoch": 0.35,
3118
+ "learning_rate": 0.0005547061634018155,
3119
+ "loss": 2.2273,
3120
+ "step": 516
3121
+ },
3122
+ {
3123
+ "epoch": 0.35,
3124
+ "learning_rate": 0.0005545628284758719,
3125
+ "loss": 2.1856,
3126
+ "step": 517
3127
+ },
3128
+ {
3129
+ "epoch": 0.35,
3130
+ "learning_rate": 0.0005544194935499283,
3131
+ "loss": 2.0875,
3132
+ "step": 518
3133
+ },
3134
+ {
3135
+ "epoch": 0.35,
3136
+ "learning_rate": 0.0005542761586239846,
3137
+ "loss": 2.0916,
3138
+ "step": 519
3139
+ },
3140
+ {
3141
+ "epoch": 0.36,
3142
+ "learning_rate": 0.000554132823698041,
3143
+ "loss": 2.094,
3144
+ "step": 520
3145
+ },
3146
+ {
3147
+ "epoch": 0.36,
3148
+ "learning_rate": 0.0005539894887720974,
3149
+ "loss": 2.1244,
3150
+ "step": 521
3151
+ },
3152
+ {
3153
+ "epoch": 0.36,
3154
+ "learning_rate": 0.0005538461538461538,
3155
+ "loss": 2.1668,
3156
+ "step": 522
3157
+ },
3158
+ {
3159
+ "epoch": 0.36,
3160
+ "learning_rate": 0.0005537028189202102,
3161
+ "loss": 2.1785,
3162
+ "step": 523
3163
+ },
3164
+ {
3165
+ "epoch": 0.36,
3166
+ "learning_rate": 0.0005535594839942666,
3167
+ "loss": 2.1497,
3168
+ "step": 524
3169
+ },
3170
+ {
3171
+ "epoch": 0.36,
3172
+ "learning_rate": 0.0005534161490683229,
3173
+ "loss": 2.2136,
3174
+ "step": 525
3175
+ },
3176
+ {
3177
+ "epoch": 0.36,
3178
+ "learning_rate": 0.0005532728141423793,
3179
+ "loss": 2.1554,
3180
+ "step": 526
3181
+ },
3182
+ {
3183
+ "epoch": 0.36,
3184
+ "learning_rate": 0.0005531294792164357,
3185
+ "loss": 2.1288,
3186
+ "step": 527
3187
+ },
3188
+ {
3189
+ "epoch": 0.36,
3190
+ "learning_rate": 0.0005529861442904921,
3191
+ "loss": 2.1221,
3192
+ "step": 528
3193
+ },
3194
+ {
3195
+ "epoch": 0.36,
3196
+ "learning_rate": 0.0005528428093645485,
3197
+ "loss": 2.1583,
3198
+ "step": 529
3199
+ },
3200
+ {
3201
+ "epoch": 0.36,
3202
+ "learning_rate": 0.0005526994744386049,
3203
+ "loss": 2.1514,
3204
+ "step": 530
3205
+ },
3206
+ {
3207
+ "epoch": 0.36,
3208
+ "learning_rate": 0.0005525561395126611,
3209
+ "loss": 2.2256,
3210
+ "step": 531
3211
+ },
3212
+ {
3213
+ "epoch": 0.36,
3214
+ "learning_rate": 0.0005524128045867176,
3215
+ "loss": 2.1435,
3216
+ "step": 532
3217
+ },
3218
+ {
3219
+ "epoch": 0.36,
3220
+ "learning_rate": 0.0005522694696607739,
3221
+ "loss": 2.169,
3222
+ "step": 533
3223
+ },
3224
+ {
3225
+ "epoch": 0.37,
3226
+ "learning_rate": 0.0005521261347348304,
3227
+ "loss": 2.1734,
3228
+ "step": 534
3229
+ },
3230
+ {
3231
+ "epoch": 0.37,
3232
+ "learning_rate": 0.0005519827998088867,
3233
+ "loss": 2.0882,
3234
+ "step": 535
3235
+ },
3236
+ {
3237
+ "epoch": 0.37,
3238
+ "learning_rate": 0.0005518394648829432,
3239
+ "loss": 2.1364,
3240
+ "step": 536
3241
+ },
3242
+ {
3243
+ "epoch": 0.37,
3244
+ "learning_rate": 0.0005516961299569994,
3245
+ "loss": 2.1544,
3246
+ "step": 537
3247
+ },
3248
+ {
3249
+ "epoch": 0.37,
3250
+ "learning_rate": 0.0005515527950310558,
3251
+ "loss": 2.1356,
3252
+ "step": 538
3253
+ },
3254
+ {
3255
+ "epoch": 0.37,
3256
+ "learning_rate": 0.0005514094601051122,
3257
+ "loss": 2.019,
3258
+ "step": 539
3259
+ },
3260
+ {
3261
+ "epoch": 0.37,
3262
+ "learning_rate": 0.0005512661251791686,
3263
+ "loss": 2.1198,
3264
+ "step": 540
3265
+ },
3266
+ {
3267
+ "epoch": 0.37,
3268
+ "learning_rate": 0.000551122790253225,
3269
+ "loss": 2.1896,
3270
+ "step": 541
3271
+ },
3272
+ {
3273
+ "epoch": 0.37,
3274
+ "learning_rate": 0.0005509794553272814,
3275
+ "loss": 2.1615,
3276
+ "step": 542
3277
+ },
3278
+ {
3279
+ "epoch": 0.37,
3280
+ "learning_rate": 0.0005508361204013377,
3281
+ "loss": 2.1163,
3282
+ "step": 543
3283
+ },
3284
+ {
3285
+ "epoch": 0.37,
3286
+ "learning_rate": 0.0005506927854753941,
3287
+ "loss": 2.2056,
3288
+ "step": 544
3289
+ },
3290
+ {
3291
+ "epoch": 0.37,
3292
+ "learning_rate": 0.0005505494505494505,
3293
+ "loss": 2.2128,
3294
+ "step": 545
3295
+ },
3296
+ {
3297
+ "epoch": 0.37,
3298
+ "learning_rate": 0.0005504061156235069,
3299
+ "loss": 2.195,
3300
+ "step": 546
3301
+ },
3302
+ {
3303
+ "epoch": 0.37,
3304
+ "learning_rate": 0.0005502627806975633,
3305
+ "loss": 2.0768,
3306
+ "step": 547
3307
+ },
3308
+ {
3309
+ "epoch": 0.37,
3310
+ "learning_rate": 0.0005501194457716197,
3311
+ "loss": 2.1667,
3312
+ "step": 548
3313
+ },
3314
+ {
3315
+ "epoch": 0.38,
3316
+ "learning_rate": 0.000549976110845676,
3317
+ "loss": 2.1035,
3318
+ "step": 549
3319
+ },
3320
+ {
3321
+ "epoch": 0.38,
3322
+ "learning_rate": 0.0005498327759197323,
3323
+ "loss": 2.1628,
3324
+ "step": 550
3325
+ },
3326
+ {
3327
+ "epoch": 0.38,
3328
+ "learning_rate": 0.0005496894409937888,
3329
+ "loss": 2.1491,
3330
+ "step": 551
3331
+ },
3332
+ {
3333
+ "epoch": 0.38,
3334
+ "learning_rate": 0.0005495461060678451,
3335
+ "loss": 2.0676,
3336
+ "step": 552
3337
+ },
3338
+ {
3339
+ "epoch": 0.38,
3340
+ "learning_rate": 0.0005494027711419016,
3341
+ "loss": 2.3462,
3342
+ "step": 553
3343
+ },
3344
+ {
3345
+ "epoch": 0.38,
3346
+ "learning_rate": 0.0005492594362159579,
3347
+ "loss": 2.107,
3348
+ "step": 554
3349
+ },
3350
+ {
3351
+ "epoch": 0.38,
3352
+ "learning_rate": 0.0005491161012900143,
3353
+ "loss": 2.1844,
3354
+ "step": 555
3355
+ },
3356
+ {
3357
+ "epoch": 0.38,
3358
+ "learning_rate": 0.0005489727663640706,
3359
+ "loss": 2.1328,
3360
+ "step": 556
3361
+ },
3362
+ {
3363
+ "epoch": 0.38,
3364
+ "learning_rate": 0.000548829431438127,
3365
+ "loss": 2.1467,
3366
+ "step": 557
3367
+ },
3368
+ {
3369
+ "epoch": 0.38,
3370
+ "learning_rate": 0.0005486860965121834,
3371
+ "loss": 2.1706,
3372
+ "step": 558
3373
+ },
3374
+ {
3375
+ "epoch": 0.38,
3376
+ "learning_rate": 0.0005485427615862398,
3377
+ "loss": 2.1649,
3378
+ "step": 559
3379
+ },
3380
+ {
3381
+ "epoch": 0.38,
3382
+ "learning_rate": 0.0005483994266602962,
3383
+ "loss": 2.154,
3384
+ "step": 560
3385
+ },
3386
+ {
3387
+ "epoch": 0.38,
3388
+ "learning_rate": 0.0005482560917343525,
3389
+ "loss": 2.1095,
3390
+ "step": 561
3391
+ },
3392
+ {
3393
+ "epoch": 0.38,
3394
+ "learning_rate": 0.0005481127568084089,
3395
+ "loss": 2.1706,
3396
+ "step": 562
3397
+ },
3398
+ {
3399
+ "epoch": 0.38,
3400
+ "learning_rate": 0.0005479694218824653,
3401
+ "loss": 2.2179,
3402
+ "step": 563
3403
+ },
3404
+ {
3405
+ "epoch": 0.39,
3406
+ "learning_rate": 0.0005478260869565217,
3407
+ "loss": 2.2188,
3408
+ "step": 564
3409
+ },
3410
+ {
3411
+ "epoch": 0.39,
3412
+ "learning_rate": 0.0005476827520305781,
3413
+ "loss": 2.1167,
3414
+ "step": 565
3415
+ },
3416
+ {
3417
+ "epoch": 0.39,
3418
+ "learning_rate": 0.0005475394171046345,
3419
+ "loss": 2.1275,
3420
+ "step": 566
3421
+ },
3422
+ {
3423
+ "epoch": 0.39,
3424
+ "learning_rate": 0.0005473960821786908,
3425
+ "loss": 2.0766,
3426
+ "step": 567
3427
+ },
3428
+ {
3429
+ "epoch": 0.39,
3430
+ "learning_rate": 0.0005472527472527472,
3431
+ "loss": 2.1719,
3432
+ "step": 568
3433
+ },
3434
+ {
3435
+ "epoch": 0.39,
3436
+ "learning_rate": 0.0005471094123268036,
3437
+ "loss": 2.2378,
3438
+ "step": 569
3439
+ },
3440
+ {
3441
+ "epoch": 0.39,
3442
+ "learning_rate": 0.00054696607740086,
3443
+ "loss": 2.1612,
3444
+ "step": 570
3445
+ },
3446
+ {
3447
+ "epoch": 0.39,
3448
+ "learning_rate": 0.0005468227424749163,
3449
+ "loss": 2.0903,
3450
+ "step": 571
3451
+ },
3452
+ {
3453
+ "epoch": 0.39,
3454
+ "learning_rate": 0.0005466794075489728,
3455
+ "loss": 2.2279,
3456
+ "step": 572
3457
+ },
3458
+ {
3459
+ "epoch": 0.39,
3460
+ "learning_rate": 0.000546536072623029,
3461
+ "loss": 2.1084,
3462
+ "step": 573
3463
+ },
3464
+ {
3465
+ "epoch": 0.39,
3466
+ "learning_rate": 0.0005463927376970855,
3467
+ "loss": 2.1652,
3468
+ "step": 574
3469
+ },
3470
+ {
3471
+ "epoch": 0.39,
3472
+ "learning_rate": 0.0005462494027711418,
3473
+ "loss": 2.1035,
3474
+ "step": 575
3475
+ },
3476
+ {
3477
+ "epoch": 0.39,
3478
+ "learning_rate": 0.0005461060678451983,
3479
+ "loss": 2.2028,
3480
+ "step": 576
3481
+ },
3482
+ {
3483
+ "epoch": 0.39,
3484
+ "learning_rate": 0.0005459627329192546,
3485
+ "loss": 2.2347,
3486
+ "step": 577
3487
+ },
3488
+ {
3489
+ "epoch": 0.4,
3490
+ "learning_rate": 0.000545819397993311,
3491
+ "loss": 2.1609,
3492
+ "step": 578
3493
+ },
3494
+ {
3495
+ "epoch": 0.4,
3496
+ "learning_rate": 0.0005456760630673673,
3497
+ "loss": 2.0815,
3498
+ "step": 579
3499
+ },
3500
+ {
3501
+ "epoch": 0.4,
3502
+ "learning_rate": 0.0005455327281414237,
3503
+ "loss": 2.0575,
3504
+ "step": 580
3505
+ },
3506
+ {
3507
+ "epoch": 0.4,
3508
+ "learning_rate": 0.0005453893932154801,
3509
+ "loss": 2.1104,
3510
+ "step": 581
3511
+ },
3512
+ {
3513
+ "epoch": 0.4,
3514
+ "learning_rate": 0.0005452460582895365,
3515
+ "loss": 2.1784,
3516
+ "step": 582
3517
+ },
3518
+ {
3519
+ "epoch": 0.4,
3520
+ "learning_rate": 0.0005451027233635929,
3521
+ "loss": 2.117,
3522
+ "step": 583
3523
+ },
3524
+ {
3525
+ "epoch": 0.4,
3526
+ "learning_rate": 0.0005449593884376493,
3527
+ "loss": 2.0947,
3528
+ "step": 584
3529
+ },
3530
+ {
3531
+ "epoch": 0.4,
3532
+ "learning_rate": 0.0005448160535117056,
3533
+ "loss": 2.1204,
3534
+ "step": 585
3535
+ },
3536
+ {
3537
+ "epoch": 0.4,
3538
+ "learning_rate": 0.000544672718585762,
3539
+ "loss": 2.0614,
3540
+ "step": 586
3541
+ },
3542
+ {
3543
+ "epoch": 0.4,
3544
+ "learning_rate": 0.0005445293836598184,
3545
+ "loss": 2.1227,
3546
+ "step": 587
3547
+ },
3548
+ {
3549
+ "epoch": 0.4,
3550
+ "learning_rate": 0.0005443860487338748,
3551
+ "loss": 2.1831,
3552
+ "step": 588
3553
+ },
3554
+ {
3555
+ "epoch": 0.4,
3556
+ "learning_rate": 0.0005442427138079312,
3557
+ "loss": 2.2267,
3558
+ "step": 589
3559
+ },
3560
+ {
3561
+ "epoch": 0.4,
3562
+ "learning_rate": 0.0005440993788819876,
3563
+ "loss": 2.1008,
3564
+ "step": 590
3565
+ },
3566
+ {
3567
+ "epoch": 0.4,
3568
+ "learning_rate": 0.0005439560439560439,
3569
+ "loss": 2.1126,
3570
+ "step": 591
3571
+ },
3572
+ {
3573
+ "epoch": 0.4,
3574
+ "learning_rate": 0.0005438127090301002,
3575
+ "loss": 2.2081,
3576
+ "step": 592
3577
+ },
3578
+ {
3579
+ "epoch": 0.41,
3580
+ "learning_rate": 0.0005436693741041567,
3581
+ "loss": 2.2372,
3582
+ "step": 593
3583
+ },
3584
+ {
3585
+ "epoch": 0.41,
3586
+ "learning_rate": 0.000543526039178213,
3587
+ "loss": 2.1783,
3588
+ "step": 594
3589
+ },
3590
+ {
3591
+ "epoch": 0.41,
3592
+ "learning_rate": 0.0005433827042522695,
3593
+ "loss": 2.1631,
3594
+ "step": 595
3595
+ },
3596
+ {
3597
+ "epoch": 0.41,
3598
+ "learning_rate": 0.0005432393693263258,
3599
+ "loss": 2.2263,
3600
+ "step": 596
3601
+ },
3602
+ {
3603
+ "epoch": 0.41,
3604
+ "learning_rate": 0.0005430960344003822,
3605
+ "loss": 2.2438,
3606
+ "step": 597
3607
+ },
3608
+ {
3609
+ "epoch": 0.41,
3610
+ "learning_rate": 0.0005429526994744385,
3611
+ "loss": 2.125,
3612
+ "step": 598
3613
+ },
3614
+ {
3615
+ "epoch": 0.41,
3616
+ "learning_rate": 0.0005428093645484949,
3617
+ "loss": 2.1655,
3618
+ "step": 599
3619
+ },
3620
+ {
3621
+ "epoch": 0.41,
3622
+ "learning_rate": 0.0005426660296225513,
3623
+ "loss": 2.241,
3624
+ "step": 600
3625
+ },
3626
+ {
3627
+ "epoch": 0.41,
3628
+ "eval_loss": 2.047600030899048,
3629
+ "eval_runtime": 1736.5603,
3630
+ "eval_samples_per_second": 5.759,
3631
+ "eval_steps_per_second": 5.759,
3632
+ "step": 600
3633
  }
3634
  ],
3635
  "max_steps": 4386,
3636
  "num_train_epochs": 3,
3637
+ "total_flos": 7.368734337271511e+17,
3638
  "trial_name": null,
3639
  "trial_params": null
3640
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:04346cfbee02636e929bbdd06ddd0aa6ecb2b8f64b73fa9d623a5793dd51c467
3
  size 2368281769
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1ac718a5450b8f5bc3e788c6d41748c97c82f6bf2933bbce997b0073b1c8202
3
  size 2368281769