EColi committed on
Commit
e143697
1 Parent(s): 3f53f30

Add 2233188

Browse files
Files changed (5) hide show
  1. README.md +49 -4
  2. all_results.json +5 -5
  3. pytorch_model.bin +1 -1
  4. train_results.json +5 -5
  5. trainer_state.json +588 -3
README.md CHANGED
@@ -11,9 +11,9 @@ should probably proofread and complete it, then remove this comment. -->
11
 
12
  # out
13
 
14
- This model is a fine-tuned version of [/1TB_SSD/SB_AI/out_orig2](https://huggingface.co//1TB_SSD/SB_AI/out_orig2) on an unknown dataset.
15
  It achieves the following results on the evaluation set:
16
- - Loss: 0.0619
17
 
18
  ## Model description
19
 
@@ -35,10 +35,10 @@ The following hyperparameters were used during training:
35
  - learning_rate: 5e-05
36
  - train_batch_size: 1
37
  - eval_batch_size: 1
38
- - seed: 3784447887
39
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
40
  - lr_scheduler_type: linear
41
- - num_epochs: 1
42
 
43
  ### Training results
44
 
@@ -58,6 +58,51 @@ The following hyperparameters were used during training:
58
  | 0.059 | 0.81 | 900000 | 0.0652 |
59
  | 0.0666 | 0.87 | 975000 | 0.0619 |
60
  | 0.0624 | 0.94 | 1050000 | 0.0619 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
 
63
  ### Framework versions
11
 
12
  # out
13
 
14
+ This model is a fine-tuned version of [/1TB_SSD/SB_AI/out_epoch1/out/checkpoint-1115000/](https://huggingface.co//1TB_SSD/SB_AI/out_epoch1/out/checkpoint-1115000/) on an unknown dataset.
15
  It achieves the following results on the evaluation set:
16
+ - Loss: 0.0645
17
 
18
  ## Model description
19
 
35
  - learning_rate: 5e-05
36
  - train_batch_size: 1
37
  - eval_batch_size: 1
38
+ - seed: 2518227880
39
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
40
  - lr_scheduler_type: linear
41
+ - num_epochs: 2.0
42
 
43
  ### Training results
44
 
58
  | 0.059 | 0.81 | 900000 | 0.0652 |
59
  | 0.0666 | 0.87 | 975000 | 0.0619 |
60
  | 0.0624 | 0.94 | 1050000 | 0.0619 |
61
+ | 0.0625 | 1.01 | 1125000 | 0.0667 |
62
+ | 0.0614 | 1.03 | 1150000 | 0.0658 |
63
+ | 0.0597 | 1.05 | 1175000 | 0.0683 |
64
+ | 0.0629 | 1.07 | 1200000 | 0.0691 |
65
+ | 0.0603 | 1.1 | 1225000 | 0.0678 |
66
+ | 0.0601 | 1.12 | 1250000 | 0.0746 |
67
+ | 0.0606 | 1.14 | 1275000 | 0.0691 |
68
+ | 0.0671 | 1.16 | 1300000 | 0.0702 |
69
+ | 0.0625 | 1.19 | 1325000 | 0.0661 |
70
+ | 0.0617 | 1.21 | 1350000 | 0.0688 |
71
+ | 0.0579 | 1.23 | 1375000 | 0.0679 |
72
+ | 0.0663 | 1.25 | 1400000 | 0.0634 |
73
+ | 0.0583 | 1.28 | 1425000 | 0.0638 |
74
+ | 0.0623 | 1.3 | 1450000 | 0.0681 |
75
+ | 0.0615 | 1.32 | 1475000 | 0.0670 |
76
+ | 0.0592 | 1.34 | 1500000 | 0.0666 |
77
+ | 0.0626 | 1.37 | 1525000 | 0.0666 |
78
+ | 0.063 | 1.39 | 1550000 | 0.0647 |
79
+ | 0.0648 | 1.41 | 1575000 | 0.0653 |
80
+ | 0.0611 | 1.43 | 1600000 | 0.0700 |
81
+ | 0.0622 | 1.46 | 1625000 | 0.0634 |
82
+ | 0.0617 | 1.48 | 1650000 | 0.0651 |
83
+ | 0.0613 | 1.5 | 1675000 | 0.0634 |
84
+ | 0.0639 | 1.52 | 1700000 | 0.0661 |
85
+ | 0.0615 | 1.54 | 1725000 | 0.0644 |
86
+ | 0.0605 | 1.57 | 1750000 | 0.0662 |
87
+ | 0.0622 | 1.59 | 1775000 | 0.0656 |
88
+ | 0.0585 | 1.61 | 1800000 | 0.0633 |
89
+ | 0.0628 | 1.63 | 1825000 | 0.0625 |
90
+ | 0.0638 | 1.66 | 1850000 | 0.0662 |
91
+ | 0.0599 | 1.68 | 1875000 | 0.0664 |
92
+ | 0.0583 | 1.7 | 1900000 | 0.0668 |
93
+ | 0.0543 | 1.72 | 1925000 | 0.0631 |
94
+ | 0.06 | 1.75 | 1950000 | 0.0629 |
95
+ | 0.0615 | 1.77 | 1975000 | 0.0644 |
96
+ | 0.0587 | 1.79 | 2000000 | 0.0663 |
97
+ | 0.0647 | 1.81 | 2025000 | 0.0654 |
98
+ | 0.0604 | 1.84 | 2050000 | 0.0639 |
99
+ | 0.0641 | 1.86 | 2075000 | 0.0636 |
100
+ | 0.0604 | 1.88 | 2100000 | 0.0636 |
101
+ | 0.0654 | 1.9 | 2125000 | 0.0652 |
102
+ | 0.0588 | 1.93 | 2150000 | 0.0638 |
103
+ | 0.0616 | 1.95 | 2175000 | 0.0657 |
104
+ | 0.0598 | 1.97 | 2200000 | 0.0646 |
105
+ | 0.0633 | 1.99 | 2225000 | 0.0645 |
106
 
107
 
108
  ### Framework versions
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 1.0,
3
- "train_loss": 0.0695511792498969,
4
- "train_runtime": 176193.0564,
5
  "train_samples": 1116594,
6
- "train_samples_per_second": 6.337,
7
- "train_steps_per_second": 6.337
8
  }
1
  {
2
+ "epoch": 2.0,
3
+ "train_loss": 0.030777697094727392,
4
+ "train_runtime": 228453.6417,
5
  "train_samples": 1116594,
6
+ "train_samples_per_second": 9.775,
7
+ "train_steps_per_second": 9.775
8
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:afdfb877d569756c5d3e589de624b065735445d4431398f5ec538b4f3ee17e99
3
  size 891703231
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d0ec9b60b45e3f2d6bbd55005877c63954154cb113d9575bd472932dc918566
3
  size 891703231
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 1.0,
3
- "train_loss": 0.0695511792498969,
4
- "train_runtime": 176193.0564,
5
  "train_samples": 1116594,
6
- "train_samples_per_second": 6.337,
7
- "train_steps_per_second": 6.337
8
  }
1
  {
2
+ "epoch": 2.0,
3
+ "train_loss": 0.030777697094727392,
4
+ "train_runtime": 228453.6417,
5
  "train_samples": 1116594,
6
+ "train_samples_per_second": 9.775,
7
+ "train_steps_per_second": 9.775
8
  }
trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.656824235129331,
5
- "global_step": 1850000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -2578,11 +2578,596 @@
2578
  "eval_samples_per_second": 36.912,
2579
  "eval_steps_per_second": 36.912,
2580
  "step": 1850000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2581
  }
2582
  ],
2583
  "max_steps": 2233188,
2584
  "num_train_epochs": 2,
2585
- "total_flos": 5.556070184057856e+17,
2586
  "trial_name": null,
2587
  "trial_params": null
2588
  }
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.0,
5
+ "global_step": 2233188,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
2578
  "eval_samples_per_second": 36.912,
2579
  "eval_steps_per_second": 36.912,
2580
  "step": 1850000
2581
+ },
2582
+ {
2583
+ "epoch": 1.66,
2584
+ "learning_rate": 8.467446538312046e-06,
2585
+ "loss": 0.0601,
2586
+ "step": 1855000
2587
+ },
2588
+ {
2589
+ "epoch": 1.67,
2590
+ "learning_rate": 8.355498954857361e-06,
2591
+ "loss": 0.065,
2592
+ "step": 1860000
2593
+ },
2594
+ {
2595
+ "epoch": 1.67,
2596
+ "learning_rate": 8.243551371402678e-06,
2597
+ "loss": 0.059,
2598
+ "step": 1865000
2599
+ },
2600
+ {
2601
+ "epoch": 1.67,
2602
+ "learning_rate": 8.131603787947991e-06,
2603
+ "loss": 0.064,
2604
+ "step": 1870000
2605
+ },
2606
+ {
2607
+ "epoch": 1.68,
2608
+ "learning_rate": 8.019656204493308e-06,
2609
+ "loss": 0.0599,
2610
+ "step": 1875000
2611
+ },
2612
+ {
2613
+ "epoch": 1.68,
2614
+ "eval_loss": 0.06644026190042496,
2615
+ "eval_runtime": 1678.6507,
2616
+ "eval_samples_per_second": 36.954,
2617
+ "eval_steps_per_second": 36.954,
2618
+ "step": 1875000
2619
+ },
2620
+ {
2621
+ "epoch": 1.68,
2622
+ "learning_rate": 7.907708621038623e-06,
2623
+ "loss": 0.0639,
2624
+ "step": 1880000
2625
+ },
2626
+ {
2627
+ "epoch": 1.69,
2628
+ "learning_rate": 7.795761037583939e-06,
2629
+ "loss": 0.0631,
2630
+ "step": 1885000
2631
+ },
2632
+ {
2633
+ "epoch": 1.69,
2634
+ "learning_rate": 7.683813454129254e-06,
2635
+ "loss": 0.064,
2636
+ "step": 1890000
2637
+ },
2638
+ {
2639
+ "epoch": 1.7,
2640
+ "learning_rate": 7.57186587067457e-06,
2641
+ "loss": 0.062,
2642
+ "step": 1895000
2643
+ },
2644
+ {
2645
+ "epoch": 1.7,
2646
+ "learning_rate": 7.459918287219884e-06,
2647
+ "loss": 0.0583,
2648
+ "step": 1900000
2649
+ },
2650
+ {
2651
+ "epoch": 1.7,
2652
+ "eval_loss": 0.0668446272611618,
2653
+ "eval_runtime": 1681.2378,
2654
+ "eval_samples_per_second": 36.897,
2655
+ "eval_steps_per_second": 36.897,
2656
+ "step": 1900000
2657
+ },
2658
+ {
2659
+ "epoch": 1.71,
2660
+ "learning_rate": 7.3479707037652e-06,
2661
+ "loss": 0.0599,
2662
+ "step": 1905000
2663
+ },
2664
+ {
2665
+ "epoch": 1.71,
2666
+ "learning_rate": 7.236023120310516e-06,
2667
+ "loss": 0.0611,
2668
+ "step": 1910000
2669
+ },
2670
+ {
2671
+ "epoch": 1.72,
2672
+ "learning_rate": 7.124075536855831e-06,
2673
+ "loss": 0.0627,
2674
+ "step": 1915000
2675
+ },
2676
+ {
2677
+ "epoch": 1.72,
2678
+ "learning_rate": 7.012127953401147e-06,
2679
+ "loss": 0.0591,
2680
+ "step": 1920000
2681
+ },
2682
+ {
2683
+ "epoch": 1.72,
2684
+ "learning_rate": 6.900180369946463e-06,
2685
+ "loss": 0.0543,
2686
+ "step": 1925000
2687
+ },
2688
+ {
2689
+ "epoch": 1.72,
2690
+ "eval_loss": 0.06305810809135437,
2691
+ "eval_runtime": 1683.6555,
2692
+ "eval_samples_per_second": 36.844,
2693
+ "eval_steps_per_second": 36.844,
2694
+ "step": 1925000
2695
+ },
2696
+ {
2697
+ "epoch": 1.73,
2698
+ "learning_rate": 6.788232786491779e-06,
2699
+ "loss": 0.0622,
2700
+ "step": 1930000
2701
+ },
2702
+ {
2703
+ "epoch": 1.73,
2704
+ "learning_rate": 6.676285203037093e-06,
2705
+ "loss": 0.0605,
2706
+ "step": 1935000
2707
+ },
2708
+ {
2709
+ "epoch": 1.74,
2710
+ "learning_rate": 6.564337619582408e-06,
2711
+ "loss": 0.0622,
2712
+ "step": 1940000
2713
+ },
2714
+ {
2715
+ "epoch": 1.74,
2716
+ "learning_rate": 6.452390036127724e-06,
2717
+ "loss": 0.0596,
2718
+ "step": 1945000
2719
+ },
2720
+ {
2721
+ "epoch": 1.75,
2722
+ "learning_rate": 6.34044245267304e-06,
2723
+ "loss": 0.06,
2724
+ "step": 1950000
2725
+ },
2726
+ {
2727
+ "epoch": 1.75,
2728
+ "eval_loss": 0.06286083161830902,
2729
+ "eval_runtime": 1684.2136,
2730
+ "eval_samples_per_second": 36.832,
2731
+ "eval_steps_per_second": 36.832,
2732
+ "step": 1950000
2733
+ },
2734
+ {
2735
+ "epoch": 1.75,
2736
+ "learning_rate": 6.2284948692183554e-06,
2737
+ "loss": 0.058,
2738
+ "step": 1955000
2739
+ },
2740
+ {
2741
+ "epoch": 1.76,
2742
+ "learning_rate": 6.116547285763671e-06,
2743
+ "loss": 0.0594,
2744
+ "step": 1960000
2745
+ },
2746
+ {
2747
+ "epoch": 1.76,
2748
+ "learning_rate": 6.004599702308987e-06,
2749
+ "loss": 0.0679,
2750
+ "step": 1965000
2751
+ },
2752
+ {
2753
+ "epoch": 1.76,
2754
+ "learning_rate": 5.892652118854302e-06,
2755
+ "loss": 0.0621,
2756
+ "step": 1970000
2757
+ },
2758
+ {
2759
+ "epoch": 1.77,
2760
+ "learning_rate": 5.780704535399617e-06,
2761
+ "loss": 0.0615,
2762
+ "step": 1975000
2763
+ },
2764
+ {
2765
+ "epoch": 1.77,
2766
+ "eval_loss": 0.06435712426900864,
2767
+ "eval_runtime": 1684.3427,
2768
+ "eval_samples_per_second": 36.829,
2769
+ "eval_steps_per_second": 36.829,
2770
+ "step": 1975000
2771
+ },
2772
+ {
2773
+ "epoch": 1.77,
2774
+ "learning_rate": 5.668756951944933e-06,
2775
+ "loss": 0.0602,
2776
+ "step": 1980000
2777
+ },
2778
+ {
2779
+ "epoch": 1.78,
2780
+ "learning_rate": 5.556809368490248e-06,
2781
+ "loss": 0.0589,
2782
+ "step": 1985000
2783
+ },
2784
+ {
2785
+ "epoch": 1.78,
2786
+ "learning_rate": 5.444861785035564e-06,
2787
+ "loss": 0.0578,
2788
+ "step": 1990000
2789
+ },
2790
+ {
2791
+ "epoch": 1.79,
2792
+ "learning_rate": 5.332914201580879e-06,
2793
+ "loss": 0.0626,
2794
+ "step": 1995000
2795
+ },
2796
+ {
2797
+ "epoch": 1.79,
2798
+ "learning_rate": 5.220966618126195e-06,
2799
+ "loss": 0.0587,
2800
+ "step": 2000000
2801
+ },
2802
+ {
2803
+ "epoch": 1.79,
2804
+ "eval_loss": 0.06628864258527756,
2805
+ "eval_runtime": 1682.6389,
2806
+ "eval_samples_per_second": 36.866,
2807
+ "eval_steps_per_second": 36.866,
2808
+ "step": 2000000
2809
+ },
2810
+ {
2811
+ "epoch": 1.8,
2812
+ "learning_rate": 5.109019034671511e-06,
2813
+ "loss": 0.0595,
2814
+ "step": 2005000
2815
+ },
2816
+ {
2817
+ "epoch": 1.8,
2818
+ "learning_rate": 4.997071451216825e-06,
2819
+ "loss": 0.0584,
2820
+ "step": 2010000
2821
+ },
2822
+ {
2823
+ "epoch": 1.8,
2824
+ "learning_rate": 4.885123867762141e-06,
2825
+ "loss": 0.0562,
2826
+ "step": 2015000
2827
+ },
2828
+ {
2829
+ "epoch": 1.81,
2830
+ "learning_rate": 4.773176284307457e-06,
2831
+ "loss": 0.0694,
2832
+ "step": 2020000
2833
+ },
2834
+ {
2835
+ "epoch": 1.81,
2836
+ "learning_rate": 4.661228700852772e-06,
2837
+ "loss": 0.0647,
2838
+ "step": 2025000
2839
+ },
2840
+ {
2841
+ "epoch": 1.81,
2842
+ "eval_loss": 0.06544966250658035,
2843
+ "eval_runtime": 1682.8498,
2844
+ "eval_samples_per_second": 36.862,
2845
+ "eval_steps_per_second": 36.862,
2846
+ "step": 2025000
2847
+ },
2848
+ {
2849
+ "epoch": 1.82,
2850
+ "learning_rate": 4.5492811173980875e-06,
2851
+ "loss": 0.0596,
2852
+ "step": 2030000
2853
+ },
2854
+ {
2855
+ "epoch": 1.82,
2856
+ "learning_rate": 4.4373335339434035e-06,
2857
+ "loss": 0.062,
2858
+ "step": 2035000
2859
+ },
2860
+ {
2861
+ "epoch": 1.83,
2862
+ "learning_rate": 4.325385950488719e-06,
2863
+ "loss": 0.0656,
2864
+ "step": 2040000
2865
+ },
2866
+ {
2867
+ "epoch": 1.83,
2868
+ "learning_rate": 4.213438367034034e-06,
2869
+ "loss": 0.0632,
2870
+ "step": 2045000
2871
+ },
2872
+ {
2873
+ "epoch": 1.84,
2874
+ "learning_rate": 4.101490783579349e-06,
2875
+ "loss": 0.0604,
2876
+ "step": 2050000
2877
+ },
2878
+ {
2879
+ "epoch": 1.84,
2880
+ "eval_loss": 0.06386958807706833,
2881
+ "eval_runtime": 1684.8253,
2882
+ "eval_samples_per_second": 36.819,
2883
+ "eval_steps_per_second": 36.819,
2884
+ "step": 2050000
2885
+ },
2886
+ {
2887
+ "epoch": 1.84,
2888
+ "learning_rate": 3.989543200124665e-06,
2889
+ "loss": 0.0586,
2890
+ "step": 2055000
2891
+ },
2892
+ {
2893
+ "epoch": 1.84,
2894
+ "learning_rate": 3.87759561666998e-06,
2895
+ "loss": 0.0595,
2896
+ "step": 2060000
2897
+ },
2898
+ {
2899
+ "epoch": 1.85,
2900
+ "learning_rate": 3.765648033215296e-06,
2901
+ "loss": 0.0628,
2902
+ "step": 2065000
2903
+ },
2904
+ {
2905
+ "epoch": 1.85,
2906
+ "learning_rate": 3.6537004497606115e-06,
2907
+ "loss": 0.0614,
2908
+ "step": 2070000
2909
+ },
2910
+ {
2911
+ "epoch": 1.86,
2912
+ "learning_rate": 3.541752866305927e-06,
2913
+ "loss": 0.0641,
2914
+ "step": 2075000
2915
+ },
2916
+ {
2917
+ "epoch": 1.86,
2918
+ "eval_loss": 0.06358933448791504,
2919
+ "eval_runtime": 1682.4468,
2920
+ "eval_samples_per_second": 36.871,
2921
+ "eval_steps_per_second": 36.871,
2922
+ "step": 2075000
2923
+ },
2924
+ {
2925
+ "epoch": 1.86,
2926
+ "learning_rate": 3.4298052828512423e-06,
2927
+ "loss": 0.0612,
2928
+ "step": 2080000
2929
+ },
2930
+ {
2931
+ "epoch": 1.87,
2932
+ "learning_rate": 3.317857699396558e-06,
2933
+ "loss": 0.0675,
2934
+ "step": 2085000
2935
+ },
2936
+ {
2937
+ "epoch": 1.87,
2938
+ "learning_rate": 3.2059101159418736e-06,
2939
+ "loss": 0.0657,
2940
+ "step": 2090000
2941
+ },
2942
+ {
2943
+ "epoch": 1.88,
2944
+ "learning_rate": 3.0939625324871888e-06,
2945
+ "loss": 0.0628,
2946
+ "step": 2095000
2947
+ },
2948
+ {
2949
+ "epoch": 1.88,
2950
+ "learning_rate": 2.9820149490325044e-06,
2951
+ "loss": 0.0604,
2952
+ "step": 2100000
2953
+ },
2954
+ {
2955
+ "epoch": 1.88,
2956
+ "eval_loss": 0.0636412724852562,
2957
+ "eval_runtime": 1687.8932,
2958
+ "eval_samples_per_second": 36.752,
2959
+ "eval_steps_per_second": 36.752,
2960
+ "step": 2100000
2961
+ },
2962
+ {
2963
+ "epoch": 1.89,
2964
+ "learning_rate": 2.87006736557782e-06,
2965
+ "loss": 0.0592,
2966
+ "step": 2105000
2967
+ },
2968
+ {
2969
+ "epoch": 1.89,
2970
+ "learning_rate": 2.758119782123135e-06,
2971
+ "loss": 0.0619,
2972
+ "step": 2110000
2973
+ },
2974
+ {
2975
+ "epoch": 1.89,
2976
+ "learning_rate": 2.6461721986684508e-06,
2977
+ "loss": 0.0623,
2978
+ "step": 2115000
2979
+ },
2980
+ {
2981
+ "epoch": 1.9,
2982
+ "learning_rate": 2.5342246152137664e-06,
2983
+ "loss": 0.0568,
2984
+ "step": 2120000
2985
+ },
2986
+ {
2987
+ "epoch": 1.9,
2988
+ "learning_rate": 2.4222770317590816e-06,
2989
+ "loss": 0.0654,
2990
+ "step": 2125000
2991
+ },
2992
+ {
2993
+ "epoch": 1.9,
2994
+ "eval_loss": 0.06523974984884262,
2995
+ "eval_runtime": 1681.6119,
2996
+ "eval_samples_per_second": 36.889,
2997
+ "eval_steps_per_second": 36.889,
2998
+ "step": 2125000
2999
+ },
3000
+ {
3001
+ "epoch": 1.91,
3002
+ "learning_rate": 2.310329448304397e-06,
3003
+ "loss": 0.0597,
3004
+ "step": 2130000
3005
+ },
3006
+ {
3007
+ "epoch": 1.91,
3008
+ "learning_rate": 2.198381864849713e-06,
3009
+ "loss": 0.0614,
3010
+ "step": 2135000
3011
+ },
3012
+ {
3013
+ "epoch": 1.92,
3014
+ "learning_rate": 2.086434281395028e-06,
3015
+ "loss": 0.0576,
3016
+ "step": 2140000
3017
+ },
3018
+ {
3019
+ "epoch": 1.92,
3020
+ "learning_rate": 1.9744866979403436e-06,
3021
+ "loss": 0.0625,
3022
+ "step": 2145000
3023
+ },
3024
+ {
3025
+ "epoch": 1.93,
3026
+ "learning_rate": 1.862539114485659e-06,
3027
+ "loss": 0.0588,
3028
+ "step": 2150000
3029
+ },
3030
+ {
3031
+ "epoch": 1.93,
3032
+ "eval_loss": 0.06383997201919556,
3033
+ "eval_runtime": 1685.504,
3034
+ "eval_samples_per_second": 36.804,
3035
+ "eval_steps_per_second": 36.804,
3036
+ "step": 2150000
3037
+ },
3038
+ {
3039
+ "epoch": 1.93,
3040
+ "learning_rate": 1.7505915310309748e-06,
3041
+ "loss": 0.0525,
3042
+ "step": 2155000
3043
+ },
3044
+ {
3045
+ "epoch": 1.93,
3046
+ "learning_rate": 1.63864394757629e-06,
3047
+ "loss": 0.0573,
3048
+ "step": 2160000
3049
+ },
3050
+ {
3051
+ "epoch": 1.94,
3052
+ "learning_rate": 1.5266963641216056e-06,
3053
+ "loss": 0.0612,
3054
+ "step": 2165000
3055
+ },
3056
+ {
3057
+ "epoch": 1.94,
3058
+ "learning_rate": 1.4147487806669212e-06,
3059
+ "loss": 0.0625,
3060
+ "step": 2170000
3061
+ },
3062
+ {
3063
+ "epoch": 1.95,
3064
+ "learning_rate": 1.3028011972122364e-06,
3065
+ "loss": 0.0616,
3066
+ "step": 2175000
3067
+ },
3068
+ {
3069
+ "epoch": 1.95,
3070
+ "eval_loss": 0.06566296517848969,
3071
+ "eval_runtime": 1686.3029,
3072
+ "eval_samples_per_second": 36.786,
3073
+ "eval_steps_per_second": 36.786,
3074
+ "step": 2175000
3075
+ },
3076
+ {
3077
+ "epoch": 1.95,
3078
+ "learning_rate": 1.190853613757552e-06,
3079
+ "loss": 0.0629,
3080
+ "step": 2180000
3081
+ },
3082
+ {
3083
+ "epoch": 1.96,
3084
+ "learning_rate": 1.0789060303028674e-06,
3085
+ "loss": 0.0543,
3086
+ "step": 2185000
3087
+ },
3088
+ {
3089
+ "epoch": 1.96,
3090
+ "learning_rate": 9.66958446848183e-07,
3091
+ "loss": 0.0584,
3092
+ "step": 2190000
3093
+ },
3094
+ {
3095
+ "epoch": 1.97,
3096
+ "learning_rate": 8.550108633934986e-07,
3097
+ "loss": 0.0595,
3098
+ "step": 2195000
3099
+ },
3100
+ {
3101
+ "epoch": 1.97,
3102
+ "learning_rate": 7.43063279938814e-07,
3103
+ "loss": 0.0598,
3104
+ "step": 2200000
3105
+ },
3106
+ {
3107
+ "epoch": 1.97,
3108
+ "eval_loss": 0.06455818563699722,
3109
+ "eval_runtime": 1682.3023,
3110
+ "eval_samples_per_second": 36.874,
3111
+ "eval_steps_per_second": 36.874,
3112
+ "step": 2200000
3113
+ },
3114
+ {
3115
+ "epoch": 1.97,
3116
+ "learning_rate": 6.311156964841294e-07,
3117
+ "loss": 0.0595,
3118
+ "step": 2205000
3119
+ },
3120
+ {
3121
+ "epoch": 1.98,
3122
+ "learning_rate": 5.191681130294449e-07,
3123
+ "loss": 0.0602,
3124
+ "step": 2210000
3125
+ },
3126
+ {
3127
+ "epoch": 1.98,
3128
+ "learning_rate": 4.0722052957476043e-07,
3129
+ "loss": 0.0601,
3130
+ "step": 2215000
3131
+ },
3132
+ {
3133
+ "epoch": 1.99,
3134
+ "learning_rate": 2.952729461200759e-07,
3135
+ "loss": 0.0595,
3136
+ "step": 2220000
3137
+ },
3138
+ {
3139
+ "epoch": 1.99,
3140
+ "learning_rate": 1.8332536266539136e-07,
3141
+ "loss": 0.0633,
3142
+ "step": 2225000
3143
+ },
3144
+ {
3145
+ "epoch": 1.99,
3146
+ "eval_loss": 0.06449371576309204,
3147
+ "eval_runtime": 1682.7954,
3148
+ "eval_samples_per_second": 36.863,
3149
+ "eval_steps_per_second": 36.863,
3150
+ "step": 2225000
3151
+ },
3152
+ {
3153
+ "epoch": 2.0,
3154
+ "learning_rate": 7.137777921070686e-08,
3155
+ "loss": 0.0603,
3156
+ "step": 2230000
3157
+ },
3158
+ {
3159
+ "epoch": 2.0,
3160
+ "step": 2233188,
3161
+ "total_flos": 6.706494317713766e+17,
3162
+ "train_loss": 0.030777697094727392,
3163
+ "train_runtime": 228453.6417,
3164
+ "train_samples_per_second": 9.775,
3165
+ "train_steps_per_second": 9.775
3166
  }
3167
  ],
3168
  "max_steps": 2233188,
3169
  "num_train_epochs": 2,
3170
+ "total_flos": 6.706494317713766e+17,
3171
  "trial_name": null,
3172
  "trial_params": null
3173
  }