sylyas commited on
Commit
7441f8a
·
verified ·
1 Parent(s): 2bc75fd

Training in progress, step 475, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ddcea3f015ba7baf7240a381987d1fea89ddbc3bfa6cd80ad532930250eb92a
3
  size 167832240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c444a5739e2bec3b1e3f04bd30a01356e1edcb8c01b2ba0dd74328accd2825b2
3
  size 167832240
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6606e17bd4f0435631121771e7600dbc4c37c6c76273295ec21f9a17bd27b35f
3
  size 85723732
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08d25728ac5fe47a0ad27b3da864cab99af91f3548996d69185914cd67c672bb
3
  size 85723732
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b732712c22d6c942fca8e85d6d6b8d91964b43f4fe22cb00333cc39c1c2eda24
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1126b5614f51e5fa36121a9290d6b01e210532249374ad3c426067b6e3d80cfd
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02847c1628ecbc90ee51fc4d3fb5a61ddbee1e60d453008afab16af26f807227
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d3ab8b8dc5babc32b4adc3c596b50dd0fcac27b238d3838d86c3c68054c541d
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.751578947368421,
5
  "eval_steps": 119,
6
- "global_step": 357,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2538,6 +2538,832 @@
2538
  "eval_samples_per_second": 4.073,
2539
  "eval_steps_per_second": 4.073,
2540
  "step": 357
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2541
  }
2542
  ],
2543
  "logging_steps": 1,
@@ -2552,12 +3378,12 @@
2552
  "should_evaluate": false,
2553
  "should_log": false,
2554
  "should_save": true,
2555
- "should_training_stop": false
2556
  },
2557
  "attributes": {}
2558
  }
2559
  },
2560
- "total_flos": 3.3245823894552576e+16,
2561
  "train_batch_size": 1,
2562
  "trial_name": null,
2563
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 119,
6
+ "global_step": 475,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2538
  "eval_samples_per_second": 4.073,
2539
  "eval_steps_per_second": 4.073,
2540
  "step": 357
2541
+ },
2542
+ {
2543
+ "epoch": 0.7536842105263157,
2544
+ "grad_norm": 0.8696005940437317,
2545
+ "learning_rate": 2.964852511800519e-05,
2546
+ "loss": 1.0904,
2547
+ "step": 358
2548
+ },
2549
+ {
2550
+ "epoch": 0.7557894736842106,
2551
+ "grad_norm": 0.7804014086723328,
2552
+ "learning_rate": 2.9169990463359555e-05,
2553
+ "loss": 1.0226,
2554
+ "step": 359
2555
+ },
2556
+ {
2557
+ "epoch": 0.7578947368421053,
2558
+ "grad_norm": 0.8228606581687927,
2559
+ "learning_rate": 2.869468883687798e-05,
2560
+ "loss": 1.3508,
2561
+ "step": 360
2562
+ },
2563
+ {
2564
+ "epoch": 0.76,
2565
+ "grad_norm": 0.6658822298049927,
2566
+ "learning_rate": 2.8222641933652117e-05,
2567
+ "loss": 1.1766,
2568
+ "step": 361
2569
+ },
2570
+ {
2571
+ "epoch": 0.7621052631578947,
2572
+ "grad_norm": 0.7910031676292419,
2573
+ "learning_rate": 2.7753871300212142e-05,
2574
+ "loss": 0.7955,
2575
+ "step": 362
2576
+ },
2577
+ {
2578
+ "epoch": 0.7642105263157895,
2579
+ "grad_norm": 0.7152170538902283,
2580
+ "learning_rate": 2.7288398333543064e-05,
2581
+ "loss": 0.9675,
2582
+ "step": 363
2583
+ },
2584
+ {
2585
+ "epoch": 0.7663157894736842,
2586
+ "grad_norm": 0.5529299974441528,
2587
+ "learning_rate": 2.6826244280108437e-05,
2588
+ "loss": 0.9956,
2589
+ "step": 364
2590
+ },
2591
+ {
2592
+ "epoch": 0.7684210526315789,
2593
+ "grad_norm": 0.7928904891014099,
2594
+ "learning_rate": 2.6367430234880284e-05,
2595
+ "loss": 0.963,
2596
+ "step": 365
2597
+ },
2598
+ {
2599
+ "epoch": 0.7705263157894737,
2600
+ "grad_norm": 0.8560205101966858,
2601
+ "learning_rate": 2.591197714037631e-05,
2602
+ "loss": 1.164,
2603
+ "step": 366
2604
+ },
2605
+ {
2606
+ "epoch": 0.7726315789473684,
2607
+ "grad_norm": 0.5956188440322876,
2608
+ "learning_rate": 2.5459905785704042e-05,
2609
+ "loss": 1.1437,
2610
+ "step": 367
2611
+ },
2612
+ {
2613
+ "epoch": 0.7747368421052632,
2614
+ "grad_norm": 1.018763542175293,
2615
+ "learning_rate": 2.5011236805611814e-05,
2616
+ "loss": 1.1257,
2617
+ "step": 368
2618
+ },
2619
+ {
2620
+ "epoch": 0.7768421052631579,
2621
+ "grad_norm": 2.8923377990722656,
2622
+ "learning_rate": 2.4565990679546914e-05,
2623
+ "loss": 0.8183,
2624
+ "step": 369
2625
+ },
2626
+ {
2627
+ "epoch": 0.7789473684210526,
2628
+ "grad_norm": 0.7362539172172546,
2629
+ "learning_rate": 2.4124187730720917e-05,
2630
+ "loss": 1.2692,
2631
+ "step": 370
2632
+ },
2633
+ {
2634
+ "epoch": 0.7810526315789473,
2635
+ "grad_norm": 0.9826866388320923,
2636
+ "learning_rate": 2.368584812518184e-05,
2637
+ "loss": 0.8437,
2638
+ "step": 371
2639
+ },
2640
+ {
2641
+ "epoch": 0.783157894736842,
2642
+ "grad_norm": 0.8725941181182861,
2643
+ "learning_rate": 2.3250991870893835e-05,
2644
+ "loss": 1.0641,
2645
+ "step": 372
2646
+ },
2647
+ {
2648
+ "epoch": 0.7852631578947369,
2649
+ "grad_norm": 2.081977605819702,
2650
+ "learning_rate": 2.2819638816823797e-05,
2651
+ "loss": 1.1523,
2652
+ "step": 373
2653
+ },
2654
+ {
2655
+ "epoch": 0.7873684210526316,
2656
+ "grad_norm": 2.948880434036255,
2657
+ "learning_rate": 2.2391808652035517e-05,
2658
+ "loss": 0.6608,
2659
+ "step": 374
2660
+ },
2661
+ {
2662
+ "epoch": 0.7894736842105263,
2663
+ "grad_norm": 0.845608651638031,
2664
+ "learning_rate": 2.1967520904790827e-05,
2665
+ "loss": 1.0555,
2666
+ "step": 375
2667
+ },
2668
+ {
2669
+ "epoch": 0.791578947368421,
2670
+ "grad_norm": 0.850184440612793,
2671
+ "learning_rate": 2.154679494165829e-05,
2672
+ "loss": 1.1188,
2673
+ "step": 376
2674
+ },
2675
+ {
2676
+ "epoch": 0.7936842105263158,
2677
+ "grad_norm": 0.8228728175163269,
2678
+ "learning_rate": 2.1129649966629184e-05,
2679
+ "loss": 0.9344,
2680
+ "step": 377
2681
+ },
2682
+ {
2683
+ "epoch": 0.7957894736842105,
2684
+ "grad_norm": 0.9157503843307495,
2685
+ "learning_rate": 2.0716105020241072e-05,
2686
+ "loss": 0.7635,
2687
+ "step": 378
2688
+ },
2689
+ {
2690
+ "epoch": 0.7978947368421052,
2691
+ "grad_norm": 1.4381229877471924,
2692
+ "learning_rate": 2.0306178978708514e-05,
2693
+ "loss": 1.258,
2694
+ "step": 379
2695
+ },
2696
+ {
2697
+ "epoch": 0.8,
2698
+ "grad_norm": 0.6052026748657227,
2699
+ "learning_rate": 1.9899890553061562e-05,
2700
+ "loss": 1.1365,
2701
+ "step": 380
2702
+ },
2703
+ {
2704
+ "epoch": 0.8021052631578948,
2705
+ "grad_norm": 0.8866299390792847,
2706
+ "learning_rate": 1.9497258288291654e-05,
2707
+ "loss": 1.0385,
2708
+ "step": 381
2709
+ },
2710
+ {
2711
+ "epoch": 0.8042105263157895,
2712
+ "grad_norm": 0.743588387966156,
2713
+ "learning_rate": 1.9098300562505266e-05,
2714
+ "loss": 1.0008,
2715
+ "step": 382
2716
+ },
2717
+ {
2718
+ "epoch": 0.8063157894736842,
2719
+ "grad_norm": 0.797770082950592,
2720
+ "learning_rate": 1.8703035586084816e-05,
2721
+ "loss": 0.82,
2722
+ "step": 383
2723
+ },
2724
+ {
2725
+ "epoch": 0.8084210526315789,
2726
+ "grad_norm": 1.2986137866973877,
2727
+ "learning_rate": 1.831148140085762e-05,
2728
+ "loss": 0.9923,
2729
+ "step": 384
2730
+ },
2731
+ {
2732
+ "epoch": 0.8105263157894737,
2733
+ "grad_norm": 1.0393884181976318,
2734
+ "learning_rate": 1.7923655879272393e-05,
2735
+ "loss": 1.4782,
2736
+ "step": 385
2737
+ },
2738
+ {
2739
+ "epoch": 0.8126315789473684,
2740
+ "grad_norm": 0.961280882358551,
2741
+ "learning_rate": 1.753957672358324e-05,
2742
+ "loss": 0.9412,
2743
+ "step": 386
2744
+ },
2745
+ {
2746
+ "epoch": 0.8147368421052632,
2747
+ "grad_norm": 0.5964862108230591,
2748
+ "learning_rate": 1.7159261465041952e-05,
2749
+ "loss": 0.9657,
2750
+ "step": 387
2751
+ },
2752
+ {
2753
+ "epoch": 0.8168421052631579,
2754
+ "grad_norm": 1.1498229503631592,
2755
+ "learning_rate": 1.6782727463097624e-05,
2756
+ "loss": 0.8533,
2757
+ "step": 388
2758
+ },
2759
+ {
2760
+ "epoch": 0.8189473684210526,
2761
+ "grad_norm": 1.617012619972229,
2762
+ "learning_rate": 1.6409991904604173e-05,
2763
+ "loss": 0.6844,
2764
+ "step": 389
2765
+ },
2766
+ {
2767
+ "epoch": 0.8210526315789474,
2768
+ "grad_norm": 0.642407238483429,
2769
+ "learning_rate": 1.60410718030361e-05,
2770
+ "loss": 0.7084,
2771
+ "step": 390
2772
+ },
2773
+ {
2774
+ "epoch": 0.8231578947368421,
2775
+ "grad_norm": 0.6749714016914368,
2776
+ "learning_rate": 1.5675983997711795e-05,
2777
+ "loss": 1.1988,
2778
+ "step": 391
2779
+ },
2780
+ {
2781
+ "epoch": 0.8252631578947368,
2782
+ "grad_norm": 1.0748071670532227,
2783
+ "learning_rate": 1.5314745153024766e-05,
2784
+ "loss": 0.856,
2785
+ "step": 392
2786
+ },
2787
+ {
2788
+ "epoch": 0.8273684210526315,
2789
+ "grad_norm": 1.0286914110183716,
2790
+ "learning_rate": 1.495737175768326e-05,
2791
+ "loss": 1.1481,
2792
+ "step": 393
2793
+ },
2794
+ {
2795
+ "epoch": 0.8294736842105264,
2796
+ "grad_norm": 0.8149781227111816,
2797
+ "learning_rate": 1.4603880123957447e-05,
2798
+ "loss": 0.7169,
2799
+ "step": 394
2800
+ },
2801
+ {
2802
+ "epoch": 0.8315789473684211,
2803
+ "grad_norm": 1.1764047145843506,
2804
+ "learning_rate": 1.425428638693489e-05,
2805
+ "loss": 1.014,
2806
+ "step": 395
2807
+ },
2808
+ {
2809
+ "epoch": 0.8336842105263158,
2810
+ "grad_norm": 1.1029857397079468,
2811
+ "learning_rate": 1.3908606503784139e-05,
2812
+ "loss": 1.2927,
2813
+ "step": 396
2814
+ },
2815
+ {
2816
+ "epoch": 0.8357894736842105,
2817
+ "grad_norm": 1.063981056213379,
2818
+ "learning_rate": 1.356685625302625e-05,
2819
+ "loss": 0.5832,
2820
+ "step": 397
2821
+ },
2822
+ {
2823
+ "epoch": 0.8378947368421052,
2824
+ "grad_norm": 0.8528422117233276,
2825
+ "learning_rate": 1.3229051233814637e-05,
2826
+ "loss": 0.8959,
2827
+ "step": 398
2828
+ },
2829
+ {
2830
+ "epoch": 0.84,
2831
+ "grad_norm": 0.8783009052276611,
2832
+ "learning_rate": 1.2895206865223064e-05,
2833
+ "loss": 0.8633,
2834
+ "step": 399
2835
+ },
2836
+ {
2837
+ "epoch": 0.8421052631578947,
2838
+ "grad_norm": 1.0344438552856445,
2839
+ "learning_rate": 1.2565338385541792e-05,
2840
+ "loss": 1.7439,
2841
+ "step": 400
2842
+ },
2843
+ {
2844
+ "epoch": 0.8442105263157895,
2845
+ "grad_norm": 0.9089331030845642,
2846
+ "learning_rate": 1.2239460851582118e-05,
2847
+ "loss": 0.9778,
2848
+ "step": 401
2849
+ },
2850
+ {
2851
+ "epoch": 0.8463157894736842,
2852
+ "grad_norm": 1.5010464191436768,
2853
+ "learning_rate": 1.1917589137989005e-05,
2854
+ "loss": 0.8401,
2855
+ "step": 402
2856
+ },
2857
+ {
2858
+ "epoch": 0.848421052631579,
2859
+ "grad_norm": 1.1659886837005615,
2860
+ "learning_rate": 1.1599737936562149e-05,
2861
+ "loss": 1.1214,
2862
+ "step": 403
2863
+ },
2864
+ {
2865
+ "epoch": 0.8505263157894737,
2866
+ "grad_norm": 0.781868040561676,
2867
+ "learning_rate": 1.1285921755585504e-05,
2868
+ "loss": 1.1831,
2869
+ "step": 404
2870
+ },
2871
+ {
2872
+ "epoch": 0.8526315789473684,
2873
+ "grad_norm": 0.8510306477546692,
2874
+ "learning_rate": 1.097615491916485e-05,
2875
+ "loss": 1.0715,
2876
+ "step": 405
2877
+ },
2878
+ {
2879
+ "epoch": 0.8547368421052631,
2880
+ "grad_norm": 0.7362077236175537,
2881
+ "learning_rate": 1.0670451566574102e-05,
2882
+ "loss": 0.9367,
2883
+ "step": 406
2884
+ },
2885
+ {
2886
+ "epoch": 0.8568421052631578,
2887
+ "grad_norm": 1.3995620012283325,
2888
+ "learning_rate": 1.0368825651609893e-05,
2889
+ "loss": 0.8952,
2890
+ "step": 407
2891
+ },
2892
+ {
2893
+ "epoch": 0.8589473684210527,
2894
+ "grad_norm": 0.8937683701515198,
2895
+ "learning_rate": 1.007129094195468e-05,
2896
+ "loss": 1.2489,
2897
+ "step": 408
2898
+ },
2899
+ {
2900
+ "epoch": 0.8610526315789474,
2901
+ "grad_norm": 2.044285774230957,
2902
+ "learning_rate": 9.777861018548251e-06,
2903
+ "loss": 1.658,
2904
+ "step": 409
2905
+ },
2906
+ {
2907
+ "epoch": 0.8631578947368421,
2908
+ "grad_norm": 1.3045108318328857,
2909
+ "learning_rate": 9.488549274967872e-06,
2910
+ "loss": 0.8359,
2911
+ "step": 410
2912
+ },
2913
+ {
2914
+ "epoch": 0.8652631578947368,
2915
+ "grad_norm": 0.6791781783103943,
2916
+ "learning_rate": 9.203368916817012e-06,
2917
+ "loss": 1.0212,
2918
+ "step": 411
2919
+ },
2920
+ {
2921
+ "epoch": 0.8673684210526316,
2922
+ "grad_norm": 1.7533940076828003,
2923
+ "learning_rate": 8.92233296112236e-06,
2924
+ "loss": 0.7901,
2925
+ "step": 412
2926
+ },
2927
+ {
2928
+ "epoch": 0.8694736842105263,
2929
+ "grad_norm": 0.5513384938240051,
2930
+ "learning_rate": 8.645454235739903e-06,
2931
+ "loss": 0.8964,
2932
+ "step": 413
2933
+ },
2934
+ {
2935
+ "epoch": 0.871578947368421,
2936
+ "grad_norm": 1.2993557453155518,
2937
+ "learning_rate": 8.372745378769309e-06,
2938
+ "loss": 0.9348,
2939
+ "step": 414
2940
+ },
2941
+ {
2942
+ "epoch": 0.8736842105263158,
2943
+ "grad_norm": 1.2195082902908325,
2944
+ "learning_rate": 8.10421883797694e-06,
2945
+ "loss": 1.2758,
2946
+ "step": 415
2947
+ },
2948
+ {
2949
+ "epoch": 0.8757894736842106,
2950
+ "grad_norm": 1.2428478002548218,
2951
+ "learning_rate": 7.839886870227909e-06,
2952
+ "loss": 0.9534,
2953
+ "step": 416
2954
+ },
2955
+ {
2956
+ "epoch": 0.8778947368421053,
2957
+ "grad_norm": 0.5317150950431824,
2958
+ "learning_rate": 7.5797615409264335e-06,
2959
+ "loss": 0.7835,
2960
+ "step": 417
2961
+ },
2962
+ {
2963
+ "epoch": 0.88,
2964
+ "grad_norm": 0.7354329228401184,
2965
+ "learning_rate": 7.32385472346514e-06,
2966
+ "loss": 1.1074,
2967
+ "step": 418
2968
+ },
2969
+ {
2970
+ "epoch": 0.8821052631578947,
2971
+ "grad_norm": 1.009934663772583,
2972
+ "learning_rate": 7.072178098683246e-06,
2973
+ "loss": 1.253,
2974
+ "step": 419
2975
+ },
2976
+ {
2977
+ "epoch": 0.8842105263157894,
2978
+ "grad_norm": 0.8174380660057068,
2979
+ "learning_rate": 6.824743154333157e-06,
2980
+ "loss": 1.0671,
2981
+ "step": 420
2982
+ },
2983
+ {
2984
+ "epoch": 0.8863157894736842,
2985
+ "grad_norm": 0.777045726776123,
2986
+ "learning_rate": 6.581561184556295e-06,
2987
+ "loss": 1.0806,
2988
+ "step": 421
2989
+ },
2990
+ {
2991
+ "epoch": 0.888421052631579,
2992
+ "grad_norm": 1.013598918914795,
2993
+ "learning_rate": 6.342643289367522e-06,
2994
+ "loss": 1.1415,
2995
+ "step": 422
2996
+ },
2997
+ {
2998
+ "epoch": 0.8905263157894737,
2999
+ "grad_norm": 0.9286550283432007,
3000
+ "learning_rate": 6.108000374148448e-06,
3001
+ "loss": 0.9374,
3002
+ "step": 423
3003
+ },
3004
+ {
3005
+ "epoch": 0.8926315789473684,
3006
+ "grad_norm": 0.8254793286323547,
3007
+ "learning_rate": 5.87764314914967e-06,
3008
+ "loss": 0.8427,
3009
+ "step": 424
3010
+ },
3011
+ {
3012
+ "epoch": 0.8947368421052632,
3013
+ "grad_norm": 1.3116326332092285,
3014
+ "learning_rate": 5.651582129001986e-06,
3015
+ "loss": 1.2204,
3016
+ "step": 425
3017
+ },
3018
+ {
3019
+ "epoch": 0.8968421052631579,
3020
+ "grad_norm": 0.5832639932632446,
3021
+ "learning_rate": 5.429827632236284e-06,
3022
+ "loss": 0.7114,
3023
+ "step": 426
3024
+ },
3025
+ {
3026
+ "epoch": 0.8989473684210526,
3027
+ "grad_norm": 0.719308078289032,
3028
+ "learning_rate": 5.212389780812732e-06,
3029
+ "loss": 0.8413,
3030
+ "step": 427
3031
+ },
3032
+ {
3033
+ "epoch": 0.9010526315789473,
3034
+ "grad_norm": 0.8241797089576721,
3035
+ "learning_rate": 4.999278499658666e-06,
3036
+ "loss": 0.9538,
3037
+ "step": 428
3038
+ },
3039
+ {
3040
+ "epoch": 0.9031578947368422,
3041
+ "grad_norm": 0.68552565574646,
3042
+ "learning_rate": 4.790503516215572e-06,
3043
+ "loss": 1.0962,
3044
+ "step": 429
3045
+ },
3046
+ {
3047
+ "epoch": 0.9052631578947369,
3048
+ "grad_norm": 1.2444804906845093,
3049
+ "learning_rate": 4.586074359995119e-06,
3050
+ "loss": 1.2615,
3051
+ "step": 430
3052
+ },
3053
+ {
3054
+ "epoch": 0.9073684210526316,
3055
+ "grad_norm": 1.2105733156204224,
3056
+ "learning_rate": 4.386000362144138e-06,
3057
+ "loss": 1.1771,
3058
+ "step": 431
3059
+ },
3060
+ {
3061
+ "epoch": 0.9094736842105263,
3062
+ "grad_norm": 0.7618857026100159,
3063
+ "learning_rate": 4.190290655018736e-06,
3064
+ "loss": 1.0337,
3065
+ "step": 432
3066
+ },
3067
+ {
3068
+ "epoch": 0.911578947368421,
3069
+ "grad_norm": 0.6650370955467224,
3070
+ "learning_rate": 3.998954171767422e-06,
3071
+ "loss": 1.0167,
3072
+ "step": 433
3073
+ },
3074
+ {
3075
+ "epoch": 0.9136842105263158,
3076
+ "grad_norm": 0.8902232050895691,
3077
+ "learning_rate": 3.811999645923414e-06,
3078
+ "loss": 1.0959,
3079
+ "step": 434
3080
+ },
3081
+ {
3082
+ "epoch": 0.9157894736842105,
3083
+ "grad_norm": 0.8037033081054688,
3084
+ "learning_rate": 3.6294356110059157e-06,
3085
+ "loss": 1.0283,
3086
+ "step": 435
3087
+ },
3088
+ {
3089
+ "epoch": 0.9178947368421052,
3090
+ "grad_norm": 0.7740287780761719,
3091
+ "learning_rate": 3.451270400130646e-06,
3092
+ "loss": 0.9952,
3093
+ "step": 436
3094
+ },
3095
+ {
3096
+ "epoch": 0.92,
3097
+ "grad_norm": 0.6354364156723022,
3098
+ "learning_rate": 3.277512145629502e-06,
3099
+ "loss": 0.7446,
3100
+ "step": 437
3101
+ },
3102
+ {
3103
+ "epoch": 0.9221052631578948,
3104
+ "grad_norm": 1.0572397708892822,
3105
+ "learning_rate": 3.10816877867931e-06,
3106
+ "loss": 1.0779,
3107
+ "step": 438
3108
+ },
3109
+ {
3110
+ "epoch": 0.9242105263157895,
3111
+ "grad_norm": 1.2464516162872314,
3112
+ "learning_rate": 2.943248028939838e-06,
3113
+ "loss": 1.1318,
3114
+ "step": 439
3115
+ },
3116
+ {
3117
+ "epoch": 0.9263157894736842,
3118
+ "grad_norm": 1.017857313156128,
3119
+ "learning_rate": 2.7827574242009437e-06,
3120
+ "loss": 0.7569,
3121
+ "step": 440
3122
+ },
3123
+ {
3124
+ "epoch": 0.9284210526315789,
3125
+ "grad_norm": 0.8179978132247925,
3126
+ "learning_rate": 2.626704290039017e-06,
3127
+ "loss": 0.9221,
3128
+ "step": 441
3129
+ },
3130
+ {
3131
+ "epoch": 0.9305263157894736,
3132
+ "grad_norm": 0.7406687140464783,
3133
+ "learning_rate": 2.4750957494826033e-06,
3134
+ "loss": 0.9411,
3135
+ "step": 442
3136
+ },
3137
+ {
3138
+ "epoch": 0.9326315789473684,
3139
+ "grad_norm": 0.7985221147537231,
3140
+ "learning_rate": 2.327938722687184e-06,
3141
+ "loss": 0.5579,
3142
+ "step": 443
3143
+ },
3144
+ {
3145
+ "epoch": 0.9347368421052632,
3146
+ "grad_norm": 0.8860235214233398,
3147
+ "learning_rate": 2.1852399266194314e-06,
3148
+ "loss": 1.2353,
3149
+ "step": 444
3150
+ },
3151
+ {
3152
+ "epoch": 0.9368421052631579,
3153
+ "grad_norm": 0.8313080072402954,
3154
+ "learning_rate": 2.0470058747505516e-06,
3155
+ "loss": 0.9344,
3156
+ "step": 445
3157
+ },
3158
+ {
3159
+ "epoch": 0.9389473684210526,
3160
+ "grad_norm": 3.333054780960083,
3161
+ "learning_rate": 1.9132428767589473e-06,
3162
+ "loss": 1.4123,
3163
+ "step": 446
3164
+ },
3165
+ {
3166
+ "epoch": 0.9410526315789474,
3167
+ "grad_norm": 1.0423784255981445,
3168
+ "learning_rate": 1.7839570382422787e-06,
3169
+ "loss": 1.0753,
3170
+ "step": 447
3171
+ },
3172
+ {
3173
+ "epoch": 0.9431578947368421,
3174
+ "grad_norm": 0.7692586183547974,
3175
+ "learning_rate": 1.6591542604387445e-06,
3176
+ "loss": 0.669,
3177
+ "step": 448
3178
+ },
3179
+ {
3180
+ "epoch": 0.9452631578947368,
3181
+ "grad_norm": 0.7186653017997742,
3182
+ "learning_rate": 1.538840239957684e-06,
3183
+ "loss": 1.2113,
3184
+ "step": 449
3185
+ },
3186
+ {
3187
+ "epoch": 0.9473684210526315,
3188
+ "grad_norm": 0.7315964698791504,
3189
+ "learning_rate": 1.4230204685196203e-06,
3190
+ "loss": 1.2322,
3191
+ "step": 450
3192
+ },
3193
+ {
3194
+ "epoch": 0.9494736842105264,
3195
+ "grad_norm": 1.0726860761642456,
3196
+ "learning_rate": 1.3117002327055927e-06,
3197
+ "loss": 0.9252,
3198
+ "step": 451
3199
+ },
3200
+ {
3201
+ "epoch": 0.9515789473684211,
3202
+ "grad_norm": 0.6930716037750244,
3203
+ "learning_rate": 1.20488461371574e-06,
3204
+ "loss": 0.9903,
3205
+ "step": 452
3206
+ },
3207
+ {
3208
+ "epoch": 0.9536842105263158,
3209
+ "grad_norm": 1.1421600580215454,
3210
+ "learning_rate": 1.102578487137529e-06,
3211
+ "loss": 1.0869,
3212
+ "step": 453
3213
+ },
3214
+ {
3215
+ "epoch": 0.9557894736842105,
3216
+ "grad_norm": 0.6389675736427307,
3217
+ "learning_rate": 1.004786522723089e-06,
3218
+ "loss": 0.7617,
3219
+ "step": 454
3220
+ },
3221
+ {
3222
+ "epoch": 0.9578947368421052,
3223
+ "grad_norm": 1.449903964996338,
3224
+ "learning_rate": 9.11513184176116e-07,
3225
+ "loss": 1.0412,
3226
+ "step": 455
3227
+ },
3228
+ {
3229
+ "epoch": 0.96,
3230
+ "grad_norm": 0.7298595905303955,
3231
+ "learning_rate": 8.227627289481121e-07,
3232
+ "loss": 0.9267,
3233
+ "step": 456
3234
+ },
3235
+ {
3236
+ "epoch": 0.9621052631578947,
3237
+ "grad_norm": 0.988619327545166,
3238
+ "learning_rate": 7.385392080440534e-07,
3239
+ "loss": 0.9785,
3240
+ "step": 457
3241
+ },
3242
+ {
3243
+ "epoch": 0.9642105263157895,
3244
+ "grad_norm": 1.0173723697662354,
3245
+ "learning_rate": 6.588464658374815e-07,
3246
+ "loss": 1.4313,
3247
+ "step": 458
3248
+ },
3249
+ {
3250
+ "epoch": 0.9663157894736842,
3251
+ "grad_norm": 0.8676223158836365,
3252
+ "learning_rate": 5.836881398950667e-07,
3253
+ "loss": 1.1253,
3254
+ "step": 459
3255
+ },
3256
+ {
3257
+ "epoch": 0.968421052631579,
3258
+ "grad_norm": 2.1744537353515625,
3259
+ "learning_rate": 5.130676608104845e-07,
3260
+ "loss": 0.9635,
3261
+ "step": 460
3262
+ },
3263
+ {
3264
+ "epoch": 0.9705263157894737,
3265
+ "grad_norm": 0.9125698804855347,
3266
+ "learning_rate": 4.469882520479196e-07,
3267
+ "loss": 1.2048,
3268
+ "step": 461
3269
+ },
3270
+ {
3271
+ "epoch": 0.9726315789473684,
3272
+ "grad_norm": 0.8515905141830444,
3273
+ "learning_rate": 3.8545292979486057e-07,
3274
+ "loss": 0.9358,
3275
+ "step": 462
3276
+ },
3277
+ {
3278
+ "epoch": 0.9747368421052631,
3279
+ "grad_norm": 0.8601663708686829,
3280
+ "learning_rate": 3.2846450282447703e-07,
3281
+ "loss": 1.0253,
3282
+ "step": 463
3283
+ },
3284
+ {
3285
+ "epoch": 0.9768421052631578,
3286
+ "grad_norm": 0.8213205337524414,
3287
+ "learning_rate": 2.760255723673888e-07,
3288
+ "loss": 0.8577,
3289
+ "step": 464
3290
+ },
3291
+ {
3292
+ "epoch": 0.9789473684210527,
3293
+ "grad_norm": 0.7541871666908264,
3294
+ "learning_rate": 2.2813853199292746e-07,
3295
+ "loss": 0.9517,
3296
+ "step": 465
3297
+ },
3298
+ {
3299
+ "epoch": 0.9810526315789474,
3300
+ "grad_norm": 0.7226133346557617,
3301
+ "learning_rate": 1.8480556749991274e-07,
3302
+ "loss": 1.0434,
3303
+ "step": 466
3304
+ },
3305
+ {
3306
+ "epoch": 0.9831578947368421,
3307
+ "grad_norm": 1.1320992708206177,
3308
+ "learning_rate": 1.460286568168212e-07,
3309
+ "loss": 0.7791,
3310
+ "step": 467
3311
+ },
3312
+ {
3313
+ "epoch": 0.9852631578947368,
3314
+ "grad_norm": 0.9081979990005493,
3315
+ "learning_rate": 1.1180956991160286e-07,
3316
+ "loss": 0.3705,
3317
+ "step": 468
3318
+ },
3319
+ {
3320
+ "epoch": 0.9873684210526316,
3321
+ "grad_norm": 2.7810940742492676,
3322
+ "learning_rate": 8.214986871076802e-08,
3323
+ "loss": 1.2752,
3324
+ "step": 469
3325
+ },
3326
+ {
3327
+ "epoch": 0.9894736842105263,
3328
+ "grad_norm": 0.7669661045074463,
3329
+ "learning_rate": 5.705090702819993e-08,
3330
+ "loss": 0.7446,
3331
+ "step": 470
3332
+ },
3333
+ {
3334
+ "epoch": 0.991578947368421,
3335
+ "grad_norm": 1.0128413438796997,
3336
+ "learning_rate": 3.6513830503293045e-08,
3337
+ "loss": 1.2359,
3338
+ "step": 471
3339
+ },
3340
+ {
3341
+ "epoch": 0.9936842105263158,
3342
+ "grad_norm": 1.728697657585144,
3343
+ "learning_rate": 2.0539576548717076e-08,
3344
+ "loss": 0.7628,
3345
+ "step": 472
3346
+ },
3347
+ {
3348
+ "epoch": 0.9957894736842106,
3349
+ "grad_norm": 0.7068576812744141,
3350
+ "learning_rate": 9.128874307551272e-09,
3351
+ "loss": 1.063,
3352
+ "step": 473
3353
+ },
3354
+ {
3355
+ "epoch": 0.9978947368421053,
3356
+ "grad_norm": 0.7579674124717712,
3357
+ "learning_rate": 2.282244620088747e-09,
3358
+ "loss": 0.8835,
3359
+ "step": 474
3360
+ },
3361
+ {
3362
+ "epoch": 1.0,
3363
+ "grad_norm": 0.7143834829330444,
3364
+ "learning_rate": 0.0,
3365
+ "loss": 1.1322,
3366
+ "step": 475
3367
  }
3368
  ],
3369
  "logging_steps": 1,
 
3378
  "should_evaluate": false,
3379
  "should_log": false,
3380
  "should_save": true,
3381
+ "should_training_stop": true
3382
  },
3383
  "attributes": {}
3384
  }
3385
  },
3386
+ "total_flos": 4.425821325990298e+16,
3387
  "train_batch_size": 1,
3388
  "trial_name": null,
3389
  "trial_params": null