Bingsu commited on
Commit
d268d93
1 Parent(s): 1910f3a

Training in progress, step 90000

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:61757e92d02b06dda1da003da57fa0b18bc1cc2b413fc514841b017d0d63c3c8
3
  size 100172997
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0403e77fedd175fe2813435246e47b4db08141719006e4af642e66c252088876
3
  size 100172997
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:81a90871ae24751a566fb99821bee5e29d062c303c164fcd6aeac08948cab240
3
  size 146774203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:459351fdd706b427a4771aa3d8515e577348d2de84329d8049ae17fe23916e8b
3
  size 146774203
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7978600af4170dc4592efcab1d33d1582d45b26dc998a10a280a81e23e422deb
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70d5cceb52d0fa5a00f924e679cb68057477b161920a598903be579e5d6a58aa
3
  size 14503
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a7331ea7e49edb5d8c1485934eca953ca913987924fdd220c26d2fc895357dc9
3
  size 246899880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2086d296d1d3514176646dc6eef7a73ed351fdafb4d4a71a834e5c0056ed735
3
  size 246899880
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.3437902879243661,
5
- "global_step": 80000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -2406,11 +2406,311 @@
2406
  "learning_rate": 0.0005701789946806666,
2407
  "loss": 3.1995,
2408
  "step": 80000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2409
  }
2410
  ],
2411
  "max_steps": 500000,
2412
  "num_train_epochs": 3,
2413
- "total_flos": 1.2750639857664e+17,
2414
  "trial_name": null,
2415
  "trial_params": null
2416
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.3867640739149119,
5
+ "global_step": 90000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
2406
  "learning_rate": 0.0005701789946806666,
2407
  "loss": 3.1995,
2408
  "step": 80000
2409
+ },
2410
+ {
2411
+ "epoch": 0.34,
2412
+ "learning_rate": 0.0005721781645068867,
2413
+ "loss": 3.2116,
2414
+ "step": 80200
2415
+ },
2416
+ {
2417
+ "epoch": 0.35,
2418
+ "learning_rate": 0.000574176418806075,
2419
+ "loss": 3.2256,
2420
+ "step": 80400
2421
+ },
2422
+ {
2423
+ "epoch": 0.35,
2424
+ "learning_rate": 0.0005761737225165182,
2425
+ "loss": 3.2221,
2426
+ "step": 80600
2427
+ },
2428
+ {
2429
+ "epoch": 0.35,
2430
+ "learning_rate": 0.0005781700405931827,
2431
+ "loss": 3.1956,
2432
+ "step": 80800
2433
+ },
2434
+ {
2435
+ "epoch": 0.35,
2436
+ "learning_rate": 0.0005801653380083288,
2437
+ "loss": 3.2031,
2438
+ "step": 81000
2439
+ },
2440
+ {
2441
+ "epoch": 0.35,
2442
+ "learning_rate": 0.0005821595797521253,
2443
+ "loss": 3.2029,
2444
+ "step": 81200
2445
+ },
2446
+ {
2447
+ "epoch": 0.35,
2448
+ "learning_rate": 0.0005841527308332645,
2449
+ "loss": 3.2065,
2450
+ "step": 81400
2451
+ },
2452
+ {
2453
+ "epoch": 0.35,
2454
+ "learning_rate": 0.0005861447562795751,
2455
+ "loss": 3.1783,
2456
+ "step": 81600
2457
+ },
2458
+ {
2459
+ "epoch": 0.35,
2460
+ "learning_rate": 0.0005881356211386371,
2461
+ "loss": 3.2181,
2462
+ "step": 81800
2463
+ },
2464
+ {
2465
+ "epoch": 0.35,
2466
+ "learning_rate": 0.0005901252904783932,
2467
+ "loss": 3.1991,
2468
+ "step": 82000
2469
+ },
2470
+ {
2471
+ "epoch": 0.35,
2472
+ "learning_rate": 0.0005921137293877644,
2473
+ "loss": 3.2011,
2474
+ "step": 82200
2475
+ },
2476
+ {
2477
+ "epoch": 0.35,
2478
+ "learning_rate": 0.0005941009029772594,
2479
+ "loss": 3.1852,
2480
+ "step": 82400
2481
+ },
2482
+ {
2483
+ "epoch": 0.35,
2484
+ "learning_rate": 0.00059608677637959,
2485
+ "loss": 3.1911,
2486
+ "step": 82600
2487
+ },
2488
+ {
2489
+ "epoch": 0.36,
2490
+ "learning_rate": 0.00059807131475028,
2491
+ "loss": 3.1999,
2492
+ "step": 82800
2493
+ },
2494
+ {
2495
+ "epoch": 0.36,
2496
+ "learning_rate": 0.000600054483268279,
2497
+ "loss": 3.1853,
2498
+ "step": 83000
2499
+ },
2500
+ {
2501
+ "epoch": 0.36,
2502
+ "learning_rate": 0.0006020362471365711,
2503
+ "loss": 3.1928,
2504
+ "step": 83200
2505
+ },
2506
+ {
2507
+ "epoch": 0.36,
2508
+ "learning_rate": 0.0006040165715827878,
2509
+ "loss": 3.1833,
2510
+ "step": 83400
2511
+ },
2512
+ {
2513
+ "epoch": 0.36,
2514
+ "learning_rate": 0.0006059954218598161,
2515
+ "loss": 3.1972,
2516
+ "step": 83600
2517
+ },
2518
+ {
2519
+ "epoch": 0.36,
2520
+ "learning_rate": 0.0006079727632464092,
2521
+ "loss": 3.2052,
2522
+ "step": 83800
2523
+ },
2524
+ {
2525
+ "epoch": 0.36,
2526
+ "learning_rate": 0.000609948561047796,
2527
+ "loss": 3.1881,
2528
+ "step": 84000
2529
+ },
2530
+ {
2531
+ "epoch": 0.36,
2532
+ "learning_rate": 0.0006119227805962891,
2533
+ "loss": 3.174,
2534
+ "step": 84200
2535
+ },
2536
+ {
2537
+ "epoch": 0.36,
2538
+ "learning_rate": 0.0006138953872518932,
2539
+ "loss": 3.1831,
2540
+ "step": 84400
2541
+ },
2542
+ {
2543
+ "epoch": 0.36,
2544
+ "learning_rate": 0.0006158663464029133,
2545
+ "loss": 3.1961,
2546
+ "step": 84600
2547
+ },
2548
+ {
2549
+ "epoch": 0.36,
2550
+ "learning_rate": 0.0006178356234665618,
2551
+ "loss": 3.1759,
2552
+ "step": 84800
2553
+ },
2554
+ {
2555
+ "epoch": 0.37,
2556
+ "learning_rate": 0.0006198031838895652,
2557
+ "loss": 3.1728,
2558
+ "step": 85000
2559
+ },
2560
+ {
2561
+ "epoch": 0.37,
2562
+ "learning_rate": 0.0006217689931487707,
2563
+ "loss": 3.194,
2564
+ "step": 85200
2565
+ },
2566
+ {
2567
+ "epoch": 0.37,
2568
+ "learning_rate": 0.0006237330167517514,
2569
+ "loss": 3.1747,
2570
+ "step": 85400
2571
+ },
2572
+ {
2573
+ "epoch": 0.37,
2574
+ "learning_rate": 0.0006256952202374121,
2575
+ "loss": 3.1805,
2576
+ "step": 85600
2577
+ },
2578
+ {
2579
+ "epoch": 0.37,
2580
+ "learning_rate": 0.0006276555691765935,
2581
+ "loss": 3.1781,
2582
+ "step": 85800
2583
+ },
2584
+ {
2585
+ "epoch": 0.37,
2586
+ "learning_rate": 0.0006296140291726768,
2587
+ "loss": 3.1829,
2588
+ "step": 86000
2589
+ },
2590
+ {
2591
+ "epoch": 0.37,
2592
+ "learning_rate": 0.0006315705658621865,
2593
+ "loss": 3.1832,
2594
+ "step": 86200
2595
+ },
2596
+ {
2597
+ "epoch": 0.37,
2598
+ "learning_rate": 0.0006335251449153937,
2599
+ "loss": 3.1634,
2600
+ "step": 86400
2601
+ },
2602
+ {
2603
+ "epoch": 0.37,
2604
+ "learning_rate": 0.0006354777320369192,
2605
+ "loss": 3.1664,
2606
+ "step": 86600
2607
+ },
2608
+ {
2609
+ "epoch": 0.37,
2610
+ "learning_rate": 0.0006374282929663341,
2611
+ "loss": 3.1576,
2612
+ "step": 86800
2613
+ },
2614
+ {
2615
+ "epoch": 0.37,
2616
+ "learning_rate": 0.0006393767934787615,
2617
+ "loss": 3.1667,
2618
+ "step": 87000
2619
+ },
2620
+ {
2621
+ "epoch": 0.37,
2622
+ "learning_rate": 0.0006413231993854767,
2623
+ "loss": 3.1718,
2624
+ "step": 87200
2625
+ },
2626
+ {
2627
+ "epoch": 0.38,
2628
+ "learning_rate": 0.0006432674765345077,
2629
+ "loss": 3.1645,
2630
+ "step": 87400
2631
+ },
2632
+ {
2633
+ "epoch": 0.38,
2634
+ "learning_rate": 0.0006452095908112336,
2635
+ "loss": 3.2063,
2636
+ "step": 87600
2637
+ },
2638
+ {
2639
+ "epoch": 0.38,
2640
+ "learning_rate": 0.0006471495081389843,
2641
+ "loss": 3.2638,
2642
+ "step": 87800
2643
+ },
2644
+ {
2645
+ "epoch": 0.38,
2646
+ "learning_rate": 0.0006490871944796369,
2647
+ "loss": 3.2685,
2648
+ "step": 88000
2649
+ },
2650
+ {
2651
+ "epoch": 0.38,
2652
+ "learning_rate": 0.0006510226158342146,
2653
+ "loss": 3.2519,
2654
+ "step": 88200
2655
+ },
2656
+ {
2657
+ "epoch": 0.38,
2658
+ "learning_rate": 0.0006529557382434818,
2659
+ "loss": 3.2101,
2660
+ "step": 88400
2661
+ },
2662
+ {
2663
+ "epoch": 0.38,
2664
+ "learning_rate": 0.0006548865277885411,
2665
+ "loss": 3.2109,
2666
+ "step": 88600
2667
+ },
2668
+ {
2669
+ "epoch": 0.38,
2670
+ "learning_rate": 0.0006568149505914273,
2671
+ "loss": 3.2291,
2672
+ "step": 88800
2673
+ },
2674
+ {
2675
+ "epoch": 0.38,
2676
+ "learning_rate": 0.0006587409728157031,
2677
+ "loss": 3.2282,
2678
+ "step": 89000
2679
+ },
2680
+ {
2681
+ "epoch": 0.38,
2682
+ "learning_rate": 0.000660664560667052,
2683
+ "loss": 3.2404,
2684
+ "step": 89200
2685
+ },
2686
+ {
2687
+ "epoch": 0.38,
2688
+ "learning_rate": 0.0006625856803938711,
2689
+ "loss": 3.2334,
2690
+ "step": 89400
2691
+ },
2692
+ {
2693
+ "epoch": 0.39,
2694
+ "learning_rate": 0.000664504298287864,
2695
+ "loss": 3.243,
2696
+ "step": 89600
2697
+ },
2698
+ {
2699
+ "epoch": 0.39,
2700
+ "learning_rate": 0.0006664203806846319,
2701
+ "loss": 3.2475,
2702
+ "step": 89800
2703
+ },
2704
+ {
2705
+ "epoch": 0.39,
2706
+ "learning_rate": 0.0006683338939642638,
2707
+ "loss": 3.3556,
2708
+ "step": 90000
2709
  }
2710
  ],
2711
  "max_steps": 500000,
2712
  "num_train_epochs": 3,
2713
+ "total_flos": 1.4344469839872e+17,
2714
  "trial_name": null,
2715
  "trial_params": null
2716
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:81a90871ae24751a566fb99821bee5e29d062c303c164fcd6aeac08948cab240
3
  size 146774203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:459351fdd706b427a4771aa3d8515e577348d2de84329d8049ae17fe23916e8b
3
  size 146774203