mtzig commited on
Commit
362ae50
·
verified ·
1 Parent(s): 227aedf

Training in progress, step 348, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3720c9d7c3d147e3c6a4c31b748a401804a80a6f28683e7c9983f2e8c8a0f20a
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df206f3b879f704bd24d474b4494d4527c95a2627aeee5322496bb015dc2e1ed
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3638a66f29f136c13174bee0dd43e693f5fc102e10bee4ca9b5d7060756ced7c
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7cd57b99d8cb0b28e877ada22b15f71a91fbb719fd88618a8834bfdc37a18a9
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:59454dac3aeba9c46e0b8ed50eb871c1d98271ecbabd9754c8618cdc65584ad9
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bde4bd3a698c50b8bcc6d141f3a2dd9bc7cfa485ff7f655f8d648878900078a
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:93ca40f1a301b546ff3c8e51d511eb49571367df816e5ac6c43d69b073ba1e35
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3be417bebf8d699c301a1ed56e05bca215a475491b001651ee8b864cb7762cf1
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7e305eb64bbd004a3712d4d0f3b65560d9c0d8b3920c2789be35be33fef333cd
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee8f344278ba1c8e1ca3a2e31d078c62dfed2ae5e3dbe234706a67121c9e5394
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16b6721393030c99c98218e1bcc44fa93cc347e7c920295cebba18bf69ebf311
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2357908bc64768e845681e2bb008065d0f46e1ceec84ebf0b10e23d9502acec1
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f9c2af98820448e537c2aa09618f8c2299b2ed8c9040abdad7cc23d455398738
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e67ddb3e8ee36e3dbf20aa58a6346f6498d44ac1adff04b53091aca883c6f52
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fa8dfa614952af057b305d24646b204cdfa9b2fbf5610aa112de72f4d1903dd4
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40237d4b55e93987e3e60ec247c712ac33d89cd732d545fb30a27bb79745517d
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6185843c50764de20922699c89193c33e1e13037719a5d55479aa190e715e4fc
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:269eae03847773cb4bd93a4517a905853bc54a68269b815c31577215fdc55d2d
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:35e51ecf57078c2d652964726d8abc8157e10e9fdddf8cacb5700305b465147a
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d187c3af11ce3a881345c7efd9082ca0cb36a6949dfbe3343c996f42f2afc3e6
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ebe10fe55b3a58ae13fa7a58fca8f2486fa82c4aa360522ee9cde43cc43ba473
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:344fb25f2c262f28ced1e24ddcd208e4806d9788cdec5b2a7f2397013b68c284
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fcda73faaa8d5a9ab0a72d2fef1c1af0341c8e7f8ec0eede744acae39dd22f43
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:805247d87233bfa93147263830dc68344cf75f632c10a49edc8486147debfef7
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a3e47edb1b664bc04c493b0996774157c1ffdb9f0b12df515a0b32829d748704
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d015d34b7739a1124fe31968c605ff2948a6a921eb3374a8e282b0787486605
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8620689655172413,
5
  "eval_steps": 20,
6
- "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2299,6 +2299,366 @@
2299
  "eval_samples_per_second": 6.275,
2300
  "eval_steps_per_second": 0.237,
2301
  "step": 300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2302
  }
2303
  ],
2304
  "logging_steps": 1,
@@ -2313,12 +2673,12 @@
2313
  "should_evaluate": false,
2314
  "should_log": false,
2315
  "should_save": true,
2316
- "should_training_stop": false
2317
  },
2318
  "attributes": {}
2319
  }
2320
  },
2321
- "total_flos": 9.46781682579538e+16,
2322
  "train_batch_size": 8,
2323
  "trial_name": null,
2324
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 20,
6
+ "global_step": 348,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2299
  "eval_samples_per_second": 6.275,
2300
  "eval_steps_per_second": 0.237,
2301
  "step": 300
2302
+ },
2303
+ {
2304
+ "epoch": 0.8649425287356322,
2305
+ "grad_norm": 2.3745739459991455,
2306
+ "learning_rate": 1.0922142398186097e-06,
2307
+ "loss": 0.2735,
2308
+ "step": 301
2309
+ },
2310
+ {
2311
+ "epoch": 0.867816091954023,
2312
+ "grad_norm": 3.9790706634521484,
2313
+ "learning_rate": 1.0470516257959351e-06,
2314
+ "loss": 0.2966,
2315
+ "step": 302
2316
+ },
2317
+ {
2318
+ "epoch": 0.8706896551724138,
2319
+ "grad_norm": 2.3996686935424805,
2320
+ "learning_rate": 1.00279094316854e-06,
2321
+ "loss": 0.2725,
2322
+ "step": 303
2323
+ },
2324
+ {
2325
+ "epoch": 0.8735632183908046,
2326
+ "grad_norm": 3.1654207706451416,
2327
+ "learning_rate": 9.594366508138352e-07,
2328
+ "loss": 0.2983,
2329
+ "step": 304
2330
+ },
2331
+ {
2332
+ "epoch": 0.8764367816091954,
2333
+ "grad_norm": 4.094039440155029,
2334
+ "learning_rate": 9.169931162983137e-07,
2335
+ "loss": 0.2797,
2336
+ "step": 305
2337
+ },
2338
+ {
2339
+ "epoch": 0.8793103448275862,
2340
+ "grad_norm": 2.8442471027374268,
2341
+ "learning_rate": 8.754646154375801e-07,
2342
+ "loss": 0.2584,
2343
+ "step": 306
2344
+ },
2345
+ {
2346
+ "epoch": 0.882183908045977,
2347
+ "grad_norm": 3.6936562061309814,
2348
+ "learning_rate": 8.348553318655795e-07,
2349
+ "loss": 0.3164,
2350
+ "step": 307
2351
+ },
2352
+ {
2353
+ "epoch": 0.8850574712643678,
2354
+ "grad_norm": 2.954345703125,
2355
+ "learning_rate": 7.951693566131325e-07,
2356
+ "loss": 0.3144,
2357
+ "step": 308
2358
+ },
2359
+ {
2360
+ "epoch": 0.8879310344827587,
2361
+ "grad_norm": 2.597691774368286,
2362
+ "learning_rate": 7.564106876958188e-07,
2363
+ "loss": 0.2971,
2364
+ "step": 309
2365
+ },
2366
+ {
2367
+ "epoch": 0.8908045977011494,
2368
+ "grad_norm": 3.1714141368865967,
2369
+ "learning_rate": 7.185832297111939e-07,
2370
+ "loss": 0.391,
2371
+ "step": 310
2372
+ },
2373
+ {
2374
+ "epoch": 0.8936781609195402,
2375
+ "grad_norm": 2.410207748413086,
2376
+ "learning_rate": 6.816907934454353e-07,
2377
+ "loss": 0.2538,
2378
+ "step": 311
2379
+ },
2380
+ {
2381
+ "epoch": 0.896551724137931,
2382
+ "grad_norm": 2.8893043994903564,
2383
+ "learning_rate": 6.457370954894582e-07,
2384
+ "loss": 0.2312,
2385
+ "step": 312
2386
+ },
2387
+ {
2388
+ "epoch": 0.8994252873563219,
2389
+ "grad_norm": 2.990267276763916,
2390
+ "learning_rate": 6.107257578644721e-07,
2391
+ "loss": 0.27,
2392
+ "step": 313
2393
+ },
2394
+ {
2395
+ "epoch": 0.9022988505747126,
2396
+ "grad_norm": 2.6063694953918457,
2397
+ "learning_rate": 5.766603076571164e-07,
2398
+ "loss": 0.2675,
2399
+ "step": 314
2400
+ },
2401
+ {
2402
+ "epoch": 0.9051724137931034,
2403
+ "grad_norm": 2.8116941452026367,
2404
+ "learning_rate": 5.43544176664137e-07,
2405
+ "loss": 0.2846,
2406
+ "step": 315
2407
+ },
2408
+ {
2409
+ "epoch": 0.9080459770114943,
2410
+ "grad_norm": 2.8802504539489746,
2411
+ "learning_rate": 5.113807010466432e-07,
2412
+ "loss": 0.2898,
2413
+ "step": 316
2414
+ },
2415
+ {
2416
+ "epoch": 0.9109195402298851,
2417
+ "grad_norm": 3.1726322174072266,
2418
+ "learning_rate": 4.801731209940375e-07,
2419
+ "loss": 0.2796,
2420
+ "step": 317
2421
+ },
2422
+ {
2423
+ "epoch": 0.9137931034482759,
2424
+ "grad_norm": 3.0099661350250244,
2425
+ "learning_rate": 4.499245803975927e-07,
2426
+ "loss": 0.2649,
2427
+ "step": 318
2428
+ },
2429
+ {
2430
+ "epoch": 0.9166666666666666,
2431
+ "grad_norm": 3.7728664875030518,
2432
+ "learning_rate": 4.206381265337189e-07,
2433
+ "loss": 0.3021,
2434
+ "step": 319
2435
+ },
2436
+ {
2437
+ "epoch": 0.9195402298850575,
2438
+ "grad_norm": 3.430644989013672,
2439
+ "learning_rate": 3.9231670975699354e-07,
2440
+ "loss": 0.326,
2441
+ "step": 320
2442
+ },
2443
+ {
2444
+ "epoch": 0.9195402298850575,
2445
+ "eval_accuracy": 0.8589108910891089,
2446
+ "eval_f1": 0.7348837209302326,
2447
+ "eval_loss": 0.31510937213897705,
2448
+ "eval_precision": 0.7247706422018348,
2449
+ "eval_recall": 0.7452830188679245,
2450
+ "eval_runtime": 17.0364,
2451
+ "eval_samples_per_second": 6.222,
2452
+ "eval_steps_per_second": 0.235,
2453
+ "step": 320
2454
+ },
2455
+ {
2456
+ "epoch": 0.9224137931034483,
2457
+ "grad_norm": 4.415366172790527,
2458
+ "learning_rate": 3.649631832029288e-07,
2459
+ "loss": 0.3382,
2460
+ "step": 321
2461
+ },
2462
+ {
2463
+ "epoch": 0.9252873563218391,
2464
+ "grad_norm": 2.5626957416534424,
2465
+ "learning_rate": 3.385803025005463e-07,
2466
+ "loss": 0.2792,
2467
+ "step": 322
2468
+ },
2469
+ {
2470
+ "epoch": 0.9281609195402298,
2471
+ "grad_norm": 3.4501123428344727,
2472
+ "learning_rate": 3.1317072549477246e-07,
2473
+ "loss": 0.3482,
2474
+ "step": 323
2475
+ },
2476
+ {
2477
+ "epoch": 0.9310344827586207,
2478
+ "grad_norm": 2.9147861003875732,
2479
+ "learning_rate": 2.887370119786792e-07,
2480
+ "loss": 0.2824,
2481
+ "step": 324
2482
+ },
2483
+ {
2484
+ "epoch": 0.9339080459770115,
2485
+ "grad_norm": 3.037773847579956,
2486
+ "learning_rate": 2.6528162343561593e-07,
2487
+ "loss": 0.3257,
2488
+ "step": 325
2489
+ },
2490
+ {
2491
+ "epoch": 0.9367816091954023,
2492
+ "grad_norm": 3.1816771030426025,
2493
+ "learning_rate": 2.4280692279122554e-07,
2494
+ "loss": 0.2645,
2495
+ "step": 326
2496
+ },
2497
+ {
2498
+ "epoch": 0.9396551724137931,
2499
+ "grad_norm": 2.802854537963867,
2500
+ "learning_rate": 2.2131517417540937e-07,
2501
+ "loss": 0.2947,
2502
+ "step": 327
2503
+ },
2504
+ {
2505
+ "epoch": 0.9425287356321839,
2506
+ "grad_norm": 2.949431896209717,
2507
+ "learning_rate": 2.00808542694233e-07,
2508
+ "loss": 0.2907,
2509
+ "step": 328
2510
+ },
2511
+ {
2512
+ "epoch": 0.9454022988505747,
2513
+ "grad_norm": 2.371004581451416,
2514
+ "learning_rate": 1.8128909421180506e-07,
2515
+ "loss": 0.2558,
2516
+ "step": 329
2517
+ },
2518
+ {
2519
+ "epoch": 0.9482758620689655,
2520
+ "grad_norm": 2.651993989944458,
2521
+ "learning_rate": 1.6275879514217052e-07,
2522
+ "loss": 0.3132,
2523
+ "step": 330
2524
+ },
2525
+ {
2526
+ "epoch": 0.9511494252873564,
2527
+ "grad_norm": 3.5036203861236572,
2528
+ "learning_rate": 1.4521951225120345e-07,
2529
+ "loss": 0.3745,
2530
+ "step": 331
2531
+ },
2532
+ {
2533
+ "epoch": 0.9540229885057471,
2534
+ "grad_norm": 2.3972132205963135,
2535
+ "learning_rate": 1.2867301246854757e-07,
2536
+ "loss": 0.2746,
2537
+ "step": 332
2538
+ },
2539
+ {
2540
+ "epoch": 0.9568965517241379,
2541
+ "grad_norm": 2.4108810424804688,
2542
+ "learning_rate": 1.1312096270961525e-07,
2543
+ "loss": 0.2656,
2544
+ "step": 333
2545
+ },
2546
+ {
2547
+ "epoch": 0.9597701149425287,
2548
+ "grad_norm": 2.761547565460205,
2549
+ "learning_rate": 9.856492970766296e-08,
2550
+ "loss": 0.3047,
2551
+ "step": 334
2552
+ },
2553
+ {
2554
+ "epoch": 0.9626436781609196,
2555
+ "grad_norm": 2.521554470062256,
2556
+ "learning_rate": 8.50063798559475e-08,
2557
+ "loss": 0.2753,
2558
+ "step": 335
2559
+ },
2560
+ {
2561
+ "epoch": 0.9655172413793104,
2562
+ "grad_norm": 3.3763246536254883,
2563
+ "learning_rate": 7.244667906001202e-08,
2564
+ "loss": 0.2594,
2565
+ "step": 336
2566
+ },
2567
+ {
2568
+ "epoch": 0.9683908045977011,
2569
+ "grad_norm": 3.34621000289917,
2570
+ "learning_rate": 6.088709260007153e-08,
2571
+ "loss": 0.2837,
2572
+ "step": 337
2573
+ },
2574
+ {
2575
+ "epoch": 0.9712643678160919,
2576
+ "grad_norm": 2.6902542114257812,
2577
+ "learning_rate": 5.032878500355498e-08,
2578
+ "loss": 0.2849,
2579
+ "step": 338
2580
+ },
2581
+ {
2582
+ "epoch": 0.9741379310344828,
2583
+ "grad_norm": 4.887283802032471,
2584
+ "learning_rate": 4.07728199277857e-08,
2585
+ "loss": 0.3055,
2586
+ "step": 339
2587
+ },
2588
+ {
2589
+ "epoch": 0.9770114942528736,
2590
+ "grad_norm": 4.528458118438721,
2591
+ "learning_rate": 3.2220160052828245e-08,
2592
+ "loss": 0.3475,
2593
+ "step": 340
2594
+ },
2595
+ {
2596
+ "epoch": 0.9770114942528736,
2597
+ "eval_accuracy": 0.8638613861386139,
2598
+ "eval_f1": 0.7417840375586855,
2599
+ "eval_loss": 0.3143324553966522,
2600
+ "eval_precision": 0.7383177570093458,
2601
+ "eval_recall": 0.7452830188679245,
2602
+ "eval_runtime": 16.6976,
2603
+ "eval_samples_per_second": 6.348,
2604
+ "eval_steps_per_second": 0.24,
2605
+ "step": 340
2606
+ },
2607
+ {
2608
+ "epoch": 0.9798850574712644,
2609
+ "grad_norm": 2.665673017501831,
2610
+ "learning_rate": 2.467166698450485e-08,
2611
+ "loss": 0.2825,
2612
+ "step": 341
2613
+ },
2614
+ {
2615
+ "epoch": 0.9827586206896551,
2616
+ "grad_norm": 3.8746824264526367,
2617
+ "learning_rate": 1.812810116760044e-08,
2618
+ "loss": 0.2802,
2619
+ "step": 342
2620
+ },
2621
+ {
2622
+ "epoch": 0.985632183908046,
2623
+ "grad_norm": 4.368228912353516,
2624
+ "learning_rate": 1.2590121809247235e-08,
2625
+ "loss": 0.3322,
2626
+ "step": 343
2627
+ },
2628
+ {
2629
+ "epoch": 0.9885057471264368,
2630
+ "grad_norm": 2.3723561763763428,
2631
+ "learning_rate": 8.05828681252452e-09,
2632
+ "loss": 0.2589,
2633
+ "step": 344
2634
+ },
2635
+ {
2636
+ "epoch": 0.9913793103448276,
2637
+ "grad_norm": 3.3824245929718018,
2638
+ "learning_rate": 4.5330527202480656e-09,
2639
+ "loss": 0.3456,
2640
+ "step": 345
2641
+ },
2642
+ {
2643
+ "epoch": 0.9942528735632183,
2644
+ "grad_norm": 2.772489070892334,
2645
+ "learning_rate": 2.014774668979147e-09,
2646
+ "loss": 0.2756,
2647
+ "step": 346
2648
+ },
2649
+ {
2650
+ "epoch": 0.9971264367816092,
2651
+ "grad_norm": 4.0318827629089355,
2652
+ "learning_rate": 5.037063532498109e-10,
2653
+ "loss": 0.303,
2654
+ "step": 347
2655
+ },
2656
+ {
2657
+ "epoch": 1.0,
2658
+ "grad_norm": 3.210477113723755,
2659
+ "learning_rate": 0.0,
2660
+ "loss": 0.2905,
2661
+ "step": 348
2662
  }
2663
  ],
2664
  "logging_steps": 1,
 
2673
  "should_evaluate": false,
2674
  "should_log": false,
2675
  "should_save": true,
2676
+ "should_training_stop": true
2677
  },
2678
  "attributes": {}
2679
  }
2680
  },
2681
+ "total_flos": 1.0991834484860518e+17,
2682
  "train_batch_size": 8,
2683
  "trial_name": null,
2684
  "trial_params": null