mtzig committed on
Commit
a85c2bc
·
verified ·
1 Parent(s): d5ae2dd

Training in progress, step 348, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa94ec7fc25e7a6ee26ffbbee034689965f336821b064b9f9e0f5dc2f6c05a21
3
  size 27566236
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99fe370c2f00762cd74b9cb5aec9aa42f07d787132197749629db88fcf2aec33
3
  size 27566236
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:442607c5229133e53cd00ee913ff5d44cebaf82c6c20f843b24d268fb4a436af
3
  size 27630900
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f3036f71fb4fd6c846f1cdc90056395da95de0fd5098ff1e73a878cada04312
3
  size 27630900
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dd8d46607d03e57c5376b8610387ec2fcd94514faf917ee762afcbbb96dc811b
3
  size 27622392
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6af6def5736b32ebbd0c4bfff61dd50ea286783ffbdf0a68f88ecc4fd9de8fac
3
  size 27622392
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:459e2897a31e6241cdd6bb0e2e2ba82db83a9a33d01874d52b25108986341565
3
  size 27622392
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:854e6cd413fe017412c2a23609972d95a9f49d7b17bbf96772c906a93d68179a
3
  size 27622392
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a4cd00ad97e87fe831d1bdd5ce43db8c2cd9365cf21907f488bae601ba88367
3
  size 13782528
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0e39d219947f97fb0697e09ca03ada80fea6ebd2d2f7398d49d92cd29fba84b
3
  size 13782528
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6299f59ecc2a7b18770108f685e07e657c3c9e93ace7b10cbbaeca1a530f4c90
3
  size 13782528
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44eadd3bf4caa7c7cd81bf6e5ef4ed2272c95a1d6750dd0c1ce498f64f5df152
3
  size 13782528
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e9868d8cd622192535fce3348430733d8e14fcbe9d92ed5bbb4ad802e3299996
3
  size 13782528
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2dd2651be51deeb4f5349f12e8814303c3186d56477216518cd4eba83ad053ca
3
  size 13782528
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b8d30fea35f5dc3498b7f3e5ffb9dd3408ba45c34eb925feec55d7455ab343f
3
  size 13782528
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6162de5d036d94f4c618c02a3e3791f8579e2111accaf85f18eb469017f56675
3
  size 13782528
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6185843c50764de20922699c89193c33e1e13037719a5d55479aa190e715e4fc
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:269eae03847773cb4bd93a4517a905853bc54a68269b815c31577215fdc55d2d
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:35e51ecf57078c2d652964726d8abc8157e10e9fdddf8cacb5700305b465147a
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d187c3af11ce3a881345c7efd9082ca0cb36a6949dfbe3343c996f42f2afc3e6
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ebe10fe55b3a58ae13fa7a58fca8f2486fa82c4aa360522ee9cde43cc43ba473
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:344fb25f2c262f28ced1e24ddcd208e4806d9788cdec5b2a7f2397013b68c284
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fcda73faaa8d5a9ab0a72d2fef1c1af0341c8e7f8ec0eede744acae39dd22f43
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:805247d87233bfa93147263830dc68344cf75f632c10a49edc8486147debfef7
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a3e47edb1b664bc04c493b0996774157c1ffdb9f0b12df515a0b32829d748704
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d015d34b7739a1124fe31968c605ff2948a6a921eb3374a8e282b0787486605
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8620689655172413,
5
  "eval_steps": 20,
6
- "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2299,6 +2299,366 @@
2299
  "eval_samples_per_second": 6.3,
2300
  "eval_steps_per_second": 0.238,
2301
  "step": 300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2302
  }
2303
  ],
2304
  "logging_steps": 1,
@@ -2313,12 +2673,12 @@
2313
  "should_evaluate": false,
2314
  "should_log": false,
2315
  "should_save": true,
2316
- "should_training_stop": false
2317
  },
2318
  "attributes": {}
2319
  }
2320
  },
2321
- "total_flos": 9.4768839882965e+16,
2322
  "train_batch_size": 8,
2323
  "trial_name": null,
2324
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 20,
6
+ "global_step": 348,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2299
  "eval_samples_per_second": 6.3,
2300
  "eval_steps_per_second": 0.238,
2301
  "step": 300
2302
+ },
2303
+ {
2304
+ "epoch": 0.8649425287356322,
2305
+ "grad_norm": 1.617297649383545,
2306
+ "learning_rate": 1.0922142398186097e-06,
2307
+ "loss": 0.2681,
2308
+ "step": 301
2309
+ },
2310
+ {
2311
+ "epoch": 0.867816091954023,
2312
+ "grad_norm": 2.8231587409973145,
2313
+ "learning_rate": 1.0470516257959351e-06,
2314
+ "loss": 0.2958,
2315
+ "step": 302
2316
+ },
2317
+ {
2318
+ "epoch": 0.8706896551724138,
2319
+ "grad_norm": 1.7231059074401855,
2320
+ "learning_rate": 1.00279094316854e-06,
2321
+ "loss": 0.2739,
2322
+ "step": 303
2323
+ },
2324
+ {
2325
+ "epoch": 0.8735632183908046,
2326
+ "grad_norm": 2.3094263076782227,
2327
+ "learning_rate": 9.594366508138352e-07,
2328
+ "loss": 0.3014,
2329
+ "step": 304
2330
+ },
2331
+ {
2332
+ "epoch": 0.8764367816091954,
2333
+ "grad_norm": 2.9980764389038086,
2334
+ "learning_rate": 9.169931162983137e-07,
2335
+ "loss": 0.2781,
2336
+ "step": 305
2337
+ },
2338
+ {
2339
+ "epoch": 0.8793103448275862,
2340
+ "grad_norm": 2.005481243133545,
2341
+ "learning_rate": 8.754646154375801e-07,
2342
+ "loss": 0.2592,
2343
+ "step": 306
2344
+ },
2345
+ {
2346
+ "epoch": 0.882183908045977,
2347
+ "grad_norm": 2.7463858127593994,
2348
+ "learning_rate": 8.348553318655795e-07,
2349
+ "loss": 0.3159,
2350
+ "step": 307
2351
+ },
2352
+ {
2353
+ "epoch": 0.8850574712643678,
2354
+ "grad_norm": 2.01882266998291,
2355
+ "learning_rate": 7.951693566131325e-07,
2356
+ "loss": 0.3136,
2357
+ "step": 308
2358
+ },
2359
+ {
2360
+ "epoch": 0.8879310344827587,
2361
+ "grad_norm": 1.7923675775527954,
2362
+ "learning_rate": 7.564106876958188e-07,
2363
+ "loss": 0.2936,
2364
+ "step": 309
2365
+ },
2366
+ {
2367
+ "epoch": 0.8908045977011494,
2368
+ "grad_norm": 2.2415239810943604,
2369
+ "learning_rate": 7.185832297111939e-07,
2370
+ "loss": 0.3882,
2371
+ "step": 310
2372
+ },
2373
+ {
2374
+ "epoch": 0.8936781609195402,
2375
+ "grad_norm": 1.7161551713943481,
2376
+ "learning_rate": 6.816907934454353e-07,
2377
+ "loss": 0.2533,
2378
+ "step": 311
2379
+ },
2380
+ {
2381
+ "epoch": 0.896551724137931,
2382
+ "grad_norm": 2.019099712371826,
2383
+ "learning_rate": 6.457370954894582e-07,
2384
+ "loss": 0.2335,
2385
+ "step": 312
2386
+ },
2387
+ {
2388
+ "epoch": 0.8994252873563219,
2389
+ "grad_norm": 2.187922477722168,
2390
+ "learning_rate": 6.107257578644721e-07,
2391
+ "loss": 0.2695,
2392
+ "step": 313
2393
+ },
2394
+ {
2395
+ "epoch": 0.9022988505747126,
2396
+ "grad_norm": 1.7923535108566284,
2397
+ "learning_rate": 5.766603076571164e-07,
2398
+ "loss": 0.2652,
2399
+ "step": 314
2400
+ },
2401
+ {
2402
+ "epoch": 0.9051724137931034,
2403
+ "grad_norm": 1.9885005950927734,
2404
+ "learning_rate": 5.43544176664137e-07,
2405
+ "loss": 0.2851,
2406
+ "step": 315
2407
+ },
2408
+ {
2409
+ "epoch": 0.9080459770114943,
2410
+ "grad_norm": 1.9837305545806885,
2411
+ "learning_rate": 5.113807010466432e-07,
2412
+ "loss": 0.2884,
2413
+ "step": 316
2414
+ },
2415
+ {
2416
+ "epoch": 0.9109195402298851,
2417
+ "grad_norm": 2.307769536972046,
2418
+ "learning_rate": 4.801731209940375e-07,
2419
+ "loss": 0.2827,
2420
+ "step": 317
2421
+ },
2422
+ {
2423
+ "epoch": 0.9137931034482759,
2424
+ "grad_norm": 2.077484130859375,
2425
+ "learning_rate": 4.499245803975927e-07,
2426
+ "loss": 0.2686,
2427
+ "step": 318
2428
+ },
2429
+ {
2430
+ "epoch": 0.9166666666666666,
2431
+ "grad_norm": 2.722639799118042,
2432
+ "learning_rate": 4.206381265337189e-07,
2433
+ "loss": 0.301,
2434
+ "step": 319
2435
+ },
2436
+ {
2437
+ "epoch": 0.9195402298850575,
2438
+ "grad_norm": 2.3338513374328613,
2439
+ "learning_rate": 3.9231670975699354e-07,
2440
+ "loss": 0.3223,
2441
+ "step": 320
2442
+ },
2443
+ {
2444
+ "epoch": 0.9195402298850575,
2445
+ "eval_accuracy": 0.8613861386138614,
2446
+ "eval_f1": 0.7407407407407407,
2447
+ "eval_loss": 0.3157050609588623,
2448
+ "eval_precision": 0.7272727272727273,
2449
+ "eval_recall": 0.7547169811320755,
2450
+ "eval_runtime": 16.7717,
2451
+ "eval_samples_per_second": 6.32,
2452
+ "eval_steps_per_second": 0.238,
2453
+ "step": 320
2454
+ },
2455
+ {
2456
+ "epoch": 0.9224137931034483,
2457
+ "grad_norm": 3.109226942062378,
2458
+ "learning_rate": 3.649631832029288e-07,
2459
+ "loss": 0.3366,
2460
+ "step": 321
2461
+ },
2462
+ {
2463
+ "epoch": 0.9252873563218391,
2464
+ "grad_norm": 1.830435872077942,
2465
+ "learning_rate": 3.385803025005463e-07,
2466
+ "loss": 0.277,
2467
+ "step": 322
2468
+ },
2469
+ {
2470
+ "epoch": 0.9281609195402298,
2471
+ "grad_norm": 2.406721591949463,
2472
+ "learning_rate": 3.1317072549477246e-07,
2473
+ "loss": 0.3517,
2474
+ "step": 323
2475
+ },
2476
+ {
2477
+ "epoch": 0.9310344827586207,
2478
+ "grad_norm": 2.0932199954986572,
2479
+ "learning_rate": 2.887370119786792e-07,
2480
+ "loss": 0.2832,
2481
+ "step": 324
2482
+ },
2483
+ {
2484
+ "epoch": 0.9339080459770115,
2485
+ "grad_norm": 2.193326473236084,
2486
+ "learning_rate": 2.6528162343561593e-07,
2487
+ "loss": 0.3269,
2488
+ "step": 325
2489
+ },
2490
+ {
2491
+ "epoch": 0.9367816091954023,
2492
+ "grad_norm": 2.217686653137207,
2493
+ "learning_rate": 2.4280692279122554e-07,
2494
+ "loss": 0.2642,
2495
+ "step": 326
2496
+ },
2497
+ {
2498
+ "epoch": 0.9396551724137931,
2499
+ "grad_norm": 1.9883277416229248,
2500
+ "learning_rate": 2.2131517417540937e-07,
2501
+ "loss": 0.2929,
2502
+ "step": 327
2503
+ },
2504
+ {
2505
+ "epoch": 0.9425287356321839,
2506
+ "grad_norm": 2.0583817958831787,
2507
+ "learning_rate": 2.00808542694233e-07,
2508
+ "loss": 0.2886,
2509
+ "step": 328
2510
+ },
2511
+ {
2512
+ "epoch": 0.9454022988505747,
2513
+ "grad_norm": 1.7167632579803467,
2514
+ "learning_rate": 1.8128909421180506e-07,
2515
+ "loss": 0.2528,
2516
+ "step": 329
2517
+ },
2518
+ {
2519
+ "epoch": 0.9482758620689655,
2520
+ "grad_norm": 1.9056061506271362,
2521
+ "learning_rate": 1.6275879514217052e-07,
2522
+ "loss": 0.3133,
2523
+ "step": 330
2524
+ },
2525
+ {
2526
+ "epoch": 0.9511494252873564,
2527
+ "grad_norm": 2.4173924922943115,
2528
+ "learning_rate": 1.4521951225120345e-07,
2529
+ "loss": 0.3712,
2530
+ "step": 331
2531
+ },
2532
+ {
2533
+ "epoch": 0.9540229885057471,
2534
+ "grad_norm": 1.6717036962509155,
2535
+ "learning_rate": 1.2867301246854757e-07,
2536
+ "loss": 0.2764,
2537
+ "step": 332
2538
+ },
2539
+ {
2540
+ "epoch": 0.9568965517241379,
2541
+ "grad_norm": 1.7265243530273438,
2542
+ "learning_rate": 1.1312096270961525e-07,
2543
+ "loss": 0.2683,
2544
+ "step": 333
2545
+ },
2546
+ {
2547
+ "epoch": 0.9597701149425287,
2548
+ "grad_norm": 1.933962345123291,
2549
+ "learning_rate": 9.856492970766296e-08,
2550
+ "loss": 0.3035,
2551
+ "step": 334
2552
+ },
2553
+ {
2554
+ "epoch": 0.9626436781609196,
2555
+ "grad_norm": 1.7811708450317383,
2556
+ "learning_rate": 8.50063798559475e-08,
2557
+ "loss": 0.2747,
2558
+ "step": 335
2559
+ },
2560
+ {
2561
+ "epoch": 0.9655172413793104,
2562
+ "grad_norm": 2.420718193054199,
2563
+ "learning_rate": 7.244667906001202e-08,
2564
+ "loss": 0.2595,
2565
+ "step": 336
2566
+ },
2567
+ {
2568
+ "epoch": 0.9683908045977011,
2569
+ "grad_norm": 2.360978841781616,
2570
+ "learning_rate": 6.088709260007153e-08,
2571
+ "loss": 0.2845,
2572
+ "step": 337
2573
+ },
2574
+ {
2575
+ "epoch": 0.9712643678160919,
2576
+ "grad_norm": 1.8564136028289795,
2577
+ "learning_rate": 5.032878500355498e-08,
2578
+ "loss": 0.2868,
2579
+ "step": 338
2580
+ },
2581
+ {
2582
+ "epoch": 0.9741379310344828,
2583
+ "grad_norm": 3.320560932159424,
2584
+ "learning_rate": 4.07728199277857e-08,
2585
+ "loss": 0.298,
2586
+ "step": 339
2587
+ },
2588
+ {
2589
+ "epoch": 0.9770114942528736,
2590
+ "grad_norm": 3.1737234592437744,
2591
+ "learning_rate": 3.2220160052828245e-08,
2592
+ "loss": 0.3448,
2593
+ "step": 340
2594
+ },
2595
+ {
2596
+ "epoch": 0.9770114942528736,
2597
+ "eval_accuracy": 0.8613861386138614,
2598
+ "eval_f1": 0.7407407407407407,
2599
+ "eval_loss": 0.31532150506973267,
2600
+ "eval_precision": 0.7272727272727273,
2601
+ "eval_recall": 0.7547169811320755,
2602
+ "eval_runtime": 17.2915,
2603
+ "eval_samples_per_second": 6.13,
2604
+ "eval_steps_per_second": 0.231,
2605
+ "step": 340
2606
+ },
2607
+ {
2608
+ "epoch": 0.9798850574712644,
2609
+ "grad_norm": 1.8069976568222046,
2610
+ "learning_rate": 2.467166698450485e-08,
2611
+ "loss": 0.2825,
2612
+ "step": 341
2613
+ },
2614
+ {
2615
+ "epoch": 0.9827586206896551,
2616
+ "grad_norm": 2.923757314682007,
2617
+ "learning_rate": 1.812810116760044e-08,
2618
+ "loss": 0.2836,
2619
+ "step": 342
2620
+ },
2621
+ {
2622
+ "epoch": 0.985632183908046,
2623
+ "grad_norm": 3.138817310333252,
2624
+ "learning_rate": 1.2590121809247235e-08,
2625
+ "loss": 0.3312,
2626
+ "step": 343
2627
+ },
2628
+ {
2629
+ "epoch": 0.9885057471264368,
2630
+ "grad_norm": 1.6502025127410889,
2631
+ "learning_rate": 8.05828681252452e-09,
2632
+ "loss": 0.2625,
2633
+ "step": 344
2634
+ },
2635
+ {
2636
+ "epoch": 0.9913793103448276,
2637
+ "grad_norm": 2.4544200897216797,
2638
+ "learning_rate": 4.5330527202480656e-09,
2639
+ "loss": 0.3451,
2640
+ "step": 345
2641
+ },
2642
+ {
2643
+ "epoch": 0.9942528735632183,
2644
+ "grad_norm": 1.9814451932907104,
2645
+ "learning_rate": 2.014774668979147e-09,
2646
+ "loss": 0.2715,
2647
+ "step": 346
2648
+ },
2649
+ {
2650
+ "epoch": 0.9971264367816092,
2651
+ "grad_norm": 2.851459503173828,
2652
+ "learning_rate": 5.037063532498109e-10,
2653
+ "loss": 0.3053,
2654
+ "step": 347
2655
+ },
2656
+ {
2657
+ "epoch": 1.0,
2658
+ "grad_norm": 1.8435858488082886,
2659
+ "learning_rate": 0.0,
2660
+ "loss": 0.2909,
2661
+ "step": 348
2662
  }
2663
  ],
2664
  "logging_steps": 1,
 
2673
  "should_evaluate": false,
2674
  "should_log": false,
2675
  "should_save": true,
2676
+ "should_training_stop": true
2677
  },
2678
  "attributes": {}
2679
  }
2680
  },
2681
+ "total_flos": 1.1002361050405274e+17,
2682
  "train_batch_size": 8,
2683
  "trial_name": null,
2684
  "trial_params": null