jdannem6 committed on
Commit
50672be
1 Parent(s): 5e77112

Uploaded checkpoint-17500

Browse files
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3096a2a0a313024fcb4310adb597cca112913b2df25d9eb4dbdc2a0e36fbad89
3
  size 4986380064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55542cf68a7a56641df7a59e91b124f6d16eb72304a4aab6742a0f93a5b3d6a9
3
  size 4986380064
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9074d4e047d062c990725d5973278cea97bac7a7d3f188d619178effdae752ef
3
  size 399532808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5ecbf27a334befe14f1464c73a6a77128f6598de400961bb7d5097ecfb48f69
3
  size 399532808
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03426f56d34420363c9eed69bbfc8333f73687623a6b28e4211138ae9f651527
3
  size 2699039674
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10a07aad0a46264c45185ced9dd0645d835455a11a08613ac0cc316256bf2101
3
  size 2699039674
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:befa7552d4a0d4f83dbfa1a7fff53b249a27c7a93b17ac8467553e6392864844
3
- size 14180
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fc99115bf5f04a1f69339b55b87574e78f76c0017fb7fbc54425e463c53fe09
3
+ size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6b94442b72559c9262bb7b2684827bc59deb41027ddc14af8d9ffecb8119b4aa
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d75cd0c4e544f7391f9754fd838738017fc0e36a7e8de482816ca502f9dc5c07
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.375,
5
  "eval_steps": 100,
6
- "global_step": 15000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2257,6 +2257,381 @@
2257
  "eval_samples_per_second": 10.165,
2258
  "eval_steps_per_second": 10.165,
2259
  "step": 15000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2260
  }
2261
  ],
2262
  "logging_steps": 100,
@@ -2264,7 +2639,7 @@
2264
  "num_input_tokens_seen": 0,
2265
  "num_train_epochs": 1,
2266
  "save_steps": 2500,
2267
- "total_flos": 2.3600547495936e+17,
2268
  "train_batch_size": 1,
2269
  "trial_name": null,
2270
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.4375,
5
  "eval_steps": 100,
6
+ "global_step": 17500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2257
  "eval_samples_per_second": 10.165,
2258
  "eval_steps_per_second": 10.165,
2259
  "step": 15000
2260
+ },
2261
+ {
2262
+ "epoch": 0.38,
2263
+ "grad_norm": 6.856844902038574,
2264
+ "learning_rate": 5.025641025641026e-06,
2265
+ "loss": 0.6922,
2266
+ "step": 15100
2267
+ },
2268
+ {
2269
+ "epoch": 0.38,
2270
+ "eval_loss": 0.6971193552017212,
2271
+ "eval_runtime": 98.4333,
2272
+ "eval_samples_per_second": 10.159,
2273
+ "eval_steps_per_second": 10.159,
2274
+ "step": 15100
2275
+ },
2276
+ {
2277
+ "epoch": 0.38,
2278
+ "grad_norm": 4.937108039855957,
2279
+ "learning_rate": 4.923076923076924e-06,
2280
+ "loss": 0.6843,
2281
+ "step": 15200
2282
+ },
2283
+ {
2284
+ "epoch": 0.38,
2285
+ "eval_loss": 0.6769686937332153,
2286
+ "eval_runtime": 98.4292,
2287
+ "eval_samples_per_second": 10.16,
2288
+ "eval_steps_per_second": 10.16,
2289
+ "step": 15200
2290
+ },
2291
+ {
2292
+ "epoch": 0.38,
2293
+ "grad_norm": 7.337194442749023,
2294
+ "learning_rate": 4.820512820512821e-06,
2295
+ "loss": 0.7265,
2296
+ "step": 15300
2297
+ },
2298
+ {
2299
+ "epoch": 0.38,
2300
+ "eval_loss": 0.6937099695205688,
2301
+ "eval_runtime": 98.3534,
2302
+ "eval_samples_per_second": 10.167,
2303
+ "eval_steps_per_second": 10.167,
2304
+ "step": 15300
2305
+ },
2306
+ {
2307
+ "epoch": 0.39,
2308
+ "grad_norm": 6.544970512390137,
2309
+ "learning_rate": 4.717948717948718e-06,
2310
+ "loss": 0.6854,
2311
+ "step": 15400
2312
+ },
2313
+ {
2314
+ "epoch": 0.39,
2315
+ "eval_loss": 0.6800307631492615,
2316
+ "eval_runtime": 98.3657,
2317
+ "eval_samples_per_second": 10.166,
2318
+ "eval_steps_per_second": 10.166,
2319
+ "step": 15400
2320
+ },
2321
+ {
2322
+ "epoch": 0.39,
2323
+ "grad_norm": 4.409825801849365,
2324
+ "learning_rate": 4.615384615384616e-06,
2325
+ "loss": 0.7084,
2326
+ "step": 15500
2327
+ },
2328
+ {
2329
+ "epoch": 0.39,
2330
+ "eval_loss": 0.6897458434104919,
2331
+ "eval_runtime": 98.3938,
2332
+ "eval_samples_per_second": 10.163,
2333
+ "eval_steps_per_second": 10.163,
2334
+ "step": 15500
2335
+ },
2336
+ {
2337
+ "epoch": 0.39,
2338
+ "grad_norm": 4.764963150024414,
2339
+ "learning_rate": 4.512820512820513e-06,
2340
+ "loss": 0.6835,
2341
+ "step": 15600
2342
+ },
2343
+ {
2344
+ "epoch": 0.39,
2345
+ "eval_loss": 0.6795360445976257,
2346
+ "eval_runtime": 98.3753,
2347
+ "eval_samples_per_second": 10.165,
2348
+ "eval_steps_per_second": 10.165,
2349
+ "step": 15600
2350
+ },
2351
+ {
2352
+ "epoch": 0.39,
2353
+ "grad_norm": 4.578685283660889,
2354
+ "learning_rate": 4.4102564102564104e-06,
2355
+ "loss": 0.6789,
2356
+ "step": 15700
2357
+ },
2358
+ {
2359
+ "epoch": 0.39,
2360
+ "eval_loss": 0.6842684149742126,
2361
+ "eval_runtime": 98.3424,
2362
+ "eval_samples_per_second": 10.169,
2363
+ "eval_steps_per_second": 10.169,
2364
+ "step": 15700
2365
+ },
2366
+ {
2367
+ "epoch": 0.4,
2368
+ "grad_norm": 7.085032939910889,
2369
+ "learning_rate": 4.307692307692308e-06,
2370
+ "loss": 0.6473,
2371
+ "step": 15800
2372
+ },
2373
+ {
2374
+ "epoch": 0.4,
2375
+ "eval_loss": 0.6702744364738464,
2376
+ "eval_runtime": 98.3744,
2377
+ "eval_samples_per_second": 10.165,
2378
+ "eval_steps_per_second": 10.165,
2379
+ "step": 15800
2380
+ },
2381
+ {
2382
+ "epoch": 0.4,
2383
+ "grad_norm": 9.547301292419434,
2384
+ "learning_rate": 4.2051282051282055e-06,
2385
+ "loss": 0.6666,
2386
+ "step": 15900
2387
+ },
2388
+ {
2389
+ "epoch": 0.4,
2390
+ "eval_loss": 0.6775800585746765,
2391
+ "eval_runtime": 98.3228,
2392
+ "eval_samples_per_second": 10.171,
2393
+ "eval_steps_per_second": 10.171,
2394
+ "step": 15900
2395
+ },
2396
+ {
2397
+ "epoch": 0.4,
2398
+ "grad_norm": 2.161081075668335,
2399
+ "learning_rate": 4.102564102564103e-06,
2400
+ "loss": 0.6968,
2401
+ "step": 16000
2402
+ },
2403
+ {
2404
+ "epoch": 0.4,
2405
+ "eval_loss": 0.683419942855835,
2406
+ "eval_runtime": 98.4131,
2407
+ "eval_samples_per_second": 10.161,
2408
+ "eval_steps_per_second": 10.161,
2409
+ "step": 16000
2410
+ },
2411
+ {
2412
+ "epoch": 0.4,
2413
+ "grad_norm": 4.315452575683594,
2414
+ "learning_rate": 4.000000000000001e-06,
2415
+ "loss": 0.64,
2416
+ "step": 16100
2417
+ },
2418
+ {
2419
+ "epoch": 0.4,
2420
+ "eval_loss": 0.6916409134864807,
2421
+ "eval_runtime": 98.331,
2422
+ "eval_samples_per_second": 10.17,
2423
+ "eval_steps_per_second": 10.17,
2424
+ "step": 16100
2425
+ },
2426
+ {
2427
+ "epoch": 0.41,
2428
+ "grad_norm": 4.9351582527160645,
2429
+ "learning_rate": 3.897435897435898e-06,
2430
+ "loss": 0.6516,
2431
+ "step": 16200
2432
+ },
2433
+ {
2434
+ "epoch": 0.41,
2435
+ "eval_loss": 0.6831104755401611,
2436
+ "eval_runtime": 98.342,
2437
+ "eval_samples_per_second": 10.169,
2438
+ "eval_steps_per_second": 10.169,
2439
+ "step": 16200
2440
+ },
2441
+ {
2442
+ "epoch": 0.41,
2443
+ "grad_norm": 2.858851432800293,
2444
+ "learning_rate": 3.794871794871795e-06,
2445
+ "loss": 0.6626,
2446
+ "step": 16300
2447
+ },
2448
+ {
2449
+ "epoch": 0.41,
2450
+ "eval_loss": 0.6660827398300171,
2451
+ "eval_runtime": 98.3507,
2452
+ "eval_samples_per_second": 10.168,
2453
+ "eval_steps_per_second": 10.168,
2454
+ "step": 16300
2455
+ },
2456
+ {
2457
+ "epoch": 0.41,
2458
+ "grad_norm": 10.384642601013184,
2459
+ "learning_rate": 3.692307692307693e-06,
2460
+ "loss": 0.7119,
2461
+ "step": 16400
2462
+ },
2463
+ {
2464
+ "epoch": 0.41,
2465
+ "eval_loss": 0.6715333461761475,
2466
+ "eval_runtime": 98.3678,
2467
+ "eval_samples_per_second": 10.166,
2468
+ "eval_steps_per_second": 10.166,
2469
+ "step": 16400
2470
+ },
2471
+ {
2472
+ "epoch": 0.41,
2473
+ "grad_norm": 2.9744458198547363,
2474
+ "learning_rate": 3.58974358974359e-06,
2475
+ "loss": 0.6827,
2476
+ "step": 16500
2477
+ },
2478
+ {
2479
+ "epoch": 0.41,
2480
+ "eval_loss": 0.6574322581291199,
2481
+ "eval_runtime": 98.3789,
2482
+ "eval_samples_per_second": 10.165,
2483
+ "eval_steps_per_second": 10.165,
2484
+ "step": 16500
2485
+ },
2486
+ {
2487
+ "epoch": 0.41,
2488
+ "grad_norm": 3.588651657104492,
2489
+ "learning_rate": 3.487179487179487e-06,
2490
+ "loss": 0.7398,
2491
+ "step": 16600
2492
+ },
2493
+ {
2494
+ "epoch": 0.41,
2495
+ "eval_loss": 0.6578336358070374,
2496
+ "eval_runtime": 98.3458,
2497
+ "eval_samples_per_second": 10.168,
2498
+ "eval_steps_per_second": 10.168,
2499
+ "step": 16600
2500
+ },
2501
+ {
2502
+ "epoch": 0.42,
2503
+ "grad_norm": 3.190857410430908,
2504
+ "learning_rate": 3.384615384615385e-06,
2505
+ "loss": 0.6553,
2506
+ "step": 16700
2507
+ },
2508
+ {
2509
+ "epoch": 0.42,
2510
+ "eval_loss": 0.6542092561721802,
2511
+ "eval_runtime": 98.3756,
2512
+ "eval_samples_per_second": 10.165,
2513
+ "eval_steps_per_second": 10.165,
2514
+ "step": 16700
2515
+ },
2516
+ {
2517
+ "epoch": 0.42,
2518
+ "grad_norm": 3.513176441192627,
2519
+ "learning_rate": 3.2820512820512823e-06,
2520
+ "loss": 0.6531,
2521
+ "step": 16800
2522
+ },
2523
+ {
2524
+ "epoch": 0.42,
2525
+ "eval_loss": 0.6713552474975586,
2526
+ "eval_runtime": 98.3796,
2527
+ "eval_samples_per_second": 10.165,
2528
+ "eval_steps_per_second": 10.165,
2529
+ "step": 16800
2530
+ },
2531
+ {
2532
+ "epoch": 0.42,
2533
+ "grad_norm": 5.897684574127197,
2534
+ "learning_rate": 3.1794871794871795e-06,
2535
+ "loss": 0.6923,
2536
+ "step": 16900
2537
+ },
2538
+ {
2539
+ "epoch": 0.42,
2540
+ "eval_loss": 0.6415435075759888,
2541
+ "eval_runtime": 98.3575,
2542
+ "eval_samples_per_second": 10.167,
2543
+ "eval_steps_per_second": 10.167,
2544
+ "step": 16900
2545
+ },
2546
+ {
2547
+ "epoch": 0.42,
2548
+ "grad_norm": 5.851967811584473,
2549
+ "learning_rate": 3.0769230769230774e-06,
2550
+ "loss": 0.6582,
2551
+ "step": 17000
2552
+ },
2553
+ {
2554
+ "epoch": 0.42,
2555
+ "eval_loss": 0.6755939722061157,
2556
+ "eval_runtime": 98.3485,
2557
+ "eval_samples_per_second": 10.168,
2558
+ "eval_steps_per_second": 10.168,
2559
+ "step": 17000
2560
+ },
2561
+ {
2562
+ "epoch": 0.43,
2563
+ "grad_norm": 3.1814448833465576,
2564
+ "learning_rate": 2.9743589743589746e-06,
2565
+ "loss": 0.6471,
2566
+ "step": 17100
2567
+ },
2568
+ {
2569
+ "epoch": 0.43,
2570
+ "eval_loss": 0.6580842733383179,
2571
+ "eval_runtime": 98.3328,
2572
+ "eval_samples_per_second": 10.17,
2573
+ "eval_steps_per_second": 10.17,
2574
+ "step": 17100
2575
+ },
2576
+ {
2577
+ "epoch": 0.43,
2578
+ "grad_norm": 3.3297533988952637,
2579
+ "learning_rate": 2.8717948717948717e-06,
2580
+ "loss": 0.6238,
2581
+ "step": 17200
2582
+ },
2583
+ {
2584
+ "epoch": 0.43,
2585
+ "eval_loss": 0.6540150046348572,
2586
+ "eval_runtime": 98.392,
2587
+ "eval_samples_per_second": 10.163,
2588
+ "eval_steps_per_second": 10.163,
2589
+ "step": 17200
2590
+ },
2591
+ {
2592
+ "epoch": 0.43,
2593
+ "grad_norm": 4.152995586395264,
2594
+ "learning_rate": 2.7692307692307697e-06,
2595
+ "loss": 0.6523,
2596
+ "step": 17300
2597
+ },
2598
+ {
2599
+ "epoch": 0.43,
2600
+ "eval_loss": 0.6668370366096497,
2601
+ "eval_runtime": 98.343,
2602
+ "eval_samples_per_second": 10.168,
2603
+ "eval_steps_per_second": 10.168,
2604
+ "step": 17300
2605
+ },
2606
+ {
2607
+ "epoch": 0.43,
2608
+ "grad_norm": 8.26444149017334,
2609
+ "learning_rate": 2.666666666666667e-06,
2610
+ "loss": 0.6453,
2611
+ "step": 17400
2612
+ },
2613
+ {
2614
+ "epoch": 0.43,
2615
+ "eval_loss": 0.6706939935684204,
2616
+ "eval_runtime": 98.3728,
2617
+ "eval_samples_per_second": 10.165,
2618
+ "eval_steps_per_second": 10.165,
2619
+ "step": 17400
2620
+ },
2621
+ {
2622
+ "epoch": 0.44,
2623
+ "grad_norm": 4.3345627784729,
2624
+ "learning_rate": 2.564102564102564e-06,
2625
+ "loss": 0.6436,
2626
+ "step": 17500
2627
+ },
2628
+ {
2629
+ "epoch": 0.44,
2630
+ "eval_loss": 0.6370129585266113,
2631
+ "eval_runtime": 98.3453,
2632
+ "eval_samples_per_second": 10.168,
2633
+ "eval_steps_per_second": 10.168,
2634
+ "step": 17500
2635
  }
2636
  ],
2637
  "logging_steps": 100,
 
2639
  "num_input_tokens_seen": 0,
2640
  "num_train_epochs": 1,
2641
  "save_steps": 2500,
2642
+ "total_flos": 2.7533972078592e+17,
2643
  "train_batch_size": 1,
2644
  "trial_name": null,
2645
  "trial_params": null