Training in progress, step 190000
Browse files- last-checkpoint/optimizer.pt +1 -1
- last-checkpoint/pytorch_model.bin +1 -1
- last-checkpoint/rng_state.pth +1 -1
- last-checkpoint/scaler.pt +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +139 -3
- pytorch_model.bin +1 -1
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 893439185
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cd03613df05982cc6cd8521404bf2d7d311a82ab0ee46fc664ebdeffd43ec5fb
|
3 |
size 893439185
|
last-checkpoint/pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 449471589
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:75854e0ff3e7c4405dc53eac04c2010a206af7aae27dae0d9ee35db9ad0a959a
|
3 |
size 449471589
|
last-checkpoint/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 15587
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ed3586f2d8b7a9d0704645682c4f2d417639e4cca27eecf545ccb9e56c8d74df
|
3 |
size 15587
|
last-checkpoint/scaler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 559
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:38e985eb8bf02ef58974d91bc1d920b2617a41af091b03e6ddbcd3b7548fe4b3
|
3 |
size 559
|
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 623
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2d738b37d6429a4b318ddcdaacb6b35096cf2474500c27a66a5a92064653d6fd
|
3 |
size 623
|
last-checkpoint/trainer_state.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
-
"epoch": 0.
|
5 |
-
"global_step":
|
6 |
"is_hyper_param_search": false,
|
7 |
"is_local_process_zero": true,
|
8 |
"is_world_process_zero": true,
|
@@ -2454,11 +2454,147 @@
|
|
2454 |
"eval_samples_per_second": 148.965,
|
2455 |
"eval_steps_per_second": 2.328,
|
2456 |
"step": 180000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2457 |
}
|
2458 |
],
|
2459 |
"max_steps": 200000,
|
2460 |
"num_train_epochs": 9223372036854775807,
|
2461 |
-
"total_flos": 4.
|
2462 |
"trial_name": null,
|
2463 |
"trial_params": null
|
2464 |
}
|
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
+
"epoch": 0.2,
|
5 |
+
"global_step": 190000,
|
6 |
"is_hyper_param_search": false,
|
7 |
"is_local_process_zero": true,
|
8 |
"is_world_process_zero": true,
|
|
|
2454 |
"eval_samples_per_second": 148.965,
|
2455 |
"eval_steps_per_second": 2.328,
|
2456 |
"step": 180000
|
2457 |
+
},
|
2458 |
+
{
|
2459 |
+
"epoch": 0.15,
|
2460 |
+
"learning_rate": 1.2127388544199013e-05,
|
2461 |
+
"loss": 0.378,
|
2462 |
+
"step": 180500
|
2463 |
+
},
|
2464 |
+
{
|
2465 |
+
"epoch": 0.15,
|
2466 |
+
"learning_rate": 1.2020863570515961e-05,
|
2467 |
+
"loss": 0.3783,
|
2468 |
+
"step": 181000
|
2469 |
+
},
|
2470 |
+
{
|
2471 |
+
"epoch": 0.16,
|
2472 |
+
"learning_rate": 1.1917218873266704e-05,
|
2473 |
+
"loss": 0.3774,
|
2474 |
+
"step": 181500
|
2475 |
+
},
|
2476 |
+
{
|
2477 |
+
"epoch": 0.16,
|
2478 |
+
"learning_rate": 1.1816245104688946e-05,
|
2479 |
+
"loss": 0.3768,
|
2480 |
+
"step": 182000
|
2481 |
+
},
|
2482 |
+
{
|
2483 |
+
"epoch": 0.16,
|
2484 |
+
"learning_rate": 1.1717754173131136e-05,
|
2485 |
+
"loss": 0.378,
|
2486 |
+
"step": 182500
|
2487 |
+
},
|
2488 |
+
{
|
2489 |
+
"epoch": 0.17,
|
2490 |
+
"learning_rate": 1.162195718996353e-05,
|
2491 |
+
"loss": 0.3775,
|
2492 |
+
"step": 183000
|
2493 |
+
},
|
2494 |
+
{
|
2495 |
+
"epoch": 0.17,
|
2496 |
+
"learning_rate": 1.1528860064395268e-05,
|
2497 |
+
"loss": 0.3778,
|
2498 |
+
"step": 183500
|
2499 |
+
},
|
2500 |
+
{
|
2501 |
+
"epoch": 0.17,
|
2502 |
+
"learning_rate": 1.14384685390956e-05,
|
2503 |
+
"loss": 0.377,
|
2504 |
+
"step": 184000
|
2505 |
+
},
|
2506 |
+
{
|
2507 |
+
"epoch": 0.17,
|
2508 |
+
"learning_rate": 1.1350788189839584e-05,
|
2509 |
+
"loss": 0.3769,
|
2510 |
+
"step": 184500
|
2511 |
+
},
|
2512 |
+
{
|
2513 |
+
"epoch": 0.17,
|
2514 |
+
"learning_rate": 1.126582442516417e-05,
|
2515 |
+
"loss": 0.3779,
|
2516 |
+
"step": 185000
|
2517 |
+
},
|
2518 |
+
{
|
2519 |
+
"epoch": 0.17,
|
2520 |
+
"eval_loss": 0.3469138443470001,
|
2521 |
+
"eval_runtime": 287.4474,
|
2522 |
+
"eval_samples_per_second": 149.593,
|
2523 |
+
"eval_steps_per_second": 2.338,
|
2524 |
+
"step": 185000
|
2525 |
+
},
|
2526 |
+
{
|
2527 |
+
"epoch": 0.18,
|
2528 |
+
"learning_rate": 1.1183582486034581e-05,
|
2529 |
+
"loss": 0.3766,
|
2530 |
+
"step": 185500
|
2531 |
+
},
|
2532 |
+
{
|
2533 |
+
"epoch": 0.18,
|
2534 |
+
"learning_rate": 1.1104067445521018e-05,
|
2535 |
+
"loss": 0.3776,
|
2536 |
+
"step": 186000
|
2537 |
+
},
|
2538 |
+
{
|
2539 |
+
"epoch": 0.18,
|
2540 |
+
"learning_rate": 1.102728420848572e-05,
|
2541 |
+
"loss": 0.3772,
|
2542 |
+
"step": 186500
|
2543 |
+
},
|
2544 |
+
{
|
2545 |
+
"epoch": 0.18,
|
2546 |
+
"learning_rate": 1.0953237511280449e-05,
|
2547 |
+
"loss": 0.3769,
|
2548 |
+
"step": 187000
|
2549 |
+
},
|
2550 |
+
{
|
2551 |
+
"epoch": 0.19,
|
2552 |
+
"learning_rate": 1.0881931921454253e-05,
|
2553 |
+
"loss": 0.3776,
|
2554 |
+
"step": 187500
|
2555 |
+
},
|
2556 |
+
{
|
2557 |
+
"epoch": 0.19,
|
2558 |
+
"learning_rate": 1.0813506214785774e-05,
|
2559 |
+
"loss": 0.3769,
|
2560 |
+
"step": 188000
|
2561 |
+
},
|
2562 |
+
{
|
2563 |
+
"epoch": 0.19,
|
2564 |
+
"learning_rate": 1.0747690362178142e-05,
|
2565 |
+
"loss": 0.377,
|
2566 |
+
"step": 188500
|
2567 |
+
},
|
2568 |
+
{
|
2569 |
+
"epoch": 0.2,
|
2570 |
+
"learning_rate": 1.0684628296065977e-05,
|
2571 |
+
"loss": 0.3765,
|
2572 |
+
"step": 189000
|
2573 |
+
},
|
2574 |
+
{
|
2575 |
+
"epoch": 0.2,
|
2576 |
+
"learning_rate": 1.0624323906414552e-05,
|
2577 |
+
"loss": 0.376,
|
2578 |
+
"step": 189500
|
2579 |
+
},
|
2580 |
+
{
|
2581 |
+
"epoch": 0.2,
|
2582 |
+
"learning_rate": 1.0566780913082688e-05,
|
2583 |
+
"loss": 0.3777,
|
2584 |
+
"step": 190000
|
2585 |
+
},
|
2586 |
+
{
|
2587 |
+
"epoch": 0.2,
|
2588 |
+
"eval_loss": 0.34515419602394104,
|
2589 |
+
"eval_runtime": 275.3559,
|
2590 |
+
"eval_samples_per_second": 156.162,
|
2591 |
+
"eval_steps_per_second": 2.44,
|
2592 |
+
"step": 190000
|
2593 |
}
|
2594 |
],
|
2595 |
"max_steps": 200000,
|
2596 |
"num_train_epochs": 9223372036854775807,
|
2597 |
+
"total_flos": 4.4743682799304704e+21,
|
2598 |
"trial_name": null,
|
2599 |
"trial_params": null
|
2600 |
}
|
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 449471589
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:75854e0ff3e7c4405dc53eac04c2010a206af7aae27dae0d9ee35db9ad0a959a
|
3 |
size 449471589
|