Uploaded checkpoint-17500
Browse files- model-00001-of-00002.safetensors +1 -1
- model-00002-of-00002.safetensors +1 -1
- optimizer.pt +1 -1
- rng_state.pth +2 -2
- scheduler.pt +1 -1
- trainer_state.json +378 -3
model-00001-of-00002.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4986380064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:55542cf68a7a56641df7a59e91b124f6d16eb72304a4aab6742a0f93a5b3d6a9
|
3 |
size 4986380064
|
model-00002-of-00002.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 399532808
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a5ecbf27a334befe14f1464c73a6a77128f6598de400961bb7d5097ecfb48f69
|
3 |
size 399532808
|
optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 2699039674
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:10a07aad0a46264c45185ced9dd0645d835455a11a08613ac0cc316256bf2101
|
3 |
size 2699039674
|
rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3fc99115bf5f04a1f69339b55b87574e78f76c0017fb7fbc54425e463c53fe09
|
3 |
+
size 14244
|
scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d75cd0c4e544f7391f9754fd838738017fc0e36a7e8de482816ca502f9dc5c07
|
3 |
size 1064
|
trainer_state.json
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
-
"epoch": 0.
|
5 |
"eval_steps": 100,
|
6 |
-
"global_step":
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
@@ -2257,6 +2257,381 @@
|
|
2257 |
"eval_samples_per_second": 10.165,
|
2258 |
"eval_steps_per_second": 10.165,
|
2259 |
"step": 15000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2260 |
}
|
2261 |
],
|
2262 |
"logging_steps": 100,
|
@@ -2264,7 +2639,7 @@
|
|
2264 |
"num_input_tokens_seen": 0,
|
2265 |
"num_train_epochs": 1,
|
2266 |
"save_steps": 2500,
|
2267 |
-
"total_flos": 2.
|
2268 |
"train_batch_size": 1,
|
2269 |
"trial_name": null,
|
2270 |
"trial_params": null
|
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
+
"epoch": 0.4375,
|
5 |
"eval_steps": 100,
|
6 |
+
"global_step": 17500,
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
|
|
2257 |
"eval_samples_per_second": 10.165,
|
2258 |
"eval_steps_per_second": 10.165,
|
2259 |
"step": 15000
|
2260 |
+
},
|
2261 |
+
{
|
2262 |
+
"epoch": 0.38,
|
2263 |
+
"grad_norm": 6.856844902038574,
|
2264 |
+
"learning_rate": 5.025641025641026e-06,
|
2265 |
+
"loss": 0.6922,
|
2266 |
+
"step": 15100
|
2267 |
+
},
|
2268 |
+
{
|
2269 |
+
"epoch": 0.38,
|
2270 |
+
"eval_loss": 0.6971193552017212,
|
2271 |
+
"eval_runtime": 98.4333,
|
2272 |
+
"eval_samples_per_second": 10.159,
|
2273 |
+
"eval_steps_per_second": 10.159,
|
2274 |
+
"step": 15100
|
2275 |
+
},
|
2276 |
+
{
|
2277 |
+
"epoch": 0.38,
|
2278 |
+
"grad_norm": 4.937108039855957,
|
2279 |
+
"learning_rate": 4.923076923076924e-06,
|
2280 |
+
"loss": 0.6843,
|
2281 |
+
"step": 15200
|
2282 |
+
},
|
2283 |
+
{
|
2284 |
+
"epoch": 0.38,
|
2285 |
+
"eval_loss": 0.6769686937332153,
|
2286 |
+
"eval_runtime": 98.4292,
|
2287 |
+
"eval_samples_per_second": 10.16,
|
2288 |
+
"eval_steps_per_second": 10.16,
|
2289 |
+
"step": 15200
|
2290 |
+
},
|
2291 |
+
{
|
2292 |
+
"epoch": 0.38,
|
2293 |
+
"grad_norm": 7.337194442749023,
|
2294 |
+
"learning_rate": 4.820512820512821e-06,
|
2295 |
+
"loss": 0.7265,
|
2296 |
+
"step": 15300
|
2297 |
+
},
|
2298 |
+
{
|
2299 |
+
"epoch": 0.38,
|
2300 |
+
"eval_loss": 0.6937099695205688,
|
2301 |
+
"eval_runtime": 98.3534,
|
2302 |
+
"eval_samples_per_second": 10.167,
|
2303 |
+
"eval_steps_per_second": 10.167,
|
2304 |
+
"step": 15300
|
2305 |
+
},
|
2306 |
+
{
|
2307 |
+
"epoch": 0.39,
|
2308 |
+
"grad_norm": 6.544970512390137,
|
2309 |
+
"learning_rate": 4.717948717948718e-06,
|
2310 |
+
"loss": 0.6854,
|
2311 |
+
"step": 15400
|
2312 |
+
},
|
2313 |
+
{
|
2314 |
+
"epoch": 0.39,
|
2315 |
+
"eval_loss": 0.6800307631492615,
|
2316 |
+
"eval_runtime": 98.3657,
|
2317 |
+
"eval_samples_per_second": 10.166,
|
2318 |
+
"eval_steps_per_second": 10.166,
|
2319 |
+
"step": 15400
|
2320 |
+
},
|
2321 |
+
{
|
2322 |
+
"epoch": 0.39,
|
2323 |
+
"grad_norm": 4.409825801849365,
|
2324 |
+
"learning_rate": 4.615384615384616e-06,
|
2325 |
+
"loss": 0.7084,
|
2326 |
+
"step": 15500
|
2327 |
+
},
|
2328 |
+
{
|
2329 |
+
"epoch": 0.39,
|
2330 |
+
"eval_loss": 0.6897458434104919,
|
2331 |
+
"eval_runtime": 98.3938,
|
2332 |
+
"eval_samples_per_second": 10.163,
|
2333 |
+
"eval_steps_per_second": 10.163,
|
2334 |
+
"step": 15500
|
2335 |
+
},
|
2336 |
+
{
|
2337 |
+
"epoch": 0.39,
|
2338 |
+
"grad_norm": 4.764963150024414,
|
2339 |
+
"learning_rate": 4.512820512820513e-06,
|
2340 |
+
"loss": 0.6835,
|
2341 |
+
"step": 15600
|
2342 |
+
},
|
2343 |
+
{
|
2344 |
+
"epoch": 0.39,
|
2345 |
+
"eval_loss": 0.6795360445976257,
|
2346 |
+
"eval_runtime": 98.3753,
|
2347 |
+
"eval_samples_per_second": 10.165,
|
2348 |
+
"eval_steps_per_second": 10.165,
|
2349 |
+
"step": 15600
|
2350 |
+
},
|
2351 |
+
{
|
2352 |
+
"epoch": 0.39,
|
2353 |
+
"grad_norm": 4.578685283660889,
|
2354 |
+
"learning_rate": 4.4102564102564104e-06,
|
2355 |
+
"loss": 0.6789,
|
2356 |
+
"step": 15700
|
2357 |
+
},
|
2358 |
+
{
|
2359 |
+
"epoch": 0.39,
|
2360 |
+
"eval_loss": 0.6842684149742126,
|
2361 |
+
"eval_runtime": 98.3424,
|
2362 |
+
"eval_samples_per_second": 10.169,
|
2363 |
+
"eval_steps_per_second": 10.169,
|
2364 |
+
"step": 15700
|
2365 |
+
},
|
2366 |
+
{
|
2367 |
+
"epoch": 0.4,
|
2368 |
+
"grad_norm": 7.085032939910889,
|
2369 |
+
"learning_rate": 4.307692307692308e-06,
|
2370 |
+
"loss": 0.6473,
|
2371 |
+
"step": 15800
|
2372 |
+
},
|
2373 |
+
{
|
2374 |
+
"epoch": 0.4,
|
2375 |
+
"eval_loss": 0.6702744364738464,
|
2376 |
+
"eval_runtime": 98.3744,
|
2377 |
+
"eval_samples_per_second": 10.165,
|
2378 |
+
"eval_steps_per_second": 10.165,
|
2379 |
+
"step": 15800
|
2380 |
+
},
|
2381 |
+
{
|
2382 |
+
"epoch": 0.4,
|
2383 |
+
"grad_norm": 9.547301292419434,
|
2384 |
+
"learning_rate": 4.2051282051282055e-06,
|
2385 |
+
"loss": 0.6666,
|
2386 |
+
"step": 15900
|
2387 |
+
},
|
2388 |
+
{
|
2389 |
+
"epoch": 0.4,
|
2390 |
+
"eval_loss": 0.6775800585746765,
|
2391 |
+
"eval_runtime": 98.3228,
|
2392 |
+
"eval_samples_per_second": 10.171,
|
2393 |
+
"eval_steps_per_second": 10.171,
|
2394 |
+
"step": 15900
|
2395 |
+
},
|
2396 |
+
{
|
2397 |
+
"epoch": 0.4,
|
2398 |
+
"grad_norm": 2.161081075668335,
|
2399 |
+
"learning_rate": 4.102564102564103e-06,
|
2400 |
+
"loss": 0.6968,
|
2401 |
+
"step": 16000
|
2402 |
+
},
|
2403 |
+
{
|
2404 |
+
"epoch": 0.4,
|
2405 |
+
"eval_loss": 0.683419942855835,
|
2406 |
+
"eval_runtime": 98.4131,
|
2407 |
+
"eval_samples_per_second": 10.161,
|
2408 |
+
"eval_steps_per_second": 10.161,
|
2409 |
+
"step": 16000
|
2410 |
+
},
|
2411 |
+
{
|
2412 |
+
"epoch": 0.4,
|
2413 |
+
"grad_norm": 4.315452575683594,
|
2414 |
+
"learning_rate": 4.000000000000001e-06,
|
2415 |
+
"loss": 0.64,
|
2416 |
+
"step": 16100
|
2417 |
+
},
|
2418 |
+
{
|
2419 |
+
"epoch": 0.4,
|
2420 |
+
"eval_loss": 0.6916409134864807,
|
2421 |
+
"eval_runtime": 98.331,
|
2422 |
+
"eval_samples_per_second": 10.17,
|
2423 |
+
"eval_steps_per_second": 10.17,
|
2424 |
+
"step": 16100
|
2425 |
+
},
|
2426 |
+
{
|
2427 |
+
"epoch": 0.41,
|
2428 |
+
"grad_norm": 4.9351582527160645,
|
2429 |
+
"learning_rate": 3.897435897435898e-06,
|
2430 |
+
"loss": 0.6516,
|
2431 |
+
"step": 16200
|
2432 |
+
},
|
2433 |
+
{
|
2434 |
+
"epoch": 0.41,
|
2435 |
+
"eval_loss": 0.6831104755401611,
|
2436 |
+
"eval_runtime": 98.342,
|
2437 |
+
"eval_samples_per_second": 10.169,
|
2438 |
+
"eval_steps_per_second": 10.169,
|
2439 |
+
"step": 16200
|
2440 |
+
},
|
2441 |
+
{
|
2442 |
+
"epoch": 0.41,
|
2443 |
+
"grad_norm": 2.858851432800293,
|
2444 |
+
"learning_rate": 3.794871794871795e-06,
|
2445 |
+
"loss": 0.6626,
|
2446 |
+
"step": 16300
|
2447 |
+
},
|
2448 |
+
{
|
2449 |
+
"epoch": 0.41,
|
2450 |
+
"eval_loss": 0.6660827398300171,
|
2451 |
+
"eval_runtime": 98.3507,
|
2452 |
+
"eval_samples_per_second": 10.168,
|
2453 |
+
"eval_steps_per_second": 10.168,
|
2454 |
+
"step": 16300
|
2455 |
+
},
|
2456 |
+
{
|
2457 |
+
"epoch": 0.41,
|
2458 |
+
"grad_norm": 10.384642601013184,
|
2459 |
+
"learning_rate": 3.692307692307693e-06,
|
2460 |
+
"loss": 0.7119,
|
2461 |
+
"step": 16400
|
2462 |
+
},
|
2463 |
+
{
|
2464 |
+
"epoch": 0.41,
|
2465 |
+
"eval_loss": 0.6715333461761475,
|
2466 |
+
"eval_runtime": 98.3678,
|
2467 |
+
"eval_samples_per_second": 10.166,
|
2468 |
+
"eval_steps_per_second": 10.166,
|
2469 |
+
"step": 16400
|
2470 |
+
},
|
2471 |
+
{
|
2472 |
+
"epoch": 0.41,
|
2473 |
+
"grad_norm": 2.9744458198547363,
|
2474 |
+
"learning_rate": 3.58974358974359e-06,
|
2475 |
+
"loss": 0.6827,
|
2476 |
+
"step": 16500
|
2477 |
+
},
|
2478 |
+
{
|
2479 |
+
"epoch": 0.41,
|
2480 |
+
"eval_loss": 0.6574322581291199,
|
2481 |
+
"eval_runtime": 98.3789,
|
2482 |
+
"eval_samples_per_second": 10.165,
|
2483 |
+
"eval_steps_per_second": 10.165,
|
2484 |
+
"step": 16500
|
2485 |
+
},
|
2486 |
+
{
|
2487 |
+
"epoch": 0.41,
|
2488 |
+
"grad_norm": 3.588651657104492,
|
2489 |
+
"learning_rate": 3.487179487179487e-06,
|
2490 |
+
"loss": 0.7398,
|
2491 |
+
"step": 16600
|
2492 |
+
},
|
2493 |
+
{
|
2494 |
+
"epoch": 0.41,
|
2495 |
+
"eval_loss": 0.6578336358070374,
|
2496 |
+
"eval_runtime": 98.3458,
|
2497 |
+
"eval_samples_per_second": 10.168,
|
2498 |
+
"eval_steps_per_second": 10.168,
|
2499 |
+
"step": 16600
|
2500 |
+
},
|
2501 |
+
{
|
2502 |
+
"epoch": 0.42,
|
2503 |
+
"grad_norm": 3.190857410430908,
|
2504 |
+
"learning_rate": 3.384615384615385e-06,
|
2505 |
+
"loss": 0.6553,
|
2506 |
+
"step": 16700
|
2507 |
+
},
|
2508 |
+
{
|
2509 |
+
"epoch": 0.42,
|
2510 |
+
"eval_loss": 0.6542092561721802,
|
2511 |
+
"eval_runtime": 98.3756,
|
2512 |
+
"eval_samples_per_second": 10.165,
|
2513 |
+
"eval_steps_per_second": 10.165,
|
2514 |
+
"step": 16700
|
2515 |
+
},
|
2516 |
+
{
|
2517 |
+
"epoch": 0.42,
|
2518 |
+
"grad_norm": 3.513176441192627,
|
2519 |
+
"learning_rate": 3.2820512820512823e-06,
|
2520 |
+
"loss": 0.6531,
|
2521 |
+
"step": 16800
|
2522 |
+
},
|
2523 |
+
{
|
2524 |
+
"epoch": 0.42,
|
2525 |
+
"eval_loss": 0.6713552474975586,
|
2526 |
+
"eval_runtime": 98.3796,
|
2527 |
+
"eval_samples_per_second": 10.165,
|
2528 |
+
"eval_steps_per_second": 10.165,
|
2529 |
+
"step": 16800
|
2530 |
+
},
|
2531 |
+
{
|
2532 |
+
"epoch": 0.42,
|
2533 |
+
"grad_norm": 5.897684574127197,
|
2534 |
+
"learning_rate": 3.1794871794871795e-06,
|
2535 |
+
"loss": 0.6923,
|
2536 |
+
"step": 16900
|
2537 |
+
},
|
2538 |
+
{
|
2539 |
+
"epoch": 0.42,
|
2540 |
+
"eval_loss": 0.6415435075759888,
|
2541 |
+
"eval_runtime": 98.3575,
|
2542 |
+
"eval_samples_per_second": 10.167,
|
2543 |
+
"eval_steps_per_second": 10.167,
|
2544 |
+
"step": 16900
|
2545 |
+
},
|
2546 |
+
{
|
2547 |
+
"epoch": 0.42,
|
2548 |
+
"grad_norm": 5.851967811584473,
|
2549 |
+
"learning_rate": 3.0769230769230774e-06,
|
2550 |
+
"loss": 0.6582,
|
2551 |
+
"step": 17000
|
2552 |
+
},
|
2553 |
+
{
|
2554 |
+
"epoch": 0.42,
|
2555 |
+
"eval_loss": 0.6755939722061157,
|
2556 |
+
"eval_runtime": 98.3485,
|
2557 |
+
"eval_samples_per_second": 10.168,
|
2558 |
+
"eval_steps_per_second": 10.168,
|
2559 |
+
"step": 17000
|
2560 |
+
},
|
2561 |
+
{
|
2562 |
+
"epoch": 0.43,
|
2563 |
+
"grad_norm": 3.1814448833465576,
|
2564 |
+
"learning_rate": 2.9743589743589746e-06,
|
2565 |
+
"loss": 0.6471,
|
2566 |
+
"step": 17100
|
2567 |
+
},
|
2568 |
+
{
|
2569 |
+
"epoch": 0.43,
|
2570 |
+
"eval_loss": 0.6580842733383179,
|
2571 |
+
"eval_runtime": 98.3328,
|
2572 |
+
"eval_samples_per_second": 10.17,
|
2573 |
+
"eval_steps_per_second": 10.17,
|
2574 |
+
"step": 17100
|
2575 |
+
},
|
2576 |
+
{
|
2577 |
+
"epoch": 0.43,
|
2578 |
+
"grad_norm": 3.3297533988952637,
|
2579 |
+
"learning_rate": 2.8717948717948717e-06,
|
2580 |
+
"loss": 0.6238,
|
2581 |
+
"step": 17200
|
2582 |
+
},
|
2583 |
+
{
|
2584 |
+
"epoch": 0.43,
|
2585 |
+
"eval_loss": 0.6540150046348572,
|
2586 |
+
"eval_runtime": 98.392,
|
2587 |
+
"eval_samples_per_second": 10.163,
|
2588 |
+
"eval_steps_per_second": 10.163,
|
2589 |
+
"step": 17200
|
2590 |
+
},
|
2591 |
+
{
|
2592 |
+
"epoch": 0.43,
|
2593 |
+
"grad_norm": 4.152995586395264,
|
2594 |
+
"learning_rate": 2.7692307692307697e-06,
|
2595 |
+
"loss": 0.6523,
|
2596 |
+
"step": 17300
|
2597 |
+
},
|
2598 |
+
{
|
2599 |
+
"epoch": 0.43,
|
2600 |
+
"eval_loss": 0.6668370366096497,
|
2601 |
+
"eval_runtime": 98.343,
|
2602 |
+
"eval_samples_per_second": 10.168,
|
2603 |
+
"eval_steps_per_second": 10.168,
|
2604 |
+
"step": 17300
|
2605 |
+
},
|
2606 |
+
{
|
2607 |
+
"epoch": 0.43,
|
2608 |
+
"grad_norm": 8.26444149017334,
|
2609 |
+
"learning_rate": 2.666666666666667e-06,
|
2610 |
+
"loss": 0.6453,
|
2611 |
+
"step": 17400
|
2612 |
+
},
|
2613 |
+
{
|
2614 |
+
"epoch": 0.43,
|
2615 |
+
"eval_loss": 0.6706939935684204,
|
2616 |
+
"eval_runtime": 98.3728,
|
2617 |
+
"eval_samples_per_second": 10.165,
|
2618 |
+
"eval_steps_per_second": 10.165,
|
2619 |
+
"step": 17400
|
2620 |
+
},
|
2621 |
+
{
|
2622 |
+
"epoch": 0.44,
|
2623 |
+
"grad_norm": 4.3345627784729,
|
2624 |
+
"learning_rate": 2.564102564102564e-06,
|
2625 |
+
"loss": 0.6436,
|
2626 |
+
"step": 17500
|
2627 |
+
},
|
2628 |
+
{
|
2629 |
+
"epoch": 0.44,
|
2630 |
+
"eval_loss": 0.6370129585266113,
|
2631 |
+
"eval_runtime": 98.3453,
|
2632 |
+
"eval_samples_per_second": 10.168,
|
2633 |
+
"eval_steps_per_second": 10.168,
|
2634 |
+
"step": 17500
|
2635 |
}
|
2636 |
],
|
2637 |
"logging_steps": 100,
|
|
|
2639 |
"num_input_tokens_seen": 0,
|
2640 |
"num_train_epochs": 1,
|
2641 |
"save_steps": 2500,
|
2642 |
+
"total_flos": 2.7533972078592e+17,
|
2643 |
"train_batch_size": 1,
|
2644 |
"trial_name": null,
|
2645 |
"trial_params": null
|