SummerSigh committed • Commit bd8a4e1
Parent(s): 3afb0a7

Upload 8 files

Files changed:
- model.safetensors +1 -1
- optimizer.pt +1 -1
- scheduler.pt +1 -1
- trainer_state.json +884 -4
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:4f64a3cafa47c9ba3e54437d1f9852c222a0087b81b8ce6e387c02057cb1bfd3
 size 18494040
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:be93c1be2bc3f33f7d84eeeeb4d8c4d995ed64199a72fdbe553b1f003bc30445
 size 37035002
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:60cbdd85cf6bcb7c6140c88eacbc709e5746be6620fc2427f93d0a9c73d83631
 size 1064
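The three binary files above change only in their Git LFS pointers: each pointer is a three-line text stub recording the LFS spec version, the object's SHA-256, and its size in bytes. The sizes are unchanged, so this commit re-uploads new contents of the same shape. As a minimal illustrative sketch (not part of this repo; Python stdlib only), such a pointer can be parsed and a downloaded blob checked against it:

```python
import hashlib
from pathlib import Path

def parse_lfs_pointer(path: str) -> dict:
    """Parse a Git LFS pointer file ("key value" lines) into a dict."""
    fields = {}
    for line in Path(path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

def verify_blob(pointer: dict, blob_path: str) -> bool:
    """Check a downloaded blob against the pointer's oid and size."""
    data = Path(blob_path).read_bytes()
    algo, _, expected = pointer["oid"].partition(":")
    assert algo == "sha256"  # the only algorithm in LFS spec v1
    return (len(data) == int(pointer["size"])
            and hashlib.sha256(data).hexdigest() == expected)

# e.g. pointer = parse_lfs_pointer("model.safetensors")
# when the repo is checked out without LFS smudging.
```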
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 1.
+  "epoch": 1.2783143881998358,
   "eval_steps": 500,
-  "global_step":
+  "global_step": 116000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -5311,11 +5311,891 @@
       "loss": 4.3878,
       "num_input_tokens_seen": 581214146,
       "step": 99450
+    },
+    {
+      "epoch": 1.0975871815922729,
+      "grad_norm": 1.9078856706619263,
+      "learning_rate": 0.00010148434400575989,
+      "loss": 4.3976,
+      "num_input_tokens_seen": 582094658,
+      "step": 99600
+    },
+    {
+      "epoch": 1.0992401743356346,
+      "grad_norm": 1.8750337362289429,
+      "learning_rate": 0.00010139618261163437,
+      "loss": 4.3999,
+      "num_input_tokens_seen": 582981922,
+      "step": 99750
+    },
+    {
+      "epoch": 1.1008931670789965,
+      "grad_norm": 1.9243488311767578,
+      "learning_rate": 0.00010130802121750887,
+      "loss": 4.3879,
+      "num_input_tokens_seen": 583869026,
+      "step": 99900
+    },
+    {
+      "epoch": 1.1025461598223583,
+      "grad_norm": 1.8446391820907593,
+      "learning_rate": 0.00010121985982338334,
+      "loss": 4.3894,
+      "num_input_tokens_seen": 584749826,
+      "step": 100050
+    },
+    {
+      "epoch": 1.1041991525657202,
+      "grad_norm": 1.726158857345581,
+      "learning_rate": 0.00010113169842925785,
+      "loss": 4.3985,
+      "num_input_tokens_seen": 585630274,
+      "step": 100200
+    },
+    {
+      "epoch": 1.1058521453090822,
+      "grad_norm": 1.8227604627609253,
+      "learning_rate": 0.00010104353703513232,
+      "loss": 4.3906,
+      "num_input_tokens_seen": 586484930,
+      "step": 100350
+    },
+    {
+      "epoch": 1.1075051380524439,
+      "grad_norm": 1.9156420230865479,
+      "learning_rate": 0.00010095537564100682,
+      "loss": 4.3893,
+      "num_input_tokens_seen": 587352738,
+      "step": 100500
+    },
+    {
+      "epoch": 1.1091581307958058,
+      "grad_norm": 1.8385225534439087,
+      "learning_rate": 0.0001008678019895088,
+      "loss": 4.3994,
+      "num_input_tokens_seen": 588239810,
+      "step": 100650
+    },
+    {
+      "epoch": 1.1108111235391678,
+      "grad_norm": 1.9076261520385742,
+      "learning_rate": 0.00010077964059538329,
+      "loss": 4.3922,
+      "num_input_tokens_seen": 589116514,
+      "step": 100800
+    },
+    {
+      "epoch": 1.1124641162825295,
+      "grad_norm": 1.8701651096343994,
+      "learning_rate": 0.00010069147920125778,
+      "loss": 4.4015,
+      "num_input_tokens_seen": 589983426,
+      "step": 100950
+    },
+    {
+      "epoch": 1.1141171090258915,
+      "grad_norm": 1.9545180797576904,
+      "learning_rate": 0.00010060331780713227,
+      "loss": 4.3978,
+      "num_input_tokens_seen": 590856994,
+      "step": 101100
+    },
+    {
+      "epoch": 1.1157701017692532,
+      "grad_norm": 1.9418137073516846,
+      "learning_rate": 0.00010051515641300676,
+      "loss": 4.3893,
+      "num_input_tokens_seen": 591735490,
+      "step": 101250
+    },
+    {
+      "epoch": 1.1174230945126151,
+      "grad_norm": 1.892683982849121,
+      "learning_rate": 0.00010042699501888123,
+      "loss": 4.3833,
+      "num_input_tokens_seen": 592622626,
+      "step": 101400
+    },
+    {
+      "epoch": 1.1190760872559768,
+      "grad_norm": 1.830404281616211,
+      "learning_rate": 0.00010033883362475573,
+      "loss": 4.3939,
+      "num_input_tokens_seen": 593500354,
+      "step": 101550
+    },
+    {
+      "epoch": 1.1207290799993388,
+      "grad_norm": 1.8536481857299805,
+      "learning_rate": 0.00010025067223063021,
+      "loss": 4.3826,
+      "num_input_tokens_seen": 594383234,
+      "step": 101700
+    },
+    {
+      "epoch": 1.1223820727427007,
+      "grad_norm": 1.84872567653656,
+      "learning_rate": 0.00010016251083650471,
+      "loss": 4.3847,
+      "num_input_tokens_seen": 595255266,
+      "step": 101850
+    },
+    {
+      "epoch": 1.1240350654860625,
+      "grad_norm": 1.8653180599212646,
+      "learning_rate": 0.00010007434944237918,
+      "loss": 4.392,
+      "num_input_tokens_seen": 596135586,
+      "step": 102000
+    },
+    {
+      "epoch": 1.1256880582294244,
+      "grad_norm": 1.8534561395645142,
+      "learning_rate": 9.998618804825369e-05,
+      "loss": 4.3862,
+      "num_input_tokens_seen": 597009218,
+      "step": 102150
+    },
+    {
+      "epoch": 1.1273410509727861,
+      "grad_norm": 1.8982864618301392,
+      "learning_rate": 9.989802665412816e-05,
+      "loss": 4.3969,
+      "num_input_tokens_seen": 597873026,
+      "step": 102300
+    },
+    {
+      "epoch": 1.128994043716148,
+      "grad_norm": 1.9212620258331299,
+      "learning_rate": 9.980986526000266e-05,
+      "loss": 4.3872,
+      "num_input_tokens_seen": 598748322,
+      "step": 102450
+    },
+    {
+      "epoch": 1.13064703645951,
+      "grad_norm": 1.8133482933044434,
+      "learning_rate": 9.972170386587714e-05,
+      "loss": 4.3801,
+      "num_input_tokens_seen": 599625410,
+      "step": 102600
+    },
+    {
+      "epoch": 1.1323000292028718,
+      "grad_norm": 1.8521312475204468,
+      "learning_rate": 9.963354247175164e-05,
+      "loss": 4.3867,
+      "num_input_tokens_seen": 600489762,
+      "step": 102750
+    },
+    {
+      "epoch": 1.1339530219462337,
+      "grad_norm": 2.050074577331543,
+      "learning_rate": 9.954538107762612e-05,
+      "loss": 4.3813,
+      "num_input_tokens_seen": 601357666,
+      "step": 102900
+    },
+    {
+      "epoch": 1.1356060146895954,
+      "grad_norm": 1.8785549402236938,
+      "learning_rate": 9.945721968350062e-05,
+      "loss": 4.3799,
+      "num_input_tokens_seen": 602239362,
+      "step": 103050
+    },
+    {
+      "epoch": 1.1372590074329574,
+      "grad_norm": 1.9237360954284668,
+      "learning_rate": 9.93690582893751e-05,
+      "loss": 4.3902,
+      "num_input_tokens_seen": 603119650,
+      "step": 103200
+    },
+    {
+      "epoch": 1.1389120001763193,
+      "grad_norm": 1.8664278984069824,
+      "learning_rate": 9.928089689524957e-05,
+      "loss": 4.3905,
+      "num_input_tokens_seen": 603985666,
+      "step": 103350
+    },
+    {
+      "epoch": 1.140564992919681,
+      "grad_norm": 1.812515139579773,
+      "learning_rate": 9.919273550112407e-05,
+      "loss": 4.3757,
+      "num_input_tokens_seen": 604874530,
+      "step": 103500
+    },
+    {
+      "epoch": 1.142217985663043,
+      "grad_norm": 1.9093918800354004,
+      "learning_rate": 9.910457410699855e-05,
+      "loss": 4.4058,
+      "num_input_tokens_seen": 605755394,
+      "step": 103650
+    },
+    {
+      "epoch": 1.1438709784064047,
+      "grad_norm": 1.9712496995925903,
+      "learning_rate": 9.901641271287305e-05,
+      "loss": 4.3848,
+      "num_input_tokens_seen": 606649794,
+      "step": 103800
+    },
+    {
+      "epoch": 1.1455239711497667,
+      "grad_norm": 1.9102181196212769,
+      "learning_rate": 9.892825131874752e-05,
+      "loss": 4.3926,
+      "num_input_tokens_seen": 607513858,
+      "step": 103950
+    },
+    {
+      "epoch": 1.1471769638931284,
+      "grad_norm": 1.7749512195587158,
+      "learning_rate": 9.884008992462201e-05,
+      "loss": 4.3906,
+      "num_input_tokens_seen": 608391202,
+      "step": 104100
+    },
+    {
+      "epoch": 1.1488299566364903,
+      "grad_norm": 1.8394023180007935,
+      "learning_rate": 9.87519285304965e-05,
+      "loss": 4.3814,
+      "num_input_tokens_seen": 609282018,
+      "step": 104250
+    },
+    {
+      "epoch": 1.1504829493798523,
+      "grad_norm": 1.9161593914031982,
+      "learning_rate": 9.866376713637099e-05,
+      "loss": 4.3947,
+      "num_input_tokens_seen": 610168514,
+      "step": 104400
+    },
+    {
+      "epoch": 1.152135942123214,
+      "grad_norm": 1.930790901184082,
+      "learning_rate": 9.857560574224548e-05,
+      "loss": 4.3928,
+      "num_input_tokens_seen": 611052354,
+      "step": 104550
+    },
+    {
+      "epoch": 1.153788934866576,
+      "grad_norm": 1.836146354675293,
+      "learning_rate": 9.848803209074748e-05,
+      "loss": 4.3977,
+      "num_input_tokens_seen": 611926498,
+      "step": 104700
+    },
+    {
+      "epoch": 1.155441927609938,
+      "grad_norm": 1.7802364826202393,
+      "learning_rate": 9.839987069662196e-05,
+      "loss": 4.3921,
+      "num_input_tokens_seen": 612818210,
+      "step": 104850
+    },
+    {
+      "epoch": 1.1570949203532996,
+      "grad_norm": 1.9587794542312622,
+      "learning_rate": 9.831170930249643e-05,
+      "loss": 4.3925,
+      "num_input_tokens_seen": 613694850,
+      "step": 105000
+    },
+    {
+      "epoch": 1.1587479130966616,
+      "grad_norm": 1.9676165580749512,
+      "learning_rate": 9.822354790837093e-05,
+      "loss": 4.3782,
+      "num_input_tokens_seen": 614583618,
+      "step": 105150
+    },
+    {
+      "epoch": 1.1604009058400233,
+      "grad_norm": 1.8942914009094238,
+      "learning_rate": 9.813538651424541e-05,
+      "loss": 4.3792,
+      "num_input_tokens_seen": 615478530,
+      "step": 105300
+    },
+    {
+      "epoch": 1.1620538985833853,
+      "grad_norm": 1.8436447381973267,
+      "learning_rate": 9.804722512011991e-05,
+      "loss": 4.3848,
+      "num_input_tokens_seen": 616374914,
+      "step": 105450
+    },
+    {
+      "epoch": 1.163706891326747,
+      "grad_norm": 1.9150909185409546,
+      "learning_rate": 9.795906372599439e-05,
+      "loss": 4.381,
+      "num_input_tokens_seen": 617260162,
+      "step": 105600
+    },
+    {
+      "epoch": 1.165359884070109,
+      "grad_norm": 2.0403525829315186,
+      "learning_rate": 9.787090233186889e-05,
+      "loss": 4.3835,
+      "num_input_tokens_seen": 618136386,
+      "step": 105750
+    },
+    {
+      "epoch": 1.1670128768134709,
+      "grad_norm": 1.8062185049057007,
+      "learning_rate": 9.778274093774336e-05,
+      "loss": 4.3821,
+      "num_input_tokens_seen": 619009282,
+      "step": 105900
+    },
+    {
+      "epoch": 1.1686658695568326,
+      "grad_norm": 1.9948753118515015,
+      "learning_rate": 9.769457954361787e-05,
+      "loss": 4.3911,
+      "num_input_tokens_seen": 619886722,
+      "step": 106050
+    },
+    {
+      "epoch": 1.1703188623001946,
+      "grad_norm": 1.8109992742538452,
+      "learning_rate": 9.760641814949234e-05,
+      "loss": 4.3791,
+      "num_input_tokens_seen": 620758178,
+      "step": 106200
+    },
+    {
+      "epoch": 1.1719718550435563,
+      "grad_norm": 1.9707014560699463,
+      "learning_rate": 9.751825675536684e-05,
+      "loss": 4.3809,
+      "num_input_tokens_seen": 621629506,
+      "step": 106350
+    },
+    {
+      "epoch": 1.1736248477869182,
+      "grad_norm": 1.9458143711090088,
+      "learning_rate": 9.743009536124132e-05,
+      "loss": 4.3952,
+      "num_input_tokens_seen": 622496418,
+      "step": 106500
+    },
+    {
+      "epoch": 1.17527784053028,
+      "grad_norm": 1.9349957704544067,
+      "learning_rate": 9.734310945237081e-05,
+      "loss": 4.379,
+      "num_input_tokens_seen": 623395010,
+      "step": 106650
+    },
+    {
+      "epoch": 1.176930833273642,
+      "grad_norm": 1.9133590459823608,
+      "learning_rate": 9.725494805824531e-05,
+      "loss": 4.3689,
+      "num_input_tokens_seen": 624262434,
+      "step": 106800
+    },
+    {
+      "epoch": 1.1785838260170038,
+      "grad_norm": 1.9451539516448975,
+      "learning_rate": 9.716678666411979e-05,
+      "loss": 4.3863,
+      "num_input_tokens_seen": 625153506,
+      "step": 106950
+    },
+    {
+      "epoch": 1.1802368187603656,
+      "grad_norm": 2.0072357654571533,
+      "learning_rate": 9.707862526999429e-05,
+      "loss": 4.378,
+      "num_input_tokens_seen": 626026690,
+      "step": 107100
+    },
+    {
+      "epoch": 1.1818898115037275,
+      "grad_norm": 1.7655397653579712,
+      "learning_rate": 9.699046387586877e-05,
+      "loss": 4.3801,
+      "num_input_tokens_seen": 626902594,
+      "step": 107250
+    },
+    {
+      "epoch": 1.1835428042470895,
+      "grad_norm": 1.9583156108856201,
+      "learning_rate": 9.690230248174325e-05,
+      "loss": 4.3902,
+      "num_input_tokens_seen": 627796194,
+      "step": 107400
+    },
+    {
+      "epoch": 1.1851957969904512,
+      "grad_norm": 1.7717612981796265,
+      "learning_rate": 9.681414108761774e-05,
+      "loss": 4.3812,
+      "num_input_tokens_seen": 628675970,
+      "step": 107550
+    },
+    {
+      "epoch": 1.1868487897338131,
+      "grad_norm": 1.9090009927749634,
+      "learning_rate": 9.672597969349223e-05,
+      "loss": 4.3889,
+      "num_input_tokens_seen": 629549794,
+      "step": 107700
+    },
+    {
+      "epoch": 1.1885017824771749,
+      "grad_norm": 1.8910843133926392,
+      "learning_rate": 9.663781829936672e-05,
+      "loss": 4.3913,
+      "num_input_tokens_seen": 630437378,
+      "step": 107850
+    },
+    {
+      "epoch": 1.1901547752205368,
+      "grad_norm": 1.840728521347046,
+      "learning_rate": 9.654965690524121e-05,
+      "loss": 4.3792,
+      "num_input_tokens_seen": 631313666,
+      "step": 108000
+    },
+    {
+      "epoch": 1.1918077679638985,
+      "grad_norm": 1.8772791624069214,
+      "learning_rate": 9.64614955111157e-05,
+      "loss": 4.3813,
+      "num_input_tokens_seen": 632194466,
+      "step": 108150
+    },
+    {
+      "epoch": 1.1934607607072605,
+      "grad_norm": 1.9666273593902588,
+      "learning_rate": 9.637333411699017e-05,
+      "loss": 4.3716,
+      "num_input_tokens_seen": 633058978,
+      "step": 108300
+    },
+    {
+      "epoch": 1.1951137534506224,
+      "grad_norm": 1.930409550666809,
+      "learning_rate": 9.628517272286466e-05,
+      "loss": 4.3934,
+      "num_input_tokens_seen": 633935458,
+      "step": 108450
+    },
+    {
+      "epoch": 1.1967667461939842,
+      "grad_norm": 1.8000093698501587,
+      "learning_rate": 9.619701132873915e-05,
+      "loss": 4.3794,
+      "num_input_tokens_seen": 634825634,
+      "step": 108600
+    },
+    {
+      "epoch": 1.198419738937346,
+      "grad_norm": 1.8369793891906738,
+      "learning_rate": 9.610884993461364e-05,
+      "loss": 4.386,
+      "num_input_tokens_seen": 635701666,
+      "step": 108750
+    },
+    {
+      "epoch": 1.2000727316807078,
+      "grad_norm": 1.9381849765777588,
+      "learning_rate": 9.602068854048813e-05,
+      "loss": 4.3824,
+      "num_input_tokens_seen": 636568994,
+      "step": 108900
+    },
+    {
+      "epoch": 1.2017257244240698,
+      "grad_norm": 1.8089631795883179,
+      "learning_rate": 9.593252714636261e-05,
+      "loss": 4.3733,
+      "num_input_tokens_seen": 637444034,
+      "step": 109050
+    },
+    {
+      "epoch": 1.2033787171674317,
+      "grad_norm": 1.7429847717285156,
+      "learning_rate": 9.584436575223709e-05,
+      "loss": 4.3766,
+      "num_input_tokens_seen": 638321634,
+      "step": 109200
+    },
+    {
+      "epoch": 1.2050317099107934,
+      "grad_norm": 1.9182720184326172,
+      "learning_rate": 9.575620435811159e-05,
+      "loss": 4.3724,
+      "num_input_tokens_seen": 639189538,
+      "step": 109350
+    },
+    {
+      "epoch": 1.2066847026541554,
+      "grad_norm": 1.9700244665145874,
+      "learning_rate": 9.566804296398607e-05,
+      "loss": 4.3859,
+      "num_input_tokens_seen": 640080354,
+      "step": 109500
+    },
+    {
+      "epoch": 1.2083376953975171,
+      "grad_norm": 1.86391019821167,
+      "learning_rate": 9.557988156986057e-05,
+      "loss": 4.3875,
+      "num_input_tokens_seen": 640977634,
+      "step": 109650
+    },
+    {
+      "epoch": 1.209990688140879,
+      "grad_norm": 1.9451704025268555,
+      "learning_rate": 9.549230791836256e-05,
+      "loss": 4.3928,
+      "num_input_tokens_seen": 641871874,
+      "step": 109800
+    },
+    {
+      "epoch": 1.211643680884241,
+      "grad_norm": 2.063884735107422,
+      "learning_rate": 9.540414652423704e-05,
+      "loss": 4.3704,
+      "num_input_tokens_seen": 642751170,
+      "step": 109950
+    },
+    {
+      "epoch": 1.2132966736276027,
+      "grad_norm": 1.8499351739883423,
+      "learning_rate": 9.531598513011154e-05,
+      "loss": 4.3886,
+      "num_input_tokens_seen": 643629698,
+      "step": 110100
+    },
+    {
+      "epoch": 1.2149496663709647,
+      "grad_norm": 1.9735474586486816,
+      "learning_rate": 9.522782373598601e-05,
+      "loss": 4.3854,
+      "num_input_tokens_seen": 644509698,
+      "step": 110250
+    },
+    {
+      "epoch": 1.2166026591143264,
+      "grad_norm": 1.9430962800979614,
+      "learning_rate": 9.513966234186051e-05,
+      "loss": 4.3905,
+      "num_input_tokens_seen": 645395394,
+      "step": 110400
+    },
+    {
+      "epoch": 1.2182556518576884,
+      "grad_norm": 1.9608047008514404,
+      "learning_rate": 9.505150094773499e-05,
+      "loss": 4.383,
+      "num_input_tokens_seen": 646254626,
+      "step": 110550
+    },
+    {
+      "epoch": 1.21990864460105,
+      "grad_norm": 1.9237737655639648,
+      "learning_rate": 9.4963927296237e-05,
+      "loss": 4.3886,
+      "num_input_tokens_seen": 647146658,
+      "step": 110700
+    },
+    {
+      "epoch": 1.221561637344412,
+      "grad_norm": 1.9678759574890137,
+      "learning_rate": 9.487576590211147e-05,
+      "loss": 4.3858,
+      "num_input_tokens_seen": 648004962,
+      "step": 110850
+    },
+    {
+      "epoch": 1.223214630087774,
+      "grad_norm": 1.8643629550933838,
+      "learning_rate": 9.478760450798597e-05,
+      "loss": 4.3718,
+      "num_input_tokens_seen": 648877602,
+      "step": 111000
+    },
+    {
+      "epoch": 1.2248676228311357,
+      "grad_norm": 1.8100017309188843,
+      "learning_rate": 9.469944311386045e-05,
+      "loss": 4.38,
+      "num_input_tokens_seen": 649743970,
+      "step": 111150
+    },
+    {
+      "epoch": 1.2265206155744977,
+      "grad_norm": 1.8271883726119995,
+      "learning_rate": 9.461128171973495e-05,
+      "loss": 4.3911,
+      "num_input_tokens_seen": 650620130,
+      "step": 111300
+    },
+    {
+      "epoch": 1.2281736083178596,
+      "grad_norm": 1.9749687910079956,
+      "learning_rate": 9.452312032560942e-05,
+      "loss": 4.3715,
+      "num_input_tokens_seen": 651492738,
+      "step": 111450
+    },
+    {
+      "epoch": 1.2298266010612213,
+      "grad_norm": 1.9666537046432495,
+      "learning_rate": 9.44349589314839e-05,
+      "loss": 4.3823,
+      "num_input_tokens_seen": 652359170,
+      "step": 111600
+    },
+    {
+      "epoch": 1.2314795938045833,
+      "grad_norm": 1.9260027408599854,
+      "learning_rate": 9.43467975373584e-05,
+      "loss": 4.3862,
+      "num_input_tokens_seen": 653229570,
+      "step": 111750
+    },
+    {
+      "epoch": 1.233132586547945,
+      "grad_norm": 1.8240337371826172,
+      "learning_rate": 9.425863614323288e-05,
+      "loss": 4.3771,
+      "num_input_tokens_seen": 654109090,
+      "step": 111900
+    },
+    {
+      "epoch": 1.234785579291307,
+      "grad_norm": 1.957507848739624,
+      "learning_rate": 9.417047474910738e-05,
+      "loss": 4.3817,
+      "num_input_tokens_seen": 654980482,
+      "step": 112050
+    },
+    {
+      "epoch": 1.2364385720346687,
+      "grad_norm": 1.8944330215454102,
+      "learning_rate": 9.408231335498185e-05,
+      "loss": 4.3812,
+      "num_input_tokens_seen": 655849634,
+      "step": 112200
+    },
+    {
+      "epoch": 1.2380915647780306,
+      "grad_norm": 1.8677889108657837,
+      "learning_rate": 9.399415196085636e-05,
+      "loss": 4.3803,
+      "num_input_tokens_seen": 656736738,
+      "step": 112350
+    },
+    {
+      "epoch": 1.2397445575213926,
+      "grad_norm": 1.8283082246780396,
+      "learning_rate": 9.390599056673083e-05,
+      "loss": 4.3933,
+      "num_input_tokens_seen": 657615938,
+      "step": 112500
+    },
+    {
+      "epoch": 1.2413975502647543,
+      "grad_norm": 1.9106853008270264,
+      "learning_rate": 9.381782917260533e-05,
+      "loss": 4.3847,
+      "num_input_tokens_seen": 658494850,
+      "step": 112650
+    },
+    {
+      "epoch": 1.2430505430081162,
+      "grad_norm": 1.8882030248641968,
+      "learning_rate": 9.372966777847981e-05,
+      "loss": 4.3862,
+      "num_input_tokens_seen": 659363618,
+      "step": 112800
+    },
+    {
+      "epoch": 1.244703535751478,
+      "grad_norm": 1.964934229850769,
+      "learning_rate": 9.36415063843543e-05,
+      "loss": 4.3805,
+      "num_input_tokens_seen": 660234946,
+      "step": 112950
+    },
+    {
+      "epoch": 1.24635652849484,
+      "grad_norm": 1.8856420516967773,
+      "learning_rate": 9.355334499022878e-05,
+      "loss": 4.3794,
+      "num_input_tokens_seen": 661115810,
+      "step": 113100
+    },
+    {
+      "epoch": 1.2480095212382019,
+      "grad_norm": 1.8618583679199219,
+      "learning_rate": 9.346518359610327e-05,
+      "loss": 4.3883,
+      "num_input_tokens_seen": 661994434,
+      "step": 113250
+    },
+    {
+      "epoch": 1.2496625139815636,
+      "grad_norm": 1.9158508777618408,
+      "learning_rate": 9.337702220197776e-05,
+      "loss": 4.3739,
+      "num_input_tokens_seen": 662868834,
+      "step": 113400
+    },
+    {
+      "epoch": 1.2513155067249255,
+      "grad_norm": 1.8499860763549805,
+      "learning_rate": 9.328886080785225e-05,
+      "loss": 4.379,
+      "num_input_tokens_seen": 663752002,
+      "step": 113550
+    },
+    {
+      "epoch": 1.2529684994682873,
+      "grad_norm": 1.8565645217895508,
+      "learning_rate": 9.320069941372673e-05,
+      "loss": 4.3854,
+      "num_input_tokens_seen": 664622402,
+      "step": 113700
+    },
+    {
+      "epoch": 1.2546214922116492,
+      "grad_norm": 2.060188055038452,
+      "learning_rate": 9.311253801960123e-05,
+      "loss": 4.3758,
+      "num_input_tokens_seen": 665495618,
+      "step": 113850
+    },
+    {
+      "epoch": 1.2562744849550112,
+      "grad_norm": 1.892635464668274,
+      "learning_rate": 9.30243766254757e-05,
+      "loss": 4.3884,
+      "num_input_tokens_seen": 666361922,
+      "step": 114000
+    },
+    {
+      "epoch": 1.2579274776983729,
+      "grad_norm": 1.9154144525527954,
+      "learning_rate": 9.29362152313502e-05,
+      "loss": 4.3752,
+      "num_input_tokens_seen": 667241410,
+      "step": 114150
+    },
+    {
+      "epoch": 1.2595804704417348,
+      "grad_norm": 1.9253753423690796,
+      "learning_rate": 9.284805383722468e-05,
+      "loss": 4.3875,
+      "num_input_tokens_seen": 668132226,
+      "step": 114300
+    },
+    {
+      "epoch": 1.2612334631850965,
+      "grad_norm": 1.9465709924697876,
+      "learning_rate": 9.275989244309918e-05,
+      "loss": 4.3742,
+      "num_input_tokens_seen": 669015202,
+      "step": 114450
+    },
+    {
+      "epoch": 1.2628864559284585,
+      "grad_norm": 1.9070016145706177,
+      "learning_rate": 9.267173104897366e-05,
+      "loss": 4.3737,
+      "num_input_tokens_seen": 669892578,
+      "step": 114600
+    },
+    {
+      "epoch": 1.2645394486718202,
+      "grad_norm": 1.9075013399124146,
+      "learning_rate": 9.258356965484816e-05,
+      "loss": 4.3789,
+      "num_input_tokens_seen": 670773314,
+      "step": 114750
+    },
+    {
+      "epoch": 1.2661924414151822,
+      "grad_norm": 1.8648816347122192,
+      "learning_rate": 9.249540826072263e-05,
+      "loss": 4.3583,
+      "num_input_tokens_seen": 671644514,
+      "step": 114900
+    },
+    {
+      "epoch": 1.2678454341585441,
+      "grad_norm": 1.9572055339813232,
+      "learning_rate": 9.240724686659714e-05,
+      "loss": 4.3871,
+      "num_input_tokens_seen": 672523202,
+      "step": 115050
+    },
+    {
+      "epoch": 1.2694984269019058,
+      "grad_norm": 1.9419187307357788,
+      "learning_rate": 9.231908547247161e-05,
+      "loss": 4.3802,
+      "num_input_tokens_seen": 673387298,
+      "step": 115200
+    },
+    {
+      "epoch": 1.2711514196452678,
+      "grad_norm": 1.9556363821029663,
+      "learning_rate": 9.223092407834611e-05,
+      "loss": 4.3922,
+      "num_input_tokens_seen": 674262786,
+      "step": 115350
+    },
+    {
+      "epoch": 1.2728044123886297,
+      "grad_norm": 1.8693435192108154,
+      "learning_rate": 9.214276268422059e-05,
+      "loss": 4.3719,
+      "num_input_tokens_seen": 675145058,
+      "step": 115500
+    },
+    {
+      "epoch": 1.2744574051319915,
+      "grad_norm": 1.9475206136703491,
+      "learning_rate": 9.205460129009508e-05,
+      "loss": 4.38,
+      "num_input_tokens_seen": 676008962,
+      "step": 115650
+    },
+    {
+      "epoch": 1.2761103978753534,
+      "grad_norm": 1.8718332052230835,
+      "learning_rate": 9.196643989596957e-05,
+      "loss": 4.3734,
+      "num_input_tokens_seen": 676887042,
+      "step": 115800
+    },
+    {
+      "epoch": 1.2777633906187151,
+      "grad_norm": 1.8318613767623901,
+      "learning_rate": 9.187827850184405e-05,
+      "loss": 4.3857,
+      "num_input_tokens_seen": 677766690,
+      "step": 115950
     }
   ],
   "logging_steps": 150,
   "max_steps": 272232,
-  "num_input_tokens_seen":
+  "num_input_tokens_seen": 678060130,
   "num_train_epochs": 3,
   "save_steps": 500,
   "stateful_callbacks": {
@@ -5330,7 +6210,7 @@
       "attributes": {}
     }
   },
-  "total_flos":
+  "total_flos": 1.04696823656832e+16,
   "train_batch_size": 32,
   "trial_name": null,
   "trial_params": null
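Sanity check on the new header values: with "max_steps": 272232 and "num_train_epochs": 3, one epoch is 272232 / 3 = 90744 optimizer steps, and 116000 / 90744 ≈ 1.2783, consistent with the recorded "epoch": 1.2783143881998358. A minimal sketch for inspecting the appended records (they sit under the log_history key of a Hugging Face Trainer's trainer_state.json; the local path below is assumed, not part of the commit):

```python
import json

# Assumed local path to the file uploaded in this commit.
with open("trainer_state.json") as f:
    state = json.load(f)

steps_per_epoch = state["max_steps"] / state["num_train_epochs"]
print(f"step {state['global_step']}/{state['max_steps']}, "
      f"epoch {state['epoch']:.4f} (~{state['global_step'] / steps_per_epoch:.4f})")

# Each log_history record mirrors a block in the diff above:
# epoch, grad_norm, learning_rate, loss, num_input_tokens_seen, step.
for entry in state["log_history"][-3:]:
    print(entry["step"], entry["loss"], entry["learning_rate"])
```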